{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_pickle('../reports/df.pkl')\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dropna(subset=['benchmark_start_time', 'response', 'model'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_df = df.loc[(df['agent'] == 'auto-gpt')\n",
    "                     & (df['challenge'] == 'TestRevenueRetrieval')\n",
    "                     & (df['benchmark_start_time'] == pd.Timestamp('2023-08-05 08:12:00+0000', tz='UTC'))]\n",
    "selected_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['challenge'].unique()"
   ]
  },
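  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`left_df` is used in the next cell but never defined in this notebook. A minimal sketch of how it might be loaded, assuming a second comparison report was pickled alongside `df.pkl` (the path below is hypothetical):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical: load the comparison DataFrame consumed by the next cell.\n",
    "# Adjust the path to wherever the second report actually lives.\n",
    "left_df = pd.read_pickle('../reports/left_df.pkl')"
   ]
  },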
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "set(left_df['challenge'].unique()) - set(df['challenge'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# selected_df = left_df.loc[(left_df['challenge'] == 'TestReturnCode_Simple')]\n",
    "# selected_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['agent'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List of agents to ignore\n",
    "agents_to_ignore = ['gpt-engineer', 'smol-developer', 'babyagi', 'evo', 'auto-gpt-turbo']\n",
    "agents_to_check = [agent for agent in df['agent'].unique() if agent not in agents_to_ignore]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Convert 'benchmark_start_time' to datetime if it's not already\n",
    "df['benchmark_start_time'] = pd.to_datetime(df['benchmark_start_time'])\n",
    "\n",
    "# Filter out the agents to ignore\n",
    "filtered_df = df[~df['agent'].isin(agents_to_ignore)]\n",
    "\n",
    "# Group by 'benchmark_start_time', 'agent', and 'challenge', then count unique job_ids\n",
    "grouped_df = filtered_df.groupby(['benchmark_start_time', 'agent', 'challenge'])['job_id'].nunique().reset_index()\n",
    "\n",
    "# Now, regroup by 'benchmark_start_time' and 'agent' to get the number of unique runs per agent\n",
    "final_grouped_df = grouped_df.groupby(['benchmark_start_time', 'agent']).size().reset_index(name='unique_runs')\n",
    "\n",
    "# Create a single plot for all agents\n",
    "plt.figure(figsize=(10, 5))\n",
    "\n",
    "for agent in final_grouped_df['agent'].unique():\n",
    "    agent_data = final_grouped_df[final_grouped_df['agent'] == agent]\n",
    "    plt.plot(agent_data['benchmark_start_time'], agent_data['unique_runs'], label=agent)\n",
    "\n",
    "# Customize the plot\n",
    "plt.xlabel('Benchmark Start Time')\n",
    "plt.ylabel('Number of Challenges Run')\n",
    "plt.title('Agent Benchmark Runs Over Time')\n",
    "plt.legend()\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "\n",
    "# Show the plot\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert 'success' column to boolean if it's not\n",
    "df['success'] = df['success'].astype(bool)\n",
    "\n",
    "# Convert 'benchmark_start_time' to datetime if it's not\n",
    "df['benchmark_start_time'] = pd.to_datetime(df['benchmark_start_time'])\n",
    "\n",
    "filtered_df = df[~df['agent'].isin(agents_to_ignore)]\n",
    "\n",
    "# Filter the DataFrame for a specific challenge\n",
    "specific_challenge = 'TestRememberMultipleIds'  # Replace with the challenge you're interested in\n",
    "filtered_df = filtered_df[filtered_df['challenge'] == specific_challenge]\n",
    "\n",
    "# Group by 'benchmark_start_time', 'agent', and 'success', then count occurrences\n",
    "count_by_agent_over_time = filtered_df.groupby(['benchmark_start_time', 'agent', 'success']).size().reset_index(name='count')\n",
    "\n",
    "# Create a list of unique agents\n",
    "unique_agents = count_by_agent_over_time['agent'].unique()\n",
    "\n",
    "# Plotting\n",
    "plt.figure(figsize=(15, 8))\n",
    "\n",
    "# Plot each agent\n",
    "for agent in unique_agents:\n",
    "    subset_df = count_by_agent_over_time[count_by_agent_over_time['agent'] == agent]\n",
    "\n",
    "    # Only successes are plotted; add False to the list to plot failures too\n",
    "    for success in [True]:\n",
    "        subsubset_df = subset_df[subset_df['success'] == success]\n",
    "        plt.plot(subsubset_df['benchmark_start_time'], subsubset_df['count'], marker='o', linestyle='-', label=f\"{agent} (Success: {success})\")\n",
    "\n",
    "plt.title(f'Success/Failure Count by Agent Over Time for Challenge: {specific_challenge}')\n",
    "plt.xlabel('Benchmark Start Time')\n",
    "plt.ylabel('Count')\n",
    "plt.grid(True)\n",
    "plt.legend(title='Agents')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_df = df.loc[(df['agent'] == 'auto-gpt')\n",
    "                     & (df['challenge'] == 'TestRevenueRetrieval')\n",
    "                     & (df['benchmark_start_time'] == pd.Timestamp('2023-08-05 08:12:00+0000', tz='UTC'))]\n",
    "df['agent'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from collections import OrderedDict\n",
    "\n",
    "\n",
    "# Convert the JSON-like strings in the 'response' column to nested dictionaries\n",
    "def nested_json(x):\n",
    "    if pd.notna(x):\n",
    "        d = json.loads(x)  # Convert the string to a dict\n",
    "        if \"content\" in d and isinstance(d[\"content\"], str):\n",
    "            try:\n",
    "                d[\"content\"] = json.loads(d[\"content\"])  # Try converting 'content' to a dict\n",
    "            except json.JSONDecodeError:\n",
    "                pass  # If it's not JSON, leave it as is\n",
    "        return d\n",
    "    return x\n",
    "\n",
    "\n",
    "# Initialize empty ordered dictionaries\n",
    "response_dict = OrderedDict()\n",
    "response_nested_dict = OrderedDict()\n",
    "\n",
    "# Get the total number of rows\n",
    "total_rows = len(selected_df)\n",
    "\n",
    "# Loop over the DataFrame's index and rows\n",
    "for i, (_, row) in enumerate(selected_df.iterrows()):\n",
    "    # Convert the JSON-like strings in the 'response' to nested dictionaries\n",
    "    response = json.loads(row['response'])\n",
    "    response_nested = nested_json(row['response'])\n",
    "\n",
    "    # Insert the record into the dictionary with the row number as the key\n",
    "    response_dict[str(total_rows - i)] = response\n",
    "    response_nested_dict[str(total_rows - i)] = response_nested\n",
    "\n",
    "# Reverse the order of items in the OrderedDict\n",
    "reversed_response_dict = OrderedDict(reversed(list(response_dict.items())))\n",
    "reversed_response_nested_dict = OrderedDict(reversed(list(response_nested_dict.items())))\n",
    "\n",
    "# Write the flat dictionary to a JSON file\n",
    "with open('selected_logs.json', 'w') as f:\n",
    "    json.dump(reversed_response_dict, f, indent=4)\n",
    "\n",
    "# Write the nested dictionary to a JSON file\n",
    "with open('selected_logs_nested.json', 'w') as f:\n",
    "    json.dump(reversed_response_nested_dict, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import os\n",
    "from collections import OrderedDict\n",
    "\n",
    "# Function to convert JSON-like strings to nested dictionaries\n",
    "def nested_json(x):\n",
    "    if pd.notna(x):\n",
    "        d = json.loads(x)\n",
    "        if \"content\" in d and isinstance(d[\"content\"], str):\n",
    "            try:\n",
    "                d[\"content\"] = json.loads(d[\"content\"])\n",
    "            except json.JSONDecodeError:\n",
    "                pass\n",
    "        return d\n",
    "    return x\n",
    "\n",
    "challenge = \"TestThreeSum\"\n",
    "\n",
    "# Loop through unique agents\n",
    "for agent in df['agent'].unique():\n",
    "    selected_df = df.loc[(df['agent'] == agent) & (df['challenge'] == challenge)]\n",
    "\n",
    "    master_response_dict = OrderedDict()\n",
    "    master_response_nested_dict = OrderedDict()\n",
    "\n",
    "    # Group by 'benchmark_start_time'\n",
    "    grouped_df = selected_df.groupby('benchmark_start_time')\n",
    "\n",
    "    for timestamp, group in grouped_df:\n",
    "        response_dict = OrderedDict()\n",
    "        response_nested_dict = OrderedDict()\n",
    "\n",
    "        total_rows = len(group)\n",
    "\n",
    "        for i, (_, row) in enumerate(group.iterrows()):\n",
    "            response = json.loads(row['response'])\n",
    "            response_nested = nested_json(row['response'])\n",
    "\n",
    "            # Key rows counting down from total_rows; the order is reversed below\n",
    "            response_dict[str(total_rows - i)] = response\n",
    "            response_nested_dict[str(total_rows - i)] = response_nested\n",
    "\n",
    "        # Reverse the order within each timestamp group\n",
    "        response_dict = OrderedDict(reversed(list(response_dict.items())))\n",
    "        response_nested_dict = OrderedDict(reversed(list(response_nested_dict.items())))\n",
    "\n",
    "        # Add the timestamp as a key to the master dictionary\n",
    "        master_response_dict[str(timestamp)] = response_dict\n",
    "        master_response_nested_dict[str(timestamp)] = response_nested_dict\n",
    "\n",
    "    # Create the output directory (makedirs creates the parent as needed)\n",
    "    os.makedirs(f'{challenge}/{agent}', exist_ok=True)\n",
    "\n",
    "    # Write to JSON files specific to the agent\n",
    "    with open(f'{challenge}/{agent}/selected_logs.json', 'w') as f:\n",
    "        json.dump(master_response_dict, f, indent=4)\n",
    "\n",
    "    with open(f'{challenge}/{agent}/selected_logs_nested.json', 'w') as f:\n",
    "        json.dump(master_response_nested_dict, f, indent=4)\n"
   ]
  },
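  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal spot check, assuming the export above has run: read back one agent's nested log file ('auto-gpt' here, as an example) and list the benchmark timestamps it captured."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload one exported file and show which timestamp groups it contains.\n",
    "# 'auto-gpt' is just an example agent; any directory written above works.\n",
    "with open(f'{challenge}/auto-gpt/selected_logs_nested.json') as f:\n",
    "    logs = json.load(f)\n",
    "\n",
    "print(list(logs.keys()))"
   ]
  },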
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}