{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_pickle('../reports/df.pkl')\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dropna(subset=['benchmark_start_time', 'response', 'model'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_df = df.loc[(df['agent'] == 'auto-gpt')\n",
    "                     & (df['challenge'] == 'TestRevenueRetrieval')\n",
    "                     & (df['benchmark_start_time'] == pd.Timestamp('2023-08-05 08:12:00+0000', tz='UTC'))]\n",
    "selected_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['challenge'].unique()"
   ]
  },
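  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`left_df` is used in the next cell but never defined in this notebook. A minimal sketch of how it might be loaded, assuming a second comparison report was pickled alongside `df.pkl` (the path below is hypothetical):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical: load the comparison DataFrame consumed by the next cell.\n",
    "# Adjust the path to wherever the second report actually lives.\n",
    "left_df = pd.read_pickle('../reports/left_df.pkl')"
   ]
  },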
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "set(left_df['challenge'].unique()) - set(df['challenge'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# selected_df = left_df.loc[(left_df['challenge'] == 'TestReturnCode_Simple')]\n",
    "# selected_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['agent'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List of agents to ignore\n",
    "agents_to_ignore = ['gpt-engineer', 'smol-developer', 'babyagi', 'evo', 'auto-gpt-turbo']\n",
    "agents_to_check = [agent for agent in df['agent'].unique() if agent not in agents_to_ignore]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Convert 'benchmark_start_time' to datetime if it's not already\n",
    "df['benchmark_start_time'] = pd.to_datetime(df['benchmark_start_time'])\n",
    "\n",
    "# Filter out the agents to ignore\n",
    "filtered_df = df[~df['agent'].isin(agents_to_ignore)]\n",
    "\n",
    "# Group by 'benchmark_start_time', 'agent', and 'challenge', then count unique job_ids\n",
    "grouped_df = filtered_df.groupby(['benchmark_start_time', 'agent', 'challenge'])['job_id'].nunique().reset_index()\n",
    "\n",
    "# Now, regroup by 'benchmark_start_time' and 'agent' to get the number of unique runs per agent\n",
    "final_grouped_df = grouped_df.groupby(['benchmark_start_time', 'agent']).size().reset_index(name='unique_runs')\n",
    "\n",
    "# Create a single plot for all agents\n",
    "plt.figure(figsize=(10, 5))\n",
    "\n",
    "for agent in final_grouped_df['agent'].unique():\n",
    "    agent_data = final_grouped_df[final_grouped_df['agent'] == agent]\n",
    "    plt.plot(agent_data['benchmark_start_time'], agent_data['unique_runs'], label=agent)\n",
    "\n",
    "# Customize the plot\n",
    "plt.xlabel('Benchmark Start Time')\n",
    "plt.ylabel('Number of Challenges Run')\n",
    "plt.title('Agent Benchmark Runs Over Time')\n",
    "plt.legend()\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "\n",
    "# Show the plot\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert 'success' column to boolean if it's not\n",
    "df['success'] = df['success'].astype(bool)\n",
    "\n",
    "# Convert 'benchmark_start_time' to datetime if it's not\n",
    "df['benchmark_start_time'] = pd.to_datetime(df['benchmark_start_time'])\n",
    "\n",
    "filtered_df = df[~df['agent'].isin(agents_to_ignore)]\n",
    "\n",
    "# Filter the DataFrame for a specific challenge\n",
    "specific_challenge = 'TestRememberMultipleIds'  # Replace with the challenge you're interested in\n",
    "filtered_df = filtered_df[filtered_df['challenge'] == specific_challenge]\n",
    "\n",
    "# Group by 'benchmark_start_time', 'agent', and 'success', then count occurrences\n",
    "count_by_agent_over_time = filtered_df.groupby(['benchmark_start_time', 'agent', 'success']).size().reset_index(name='count')\n",
    "\n",
    "# Create a list of unique agents\n",
    "unique_agents = count_by_agent_over_time['agent'].unique()\n",
    "\n",
    "# Plotting\n",
    "plt.figure(figsize=(15, 8))\n",
    "\n",
    "# Plot each agent\n",
    "for agent in unique_agents:\n",
    "    subset_df = count_by_agent_over_time[count_by_agent_over_time['agent'] == agent]\n",
    "\n",
    "    # Only successes are plotted; add False to the list to plot failures too\n",
    "    for success in [True]:\n",
    "        subsubset_df = subset_df[subset_df['success'] == success]\n",
    "        plt.plot(subsubset_df['benchmark_start_time'], subsubset_df['count'], marker='o', linestyle='-', label=f\"{agent} (Success: {success})\")\n",
    "\n",
    "plt.title(f'Success/Failure Count by Agent Over Time for Challenge: {specific_challenge}')\n",
    "plt.xlabel('Benchmark Start Time')\n",
    "plt.ylabel('Count')\n",
    "plt.grid(True)\n",
    "plt.legend(title='Agents')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_df = df.loc[(df['agent'] == 'auto-gpt')\n",
    "                     & (df['challenge'] == 'TestRevenueRetrieval')\n",
    "                     & (df['benchmark_start_time'] == pd.Timestamp('2023-08-05 08:12:00+0000', tz='UTC'))]\n",
    "df['agent'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from collections import OrderedDict\n",
    "\n",
    "\n",
    "# Convert the JSON-like strings in the 'response' column to nested dictionaries\n",
    "def nested_json(x):\n",
    "    if pd.notna(x):\n",
    "        d = json.loads(x)  # Convert the string to a dict\n",
    "        if \"content\" in d and isinstance(d[\"content\"], str):\n",
    "            try:\n",
    "                d[\"content\"] = json.loads(d[\"content\"])  # Try converting 'content' to a dict\n",
    "            except json.JSONDecodeError:\n",
    "                pass  # If it's not JSON, leave it as is\n",
    "        return d\n",
    "    return x\n",
    "\n",
    "\n",
    "# Initialize empty ordered dictionaries\n",
    "response_dict = OrderedDict()\n",
    "response_nested_dict = OrderedDict()\n",
    "\n",
    "# Get the total number of rows\n",
    "total_rows = len(selected_df)\n",
    "\n",
    "# Loop over the DataFrame's index and rows\n",
    "for i, (_, row) in enumerate(selected_df.iterrows()):\n",
    "    # Convert the JSON-like strings in the 'response' to nested dictionaries\n",
    "    response = json.loads(row['response'])\n",
    "    response_nested = nested_json(row['response'])\n",
    "\n",
    "    # Insert the record into the dictionary with the row number as the key\n",
    "    response_dict[str(total_rows - i)] = response\n",
    "    response_nested_dict[str(total_rows - i)] = response_nested\n",
    "\n",
    "# Reverse the order of items in the OrderedDict\n",
    "reversed_response_dict = OrderedDict(reversed(list(response_dict.items())))\n",
    "reversed_response_nested_dict = OrderedDict(reversed(list(response_nested_dict.items())))\n",
    "\n",
    "# Write the flat dictionary to a JSON file\n",
    "with open('selected_logs.json', 'w') as f:\n",
    "    json.dump(reversed_response_dict, f, indent=4)\n",
    "\n",
    "# Write the nested dictionary to a JSON file\n",
    "with open('selected_logs_nested.json', 'w') as f:\n",
    "    json.dump(reversed_response_nested_dict, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import os\n",
    "from collections import OrderedDict\n",
    "\n",
    "# Function to convert JSON-like strings to nested dictionaries\n",
    "def nested_json(x):\n",
    "    if pd.notna(x):\n",
    "        d = json.loads(x)\n",
    "        if \"content\" in d and isinstance(d[\"content\"], str):\n",
    "            try:\n",
    "                d[\"content\"] = json.loads(d[\"content\"])\n",
    "            except json.JSONDecodeError:\n",
    "                pass\n",
    "        return d\n",
    "    return x\n",
    "\n",
    "challenge = \"TestThreeSum\"\n",
    "\n",
    "# Loop through unique agents\n",
    "for agent in df['agent'].unique():\n",
    "    selected_df = df.loc[(df['agent'] == agent) & (df['challenge'] == challenge)]\n",
    "\n",
    "    master_response_dict = OrderedDict()\n",
    "    master_response_nested_dict = OrderedDict()\n",
    "\n",
    "    # Group by 'benchmark_start_time'\n",
    "    grouped_df = selected_df.groupby('benchmark_start_time')\n",
    "\n",
    "    for timestamp, group in grouped_df:\n",
    "        response_dict = OrderedDict()\n",
    "        response_nested_dict = OrderedDict()\n",
    "\n",
    "        total_rows = len(group)\n",
    "\n",
    "        for i, (_, row) in enumerate(group.iterrows()):\n",
    "            response = json.loads(row['response'])\n",
    "            response_nested = nested_json(row['response'])\n",
    "\n",
    "            # Key rows counting down from total_rows; the order is reversed below\n",
    "            response_dict[str(total_rows - i)] = response\n",
    "            response_nested_dict[str(total_rows - i)] = response_nested\n",
    "\n",
    "        # Reverse the order within each timestamp group\n",
    "        response_dict = OrderedDict(reversed(list(response_dict.items())))\n",
    "        response_nested_dict = OrderedDict(reversed(list(response_nested_dict.items())))\n",
    "\n",
    "        # Add the timestamp as a key to the master dictionary\n",
    "        master_response_dict[str(timestamp)] = response_dict\n",
    "        master_response_nested_dict[str(timestamp)] = response_nested_dict\n",
    "\n",
    "    # Create the output directory (makedirs creates the parent as needed)\n",
    "    os.makedirs(f'{challenge}/{agent}', exist_ok=True)\n",
    "\n",
    "    # Write to JSON files specific to the agent\n",
    "    with open(f'{challenge}/{agent}/selected_logs.json', 'w') as f:\n",
    "        json.dump(master_response_dict, f, indent=4)\n",
    "\n",
    "    with open(f'{challenge}/{agent}/selected_logs_nested.json', 'w') as f:\n",
    "        json.dump(master_response_nested_dict, f, indent=4)\n"
   ]
  },
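  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal spot check, assuming the export above has run: read back one agent's nested log file ('auto-gpt' here, as an example) and list the benchmark timestamps it captured."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload one exported file and show which timestamp groups it contains.\n",
    "# 'auto-gpt' is just an example agent; any directory written above works.\n",
    "with open(f'{challenge}/auto-gpt/selected_logs_nested.json') as f:\n",
    "    logs = json.load(f)\n",
    "\n",
    "print(list(logs.keys()))"
   ]
  },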
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}