bug fixes, sha frontend, updating file locations
.github/workflows/ci.yml (vendored) | 19
@@ -123,7 +123,6 @@ jobs:
          sudo dpkg -i google-chrome-stable_current_amd64.deb
          sudo apt-get install -f

      - name: Set up Python ${{ env.min-python-version }}
        uses: actions/setup-python@v2
        with:
@@ -198,7 +197,7 @@ jobs:
          pip install ../../dist/*.whl

          bash -c "$(curl -fsSL https://raw.githubusercontent.com/merwanehamadi/helicone/b7ab4bc53e51d8ab29fff19ce5986ab7720970c6/mitmproxy.sh)" -s start

          if [ "${GITHUB_EVENT_NAME}" == "pull_request" ] || [ "${{ github.event_name }}" == "push" ]; then
            set +e  # Ignore non-zero exit codes and continue execution
            echo "Running the following command: ${prefix}agbenchmark start --maintain --mock"
@@ -211,25 +210,25 @@ jobs:
          fi
          echo "Running the following command: ${prefix}agbenchmark start --mock"
          ${prefix}agbenchmark start --mock

          echo "Running the following command: ${prefix}agbenchmark start --mock --category=retrieval"
          ${prefix}agbenchmark start --mock --category=retrieval

          echo "Running the following command: ${prefix}agbenchmark start --mock --category=interface"
          ${prefix}agbenchmark start --mock --category=interface

          echo "Running the following command: ${prefix}agbenchmark start --mock --category=code"
          ${prefix}agbenchmark start --mock --category=code

          echo "Running the following command: ${prefix}agbenchmark start --mock --category=memory"
          ${prefix}agbenchmark start --mock --category=memory

          echo "Running the following command: ${prefix}agbenchmark start --mock --suite TestRevenueRetrieval"
          ${prefix}agbenchmark start --mock --suite TestRevenueRetrieval

          echo "Running the following command: ${prefix}agbenchmark start --test=TestWriteFile"
          ${prefix}agbenchmark start --test=TestWriteFile

          cd ../..
          poetry install
          poetry run uvicorn server:app --reload &
@@ -285,7 +284,7 @@ jobs:
            git rebase origin/$current_branch
            if git push origin HEAD; then
              echo "Success!"
-             poetry run python send_to_googledrive.py || echo "Failed to upload to Google Drive"
+             poetry run python reports/send_to_googledrive.py || echo "Failed to upload to Google Drive"
              exit 0
            else
              echo "Attempt $(($attempts + 1)) failed. Retrying..."
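The benchmark steps above invoke agbenchmark once per category, then for a single suite and a single test. For illustration only (this loop is not part of the commit), a minimal Python sketch of the same per-category sequence, assuming agbenchmark is on the PATH and that prefix mirrors the workflow's optional command prefix:

import subprocess

# Categories exercised one at a time in the workflow hunk above.
CATEGORIES = ["retrieval", "interface", "code", "memory"]


def run_mock_benchmarks(prefix: str = "") -> None:
    """Run the mock benchmark per category, mirroring the CI steps."""
    for category in CATEGORIES:
        cmd = f"{prefix}agbenchmark start --mock --category={category}"
        print(f"Running the following command: {cmd}")
        # Like `set +e` in the workflow: a failing category does not abort the run.
        subprocess.run(cmd, shell=True, check=False)


if __name__ == "__main__":
    run_mock_benchmarks()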
.gitignore (vendored) | 3
@@ -1,5 +1,8 @@
 agbenchmark/workspace/
 backend/backend_stdout.txt
+reports/df.pkl
+reports/df_backwards.pkl
+reports/reports_raw.pkl

 # Byte-compiled / optimized / DLL files
 __pycache__/
frontend (submodule) | 2
Submodule frontend updated: c6a9572bed...c5c36627d6
match_records.py (deleted) | 181
@@ -1,181 +0,0 @@
import os
import json
import pandas as pd
import glob
from gql.transport.aiohttp import AIOHTTPTransport
from gql import gql, Client
import os


def get_reports():
    # Initialize an empty list to store the report data
    report_data = []

    # Specify the path to the reports directory
    reports_dir = 'reports'

    # Iterate over all agent directories in the reports directory
    for agent_name in os.listdir(reports_dir):
        agent_dir = os.path.join(reports_dir, agent_name)

        # Check if the item is a directory (an agent directory)
        if os.path.isdir(agent_dir):
            # Construct the path to the report.json file
            # Use glob to find all run directories in the agent_dir
            run_dirs = glob.glob(os.path.join(agent_dir, '*'))

            # For each run directory, add the report.json to the end
            report_files = [os.path.join(run_dir, 'report.json') for run_dir in run_dirs]
            for report_file in report_files:
                # Check if the report.json file exists
                if os.path.isfile(report_file):
                    # Open the report.json file
                    with open(report_file, 'r') as f:
                        # Load the JSON data from the file
                        report = json.load(f)

                    # Iterate over all tests in the report
                    for test_name, test_data in report['tests'].items():
                        try:
                            # Append the relevant data to the report_data list
                            if agent_name is not None:
                                report_data.append({
                                    'agent': agent_name.lower(),
                                    'benchmark_start_time': report['benchmark_start_time'],
                                    'challenge': test_name,
                                    'categories': ', '.join(test_data['category']),
                                    'task': test_data['task'],
                                    'success': test_data['metrics']['success'],
                                    'difficulty': test_data['metrics']['difficulty'],
                                    'success_%': test_data['metrics']['success_%'],
                                    'run_time': test_data['metrics']['run_time']
                                })
                        except KeyError:
                            pass
    return pd.DataFrame(report_data)


def get_helicone_data():
    helicone_api_key = os.getenv('HELICONE_API_KEY')

    url = "https://www.helicone.ai/api/graphql"
    # Replace <KEY> with your personal access key
    transport = AIOHTTPTransport(url=url, headers={
        "authorization": f"Bearer {helicone_api_key}"
    })

    client = Client(transport=transport, fetch_schema_from_transport=True)

    SIZE = 250

    i = 0

    data = []
    print("Fetching data from Helicone")
    while True:
        query = gql(
            """
            query ExampleQuery($limit: Int, $offset: Int){
              heliconeRequest(
                  limit: $limit
                  offset: $offset
              ) {
                  prompt
                  properties{
                      name
                      value
                  }

                  requestBody
                  response
                  createdAt

              }

            }
            """
        )
        print(f"Fetching {i * SIZE} to {(i + 1) * SIZE} records")
        try:
            result = client.execute(query,
                                    variable_values={
                                        "limit": SIZE,
                                        "offset": i * SIZE
                                    }
                                    )
        except Exception as e:
            print(f"Error occurred: {e}")
            result = None

        i += 1

        if result:
            for item in result["heliconeRequest"]:
                properties = {prop['name']: prop['value'] for prop in item['properties']}
                data.append({
                    'createdAt': item['createdAt'],
                    'agent': properties.get('agent'),
                    'job_id': properties.get('job_id'),
                    'challenge': properties.get('challenge'),
                    'benchmark_start_time': properties.get('benchmark_start_time'),
                    'prompt': item['prompt'],
                    'model': item['requestBody'].get('model'),
                    'request': item['requestBody'].get('messages'),
                })

        if not result or (len(result["heliconeRequest"]) == 0):
            print("No more results")
            break

    df = pd.DataFrame(data)
    # Drop rows where agent is None
    df = df.dropna(subset=['agent'])

    # Convert the remaining agent names to lowercase
    df['agent'] = df['agent'].str.lower()

    return df


if os.path.exists('reports_raw.pkl') and os.path.exists('helicone_raw.pkl'):
    reports_df = pd.read_pickle('reports_raw.pkl')
    helicone_df = pd.read_pickle('helicone_raw.pkl')
else:
    reports_df = get_reports()
    reports_df.to_pickle('reports_raw.pkl')
    helicone_df = get_helicone_data()
    helicone_df.to_pickle('helicone_raw.pkl')


def try_formats(date_str):
    formats = ['%Y-%m-%d-%H:%M', '%Y-%m-%dT%H:%M:%S%z']
    for fmt in formats:
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            pass
    return None


helicone_df['benchmark_start_time'] = pd.to_datetime(helicone_df['benchmark_start_time'].apply(try_formats), utc=True)
helicone_df = helicone_df.dropna(subset=['benchmark_start_time'])
helicone_df['createdAt'] = pd.to_datetime(helicone_df['createdAt'], unit='ms', origin='unix')
reports_df['benchmark_start_time'] = pd.to_datetime(reports_df['benchmark_start_time'].apply(try_formats), utc=True)
reports_df = reports_df.dropna(subset=['benchmark_start_time'])

assert pd.api.types.is_datetime64_any_dtype(helicone_df['benchmark_start_time']), "benchmark_start_time in helicone_df is not datetime"
assert pd.api.types.is_datetime64_any_dtype(reports_df['benchmark_start_time']), "benchmark_start_time in reports_df is not datetime"

reports_df['report_time'] = reports_df['benchmark_start_time']

df = pd.merge_asof(helicone_df.sort_values('benchmark_start_time'),
                   reports_df.sort_values('benchmark_start_time'),
                   left_on='benchmark_start_time',
                   right_on='benchmark_start_time',
                   by=['agent', 'challenge'],
                   direction='backward')

df.to_pickle('df.pkl')
print(df.info())
print("Data saved to df.pkl")
print("To load the data use: df = pd.read_pickle('df.pkl')")
@@ -1 +0,0 @@
|
||||
{}
|
||||
@@ -1,162 +0,0 @@
{
    "TestAdaptLink": [
        false,
        false
    ],
    "TestAdaptSimpleTypoWithGuidance": [
        false,
        false
    ],
    "TestAdaptTeslaRevenue": [
        false,
        false
    ],
    "TestAgentProtocol_CreateAgentTask": [
        false,
        false
    ],
    "TestAgentProtocol_ExecuteAgentTaskStep": [
        false,
        false
    ],
    "TestAgentProtocol_GetAgentTask": [
        false,
        false
    ],
    "TestAgentProtocol_ListAgentTaskSteps": [
        false,
        false
    ],
    "TestAgentProtocol_ListAgentTasksIds": [
        false,
        false
    ],
    "TestBasicMemory": [
        false,
        false
    ],
    "TestBasicRetrieval": [
        false,
        false
    ],
    "TestDebugMultipleTypo": [
        false,
        false
    ],
    "TestDebugSimpleTypoWithGuidance": [
        false,
        false
    ],
    "TestDebugSimpleTypoWithoutGuidance": [
        false,
        false
    ],
    "TestFunctionCodeGeneration": [
        false,
        false
    ],
    "TestPasswordGenerator_Easy": [
        false,
        false
    ],
    "TestPlanCreation": [
        true,
        true
    ],
    "TestProductAdvisor_GamingMonitor": [
        false,
        false
    ],
    "TestReadFile": [
        true,
        true
    ],
    "TestRememberGoal_Advanced": [
        false,
        false
    ],
    "TestRememberGoal_Hard": [
        false,
        false
    ],
    "TestRememberGoal_Medium": [
        false,
        false
    ],
    "TestRememberGoal_Simple": [
        false,
        false
    ],
    "TestRememberMultipleIds": [
        false,
        false
    ],
    "TestRememberMultipleIdsWithNoise": [
        false,
        false
    ],
    "TestRememberMultiplePhrasesWithNoise": [
        false,
        false
    ],
    "TestRetrieval3": [
        false,
        false
    ],
    "TestReturnCode_Modify": [
        false,
        false
    ],
    "TestReturnCode_Simple": [
        false,
        false
    ],
    "TestReturnCode_Tests": [
        false,
        false
    ],
    "TestReturnCode_Write": [
        false,
        false
    ],
    "TestRevenueRetrieval_1.0": [
        false,
        false
    ],
    "TestRevenueRetrieval_1.1": [
        false,
        false
    ],
    "TestRevenueRetrieval_1.2": [
        false,
        false
    ],
    "TestSearch": [
        false,
        false
    ],
    "TestThreeSum": [
        false,
        false
    ],
    "TestWebApp_ListAnimals": [
        false,
        false
    ],
    "TestWrite5FilesWithArray": [
        false,
        false
    ],
    "TestWrite6Files": [
        false,
        false
    ],
    "TestWriteFile": [
        true,
        true
    ],
    "TestWritingCLI_FileOrganizer": [
        false,
        false
    ]
}
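The deleted JSON above recorded, per challenge, a list of pass/fail booleans across runs. As a hypothetical illustration only (the file's name is not shown in this diff, so "success_tracker.json" below is an assumption), such a mapping can be summarized like this:

import json

# Hypothetical consumer of the deleted success-tracker JSON shown above;
# the file name is an assumption, not taken from the diff.
with open("success_tracker.json") as f:
    results = json.load(f)  # e.g. {"TestWriteFile": [true, true], ...}

for test, runs in sorted(results.items()):
    rate = 100 * sum(runs) / len(runs) if runs else 0.0
    print(f"{test}: {rate:.0f}% success over {len(runs)} runs")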
reports/match_records.py (new file) | 214
@@ -0,0 +1,214 @@
import os
import json
import pandas as pd
import glob
from gql.transport.aiohttp import AIOHTTPTransport
from gql import gql, Client
import os


def get_reports():
    # Initialize an empty list to store the report data
    report_data = []

    # Get the current working directory
    current_dir = os.getcwd()

    # Check if the current directory ends with 'reports'
    if current_dir.endswith("reports"):
        reports_dir = "/"
    else:
        reports_dir = "reports"

    # Iterate over all agent directories in the reports directory
    for agent_name in os.listdir(reports_dir):
        agent_dir = os.path.join(reports_dir, agent_name)

        # Check if the item is a directory (an agent directory)
        if os.path.isdir(agent_dir):
            # Construct the path to the report.json file
            # Use glob to find all run directories in the agent_dir
            run_dirs = glob.glob(os.path.join(agent_dir, "*"))

            # For each run directory, add the report.json to the end
            report_files = [
                os.path.join(run_dir, "report.json") for run_dir in run_dirs
            ]
            for report_file in report_files:
                # Check if the report.json file exists
                if os.path.isfile(report_file):
                    # Open the report.json file
                    with open(report_file, "r") as f:
                        # Load the JSON data from the file
                        report = json.load(f)

                    # Iterate over all tests in the report
                    for test_name, test_data in report["tests"].items():
                        try:
                            # Append the relevant data to the report_data list
                            if agent_name is not None:
                                report_data.append(
                                    {
                                        "agent": agent_name.lower(),
                                        "benchmark_start_time": report[
                                            "benchmark_start_time"
                                        ],
                                        "challenge": test_name,
                                        "categories": ", ".join(
                                            test_data["category"]
                                        ),
                                        "task": test_data["task"],
                                        "success": test_data["metrics"]["success"],
                                        "difficulty": test_data["metrics"][
                                            "difficulty"
                                        ],
                                        "success_%": test_data["metrics"][
                                            "success_%"
                                        ],
                                        "run_time": test_data["metrics"][
                                            "run_time"
                                        ],
                                    }
                                )
                        except KeyError:
                            pass
    return pd.DataFrame(report_data)


def get_helicone_data():
    helicone_api_key = os.getenv("HELICONE_API_KEY")

    url = "https://www.helicone.ai/api/graphql"
    # Replace <KEY> with your personal access key
    transport = AIOHTTPTransport(
        url=url, headers={"authorization": f"Bearer {helicone_api_key}"}
    )

    client = Client(transport=transport, fetch_schema_from_transport=True)

    SIZE = 250

    i = 0

    data = []
    print("Fetching data from Helicone")
    while True:
        query = gql(
            """
            query ExampleQuery($limit: Int, $offset: Int){
              heliconeRequest(
                  limit: $limit
                  offset: $offset
              ) {
                  prompt
                  properties{
                      name
                      value
                  }

                  requestBody
                  response
                  createdAt

              }

            }
            """
        )
        print(f"Fetching {i * SIZE} to {(i + 1) * SIZE} records")
        try:
            result = client.execute(
                query, variable_values={"limit": SIZE, "offset": i * SIZE}
            )
        except Exception as e:
            print(f"Error occurred: {e}")
            result = None

        i += 1

        if result:
            for item in result["heliconeRequest"]:
                properties = {
                    prop["name"]: prop["value"] for prop in item["properties"]
                }
                data.append(
                    {
                        "createdAt": item["createdAt"],
                        "agent": properties.get("agent"),
                        "job_id": properties.get("job_id"),
                        "challenge": properties.get("challenge"),
                        "benchmark_start_time": properties.get("benchmark_start_time"),
                        "prompt": item["prompt"],
                        "model": item["requestBody"].get("model"),
                        "request": item["requestBody"].get("messages"),
                    }
                )

        if not result or (len(result["heliconeRequest"]) == 0):
            print("No more results")
            break

    df = pd.DataFrame(data)
    # Drop rows where agent is None
    df = df.dropna(subset=["agent"])

    # Convert the remaining agent names to lowercase
    df["agent"] = df["agent"].str.lower()

    return df


if os.path.exists("reports_raw.pkl") and os.path.exists("helicone_raw.pkl"):
    reports_df = pd.read_pickle("reports_raw.pkl")
    helicone_df = pd.read_pickle("helicone_raw.pkl")
else:
    reports_df = get_reports()
    reports_df.to_pickle("reports_raw.pkl")
    helicone_df = get_helicone_data()
    helicone_df.to_pickle("helicone_raw.pkl")


def try_formats(date_str):
    formats = ["%Y-%m-%d-%H:%M", "%Y-%m-%dT%H:%M:%S%z"]
    for fmt in formats:
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            pass
    return None


helicone_df["benchmark_start_time"] = pd.to_datetime(
    helicone_df["benchmark_start_time"].apply(try_formats), utc=True
)
helicone_df = helicone_df.dropna(subset=["benchmark_start_time"])
helicone_df["createdAt"] = pd.to_datetime(
    helicone_df["createdAt"], unit="ms", origin="unix"
)
reports_df["benchmark_start_time"] = pd.to_datetime(
    reports_df["benchmark_start_time"].apply(try_formats), utc=True
)
reports_df = reports_df.dropna(subset=["benchmark_start_time"])

assert pd.api.types.is_datetime64_any_dtype(
    helicone_df["benchmark_start_time"]
), "benchmark_start_time in helicone_df is not datetime"
assert pd.api.types.is_datetime64_any_dtype(
    reports_df["benchmark_start_time"]
), "benchmark_start_time in reports_df is not datetime"

reports_df["report_time"] = reports_df["benchmark_start_time"]

df = pd.merge_asof(
    helicone_df.sort_values("benchmark_start_time"),
    reports_df.sort_values("benchmark_start_time"),
    left_on="benchmark_start_time",
    right_on="benchmark_start_time",
    by=["agent", "challenge"],
    direction="backward",
)

df.to_pickle("df.pkl")
print(df.info())
print("Data saved to df.pkl")
print("To load the data use: df = pd.read_pickle('df.pkl')")
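The script's closing messages state how to consume its output. A minimal usage sketch (not part of the commit), assuming reports/match_records.py has already written df.pkl to the working directory:

import pandas as pd

# Load the merged Helicone/report records written by match_records.py.
df = pd.read_pickle("df.pkl")

# merge_asof matched each Helicone request to the most recent earlier report
# (direction="backward") sharing the same agent and challenge, so report fields
# such as success and run_time sit alongside the request metadata.
print(df[["agent", "challenge", "benchmark_start_time", "model", "success"]].head())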
@@ -29,6 +29,15 @@ creds_info = json.loads(creds_string)
 # Define the base directory containing JSON files
 base_dir = "reports"

+# Get the current working directory
+current_dir = os.getcwd()
+
+# Check if the current directory ends with 'reports'
+if current_dir.endswith("reports"):
+    base_dir = "/"
+else:
+    base_dir = "reports"
+
 # Create a list to store each row of data
 rows = []