From 6eeefbd8000033e8fed9459e448ca17391946b90 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Fri, 1 Sep 2023 10:54:35 -0700 Subject: [PATCH] bug fixes, sha frontend, updating file locations --- .github/workflows/ci.yml | 19 +- .gitignore | 3 + frontend | 2 +- match_records.py | 181 --------------- reports/Auto-GPT-Turbo/regression_tests.json | 1 - reports/Auto-GPT-Turbo/success_rate.json | 162 ------------- .../json_to_base_64.py | 0 reports/match_records.py | 214 ++++++++++++++++++ .../send_to_googledrive.py | 9 + 9 files changed, 236 insertions(+), 355 deletions(-) delete mode 100644 match_records.py delete mode 100644 reports/Auto-GPT-Turbo/regression_tests.json delete mode 100644 reports/Auto-GPT-Turbo/success_rate.json rename json_to_base_64.py => reports/json_to_base_64.py (100%) create mode 100644 reports/match_records.py rename send_to_googledrive.py => reports/send_to_googledrive.py (96%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9a5b1c52..232c9cc5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -123,7 +123,6 @@ jobs: sudo dpkg -i google-chrome-stable_current_amd64.deb sudo apt-get install -f - - name: Set up Python ${{ env.min-python-version }} uses: actions/setup-python@v2 with: @@ -198,7 +197,7 @@ jobs: pip install ../../dist/*.whl bash -c "$(curl -fsSL https://raw.githubusercontent.com/merwanehamadi/helicone/b7ab4bc53e51d8ab29fff19ce5986ab7720970c6/mitmproxy.sh)" -s start - + if [ "${GITHUB_EVENT_NAME}" == "pull_request" ] || [ "${{ github.event_name }}" == "push" ]; then set +e # Ignore non-zero exit codes and continue execution echo "Running the following command: ${prefix}agbenchmark start --maintain --mock" @@ -211,25 +210,25 @@ jobs: fi echo "Running the following command: ${prefix}agbenchmark start --mock" ${prefix}agbenchmark start --mock - + echo "Running the following command: ${prefix}agbenchmark start --mock --category=retrieval" ${prefix}agbenchmark start --mock --category=retrieval - 
+ echo "Running the following command: ${prefix}agbenchmark start --mock --category=interface" ${prefix}agbenchmark start --mock --category=interface - + echo "Running the following command: ${prefix}agbenchmark start --mock --category=code" ${prefix}agbenchmark start --mock --category=code - + echo "Running the following command: ${prefix}agbenchmark start --mock --category=memory" ${prefix}agbenchmark start --mock --category=memory - + echo "Running the following command: ${prefix}agbenchmark start --mock --suite TestRevenueRetrieval" ${prefix}agbenchmark start --mock --suite TestRevenueRetrieval - + echo "Running the following command: ${prefix}agbenchmark start --test=TestWriteFile" ${prefix}agbenchmark start --test=TestWriteFile - + cd ../.. poetry install poetry run uvicorn server:app --reload & @@ -285,7 +284,7 @@ jobs: git rebase origin/$current_branch if git push origin HEAD; then echo "Success!" - poetry run python send_to_googledrive.py || echo "Failed to upload to Google Drive" + poetry run python reports/send_to_googledrive.py || echo "Failed to upload to Google Drive" exit 0 else echo "Attempt $(($attempts + 1)) failed. Retrying..." 
diff --git a/.gitignore b/.gitignore index 248d9dc1..f0c630e2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ agbenchmark/workspace/ backend/backend_stdout.txt +reports/df.pkl +reports/df_backwards.pkl +reports/reports_raw.pkl # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/frontend b/frontend index c6a9572b..c5c36627 160000 --- a/frontend +++ b/frontend @@ -1 +1 @@ -Subproject commit c6a9572bed50d2c7febd11eacb15a6685637e606 +Subproject commit c5c36627d6abee95a31b22693682c365c30cd0c5 diff --git a/match_records.py b/match_records.py deleted file mode 100644 index 53f2e206..00000000 --- a/match_records.py +++ /dev/null @@ -1,181 +0,0 @@ -import os -import json -import pandas as pd -import glob -from gql.transport.aiohttp import AIOHTTPTransport -from gql import gql, Client -import os - -def get_reports(): - # Initialize an empty list to store the report data - report_data = [] - - # Specify the path to the reports directory - reports_dir = 'reports' - - # Iterate over all agent directories in the reports directory - for agent_name in os.listdir(reports_dir): - agent_dir = os.path.join(reports_dir, agent_name) - - # Check if the item is a directory (an agent directory) - if os.path.isdir(agent_dir): - # Construct the path to the report.json file - # Use glob to find all run directories in the agent_dir - run_dirs = glob.glob(os.path.join(agent_dir, '*')) - - # For each run directory, add the report.json to the end - report_files = [os.path.join(run_dir, 'report.json') for run_dir in run_dirs] - for report_file in report_files: - # Check if the report.json file exists - if os.path.isfile(report_file): - # Open the report.json file - with open(report_file, 'r') as f: - # Load the JSON data from the file - report = json.load(f) - - # Iterate over all tests in the report - for test_name, test_data in report['tests'].items(): - try: - # Append the relevant data to the report_data list - if agent_name is not None: - report_data.append({ - 'agent': 
agent_name.lower(), - 'benchmark_start_time': report['benchmark_start_time'], - 'challenge': test_name, - 'categories': ', '.join(test_data['category']), - 'task': test_data['task'], - 'success': test_data['metrics']['success'], - 'difficulty': test_data['metrics']['difficulty'], - 'success_%': test_data['metrics']['success_%'], - 'run_time': test_data['metrics']['run_time'] - }) - except KeyError: - pass - return pd.DataFrame(report_data) - - -def get_helicone_data(): - helicone_api_key = os.getenv('HELICONE_API_KEY') - - url = "https://www.helicone.ai/api/graphql" - # Replace with your personal access key - transport = AIOHTTPTransport(url=url, headers={ - "authorization": f"Bearer {helicone_api_key}" - }) - - client = Client(transport=transport, fetch_schema_from_transport=True) - - SIZE = 250 - - i = 0 - - data = [] - print("Fetching data from Helicone") - while True: - query = gql( - """ - query ExampleQuery($limit: Int, $offset: Int){ - heliconeRequest( - limit: $limit - offset: $offset - ) { - prompt - properties{ - name - value - } - - requestBody - response - createdAt - - } - - } - """ - ) - print(f"Fetching {i * SIZE} to {(i + 1) * SIZE} records") - try: - result = client.execute(query, - variable_values={ - "limit": SIZE, - "offset": i * SIZE - } - ) - except Exception as e: - print(f"Error occurred: {e}") - result = None - - - i += 1 - - if result: - for item in result["heliconeRequest"]: - properties = {prop['name']: prop['value'] for prop in item['properties']} - data.append({ - 'createdAt': item['createdAt'], - 'agent': properties.get('agent'), - 'job_id': properties.get('job_id'), - 'challenge': properties.get('challenge'), - 'benchmark_start_time': properties.get('benchmark_start_time'), - 'prompt': item['prompt'], - 'model': item['requestBody'].get('model'), - 'request': item['requestBody'].get('messages'), - }) - - if not result or (len(result["heliconeRequest"]) == 0): - print("No more results") - break - - df = pd.DataFrame(data) - # Drop rows 
where agent is None - df = df.dropna(subset=['agent']) - - # Convert the remaining agent names to lowercase - df['agent'] = df['agent'].str.lower() - - - return df - - - -if os.path.exists('reports_raw.pkl') and os.path.exists('helicone_raw.pkl'): - reports_df = pd.read_pickle('reports_raw.pkl') - helicone_df = pd.read_pickle('helicone_raw.pkl') -else: - reports_df = get_reports() - reports_df.to_pickle('reports_raw.pkl') - helicone_df = get_helicone_data() - helicone_df.to_pickle('helicone_raw.pkl') - -def try_formats(date_str): - formats = ['%Y-%m-%d-%H:%M', '%Y-%m-%dT%H:%M:%S%z'] - for fmt in formats: - try: - return pd.to_datetime(date_str, format=fmt) - except ValueError: - pass - return None - -helicone_df['benchmark_start_time'] = pd.to_datetime(helicone_df['benchmark_start_time'].apply(try_formats), utc=True) -helicone_df = helicone_df.dropna(subset=['benchmark_start_time']) -helicone_df['createdAt'] = pd.to_datetime(helicone_df['createdAt'], unit='ms', origin='unix') -reports_df['benchmark_start_time'] = pd.to_datetime(reports_df['benchmark_start_time'].apply(try_formats), utc=True) -reports_df = reports_df.dropna(subset=['benchmark_start_time']) - -assert pd.api.types.is_datetime64_any_dtype(helicone_df['benchmark_start_time']), "benchmark_start_time in helicone_df is not datetime" -assert pd.api.types.is_datetime64_any_dtype(reports_df['benchmark_start_time']), "benchmark_start_time in reports_df is not datetime" - -reports_df['report_time'] = reports_df['benchmark_start_time'] - -df = pd.merge_asof(helicone_df.sort_values('benchmark_start_time'), - reports_df.sort_values('benchmark_start_time'), - left_on='benchmark_start_time', - right_on='benchmark_start_time', - by=['agent', 'challenge'], - direction='backward') - -df.to_pickle('df.pkl') -print(df.info()) -print("Data saved to df.pkl") -print("To load the data use: df = pd.read_pickle('df.pkl')") \ No newline at end of file diff --git a/reports/Auto-GPT-Turbo/regression_tests.json 
b/reports/Auto-GPT-Turbo/regression_tests.json deleted file mode 100644 index 9e26dfee..00000000 --- a/reports/Auto-GPT-Turbo/regression_tests.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/reports/Auto-GPT-Turbo/success_rate.json b/reports/Auto-GPT-Turbo/success_rate.json deleted file mode 100644 index 37accde9..00000000 --- a/reports/Auto-GPT-Turbo/success_rate.json +++ /dev/null @@ -1,162 +0,0 @@ -{ - "TestAdaptLink": [ - false, - false - ], - "TestAdaptSimpleTypoWithGuidance": [ - false, - false - ], - "TestAdaptTeslaRevenue": [ - false, - false - ], - "TestAgentProtocol_CreateAgentTask": [ - false, - false - ], - "TestAgentProtocol_ExecuteAgentTaskStep": [ - false, - false - ], - "TestAgentProtocol_GetAgentTask": [ - false, - false - ], - "TestAgentProtocol_ListAgentTaskSteps": [ - false, - false - ], - "TestAgentProtocol_ListAgentTasksIds": [ - false, - false - ], - "TestBasicMemory": [ - false, - false - ], - "TestBasicRetrieval": [ - false, - false - ], - "TestDebugMultipleTypo": [ - false, - false - ], - "TestDebugSimpleTypoWithGuidance": [ - false, - false - ], - "TestDebugSimpleTypoWithoutGuidance": [ - false, - false - ], - "TestFunctionCodeGeneration": [ - false, - false - ], - "TestPasswordGenerator_Easy": [ - false, - false - ], - "TestPlanCreation": [ - true, - true - ], - "TestProductAdvisor_GamingMonitor": [ - false, - false - ], - "TestReadFile": [ - true, - true - ], - "TestRememberGoal_Advanced": [ - false, - false - ], - "TestRememberGoal_Hard": [ - false, - false - ], - "TestRememberGoal_Medium": [ - false, - false - ], - "TestRememberGoal_Simple": [ - false, - false - ], - "TestRememberMultipleIds": [ - false, - false - ], - "TestRememberMultipleIdsWithNoise": [ - false, - false - ], - "TestRememberMultiplePhrasesWithNoise": [ - false, - false - ], - "TestRetrieval3": [ - false, - false - ], - "TestReturnCode_Modify": [ - false, - false - ], - "TestReturnCode_Simple": [ - false, - false - ], - 
"TestReturnCode_Tests": [ - false, - false - ], - "TestReturnCode_Write": [ - false, - false - ], - "TestRevenueRetrieval_1.0": [ - false, - false - ], - "TestRevenueRetrieval_1.1": [ - false, - false - ], - "TestRevenueRetrieval_1.2": [ - false, - false - ], - "TestSearch": [ - false, - false - ], - "TestThreeSum": [ - false, - false - ], - "TestWebApp_ListAnimals": [ - false, - false - ], - "TestWrite5FilesWithArray": [ - false, - false - ], - "TestWrite6Files": [ - false, - false - ], - "TestWriteFile": [ - true, - true - ], - "TestWritingCLI_FileOrganizer": [ - false, - false - ] -} \ No newline at end of file diff --git a/json_to_base_64.py b/reports/json_to_base_64.py similarity index 100% rename from json_to_base_64.py rename to reports/json_to_base_64.py diff --git a/reports/match_records.py b/reports/match_records.py new file mode 100644 index 00000000..e8f3ba54 --- /dev/null +++ b/reports/match_records.py @@ -0,0 +1,214 @@ +import os +import json +import pandas as pd +import glob +from gql.transport.aiohttp import AIOHTTPTransport +from gql import gql, Client +import os + + +def get_reports(): + # Initialize an empty list to store the report data + report_data = [] + + # Get the current working directory + current_dir = os.getcwd() + + # When already inside the 'reports' directory, the agent folders are in the current directory (".", not the filesystem root "/") + if current_dir.endswith("reports"): + reports_dir = "." + else: + reports_dir = "reports" + + # Iterate over all agent directories in the reports directory + for agent_name in os.listdir(reports_dir): + agent_dir = os.path.join(reports_dir, agent_name) + + # Check if the item is a directory (an agent directory) + if os.path.isdir(agent_dir): + # Construct the path to the report.json file + # Use glob to find all run directories in the agent_dir + run_dirs = glob.glob(os.path.join(agent_dir, "*")) + + # For each run directory, add the report.json to the end + report_files = [ + os.path.join(run_dir, "report.json") for run_dir in run_dirs + ] + for report_file in 
report_files: + # Check if the report.json file exists + if os.path.isfile(report_file): + # Open the report.json file + with open(report_file, "r") as f: + # Load the JSON data from the file + report = json.load(f) + + # Iterate over all tests in the report + for test_name, test_data in report["tests"].items(): + try: + # Append the relevant data to the report_data list + if agent_name is not None: + report_data.append( + { + "agent": agent_name.lower(), + "benchmark_start_time": report[ + "benchmark_start_time" + ], + "challenge": test_name, + "categories": ", ".join( + test_data["category"] + ), + "task": test_data["task"], + "success": test_data["metrics"]["success"], + "difficulty": test_data["metrics"][ + "difficulty" + ], + "success_%": test_data["metrics"][ + "success_%" + ], + "run_time": test_data["metrics"][ + "run_time" + ], + } + ) + except KeyError: + pass  # best effort: a test entry missing any expected key is skipped + return pd.DataFrame(report_data) + + +def get_helicone_data(): + helicone_api_key = os.getenv("HELICONE_API_KEY") + + url = "https://www.helicone.ai/api/graphql" + # Authenticate with the key read from the HELICONE_API_KEY environment variable + transport = AIOHTTPTransport( + url=url, headers={"authorization": f"Bearer {helicone_api_key}"} + ) + + client = Client(transport=transport, fetch_schema_from_transport=True) + + SIZE = 250 + + i = 0 + + data = [] + print("Fetching data from Helicone") + while True: + query = gql( + """ + query ExampleQuery($limit: Int, $offset: Int){ + heliconeRequest( + limit: $limit + offset: $offset + ) { + prompt + properties{ + name + value + } + + requestBody + response + createdAt + + } + + } + """ + ) + print(f"Fetching {i * SIZE} to {(i + 1) * SIZE} records") + try: + result = client.execute( + query, variable_values={"limit": SIZE, "offset": i * SIZE} + ) + except Exception as e: + print(f"Error occurred: {e}") + result = None + + i += 1 + + if result: + for item in result["heliconeRequest"]: + properties = { + prop["name"]: prop["value"] for prop in item["properties"] + } + data.append( + { + 
"createdAt": item["createdAt"], + "agent": properties.get("agent"), + "job_id": properties.get("job_id"), + "challenge": properties.get("challenge"), + "benchmark_start_time": properties.get("benchmark_start_time"), + "prompt": item["prompt"], + "model": item["requestBody"].get("model"), + "request": item["requestBody"].get("messages"), + } + ) + + if not result or (len(result["heliconeRequest"]) == 0): + print("No more results") + break + + df = pd.DataFrame(data) + # Drop rows where agent is None + df = df.dropna(subset=["agent"]) + + # Convert the remaining agent names to lowercase + df["agent"] = df["agent"].str.lower() + + return df + + +if os.path.exists("reports_raw.pkl") and os.path.exists("helicone_raw.pkl"): + reports_df = pd.read_pickle("reports_raw.pkl") + helicone_df = pd.read_pickle("helicone_raw.pkl") +else: + reports_df = get_reports() + reports_df.to_pickle("reports_raw.pkl") + helicone_df = get_helicone_data() + helicone_df.to_pickle("helicone_raw.pkl") + + +def try_formats(date_str): + formats = ["%Y-%m-%d-%H:%M", "%Y-%m-%dT%H:%M:%S%z"] + for fmt in formats: + try: + return pd.to_datetime(date_str, format=fmt) + except ValueError: + pass + return None + + +helicone_df["benchmark_start_time"] = pd.to_datetime( + helicone_df["benchmark_start_time"].apply(try_formats), utc=True +) +helicone_df = helicone_df.dropna(subset=["benchmark_start_time"]) +helicone_df["createdAt"] = pd.to_datetime( + helicone_df["createdAt"], unit="ms", origin="unix" +) +reports_df["benchmark_start_time"] = pd.to_datetime( + reports_df["benchmark_start_time"].apply(try_formats), utc=True +) +reports_df = reports_df.dropna(subset=["benchmark_start_time"]) + +assert pd.api.types.is_datetime64_any_dtype( + helicone_df["benchmark_start_time"] +), "benchmark_start_time in helicone_df is not datetime" +assert pd.api.types.is_datetime64_any_dtype( + reports_df["benchmark_start_time"] +), "benchmark_start_time in reports_df is not datetime" + +reports_df["report_time"] = 
reports_df["benchmark_start_time"] + +df = pd.merge_asof( + helicone_df.sort_values("benchmark_start_time"), + reports_df.sort_values("benchmark_start_time"), + left_on="benchmark_start_time", + right_on="benchmark_start_time", + by=["agent", "challenge"], + direction="backward", +) + +df.to_pickle("df.pkl") +print(df.info()) +print("Data saved to df.pkl") +print("To load the data use: df = pd.read_pickle('df.pkl')") diff --git a/send_to_googledrive.py b/reports/send_to_googledrive.py similarity index 96% rename from send_to_googledrive.py rename to reports/send_to_googledrive.py index e212e6bc..4131a445 100644 --- a/send_to_googledrive.py +++ b/reports/send_to_googledrive.py @@ -29,6 +29,15 @@ creds_info = json.loads(creds_string) # Define the base directory containing JSON files base_dir = "reports" +# Get the current working directory +current_dir = os.getcwd() + +# When already inside the 'reports' directory, the JSON files are under the current directory (".", not the filesystem root "/") +if current_dir.endswith("reports"): + base_dir = "." +else: + base_dir = "reports" + # Create a list to store each row of data rows = []