diff --git a/benchmark/frontend/package-lock.json b/benchmark/frontend/package-lock.json
index f136aa51..cfc254c7 100644
--- a/benchmark/frontend/package-lock.json
+++ b/benchmark/frontend/package-lock.json
@@ -4254,6 +4254,126 @@
       "funding": {
         "url": "https://github.com/sponsors/colinhacks"
       }
+    },
+    "node_modules/@next/swc-darwin-arm64": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-13.4.13.tgz",
+      "integrity": "sha512-ZptVhHjzUuivnXMNCJ6lER33HN7lC+rZ01z+PM10Ows21NHFYMvGhi5iXkGtBDk6VmtzsbqnAjnx4Oz5um0FjA==",
+      "cpu": [
+        "arm64"
+      ],
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-darwin-x64": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-13.4.13.tgz",
+      "integrity": "sha512-t9nTiWCLApw8W4G1kqJyYP7y6/7lyal3PftmRturIxAIBlZss9wrtVN8nci50StDHmIlIDxfguYIEGVr9DbFTg==",
+      "cpu": [
+        "x64"
+      ],
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-linux-arm64-gnu": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-13.4.13.tgz",
+      "integrity": "sha512-xEHUqC8eqR5DHe8SOmMnDU1K3ggrJ28uIKltrQAwqFSSSmzjnN/XMocZkcVhuncuxYrpbri0iMQstRyRVdQVWg==",
+      "cpu": [
+        "arm64"
+      ],
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-linux-arm64-musl": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-13.4.13.tgz",
+      "integrity": "sha512-sNf3MnLAm8rquSSAoeD9nVcdaDeRYOeey4stOWOyWIgbBDtP+C93amSgH/LPTDoUV7gNiU6f+ghepTjTjRgIUQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-linux-x64-gnu": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-13.4.13.tgz",
+      "integrity": "sha512-WhcRaJJSHyx9OWmKjjz+OWHumiPZWRqmM/09Bt7Up4UqUJFFhGExeztR4trtv3rflvULatu9IH/nTV8fUUgaMA==",
+      "cpu": [
+        "x64"
+      ],
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-linux-x64-musl": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-13.4.13.tgz",
+      "integrity": "sha512-+Y4LLhOWWZQIDKVwr2R17lq2KSN0F1c30QVgGIWfnjjHpH8nrIWHEndhqYU+iFuW8It78CiJjQKTw4f51HD7jA==",
+      "cpu": [
+        "x64"
+      ],
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-win32-arm64-msvc": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-13.4.13.tgz",
+      "integrity": "sha512-rWurdOR20uxjfqd1X9vDAgv0Jb26KjyL8akF9CBeFqX8rVaBAnW/Wf6A2gYEwyYY4Bai3T7p1kro6DFrsvBAAw==",
+      "cpu": [
+        "arm64"
+      ],
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-win32-ia32-msvc": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-13.4.13.tgz",
+      "integrity": "sha512-E8bSPwRuY5ibJ3CzLQmJEt8qaWrPYuUTwnrwygPUEWoLzD5YRx9SD37oXRdU81TgGwDzCxpl7z5Nqlfk50xAog==",
+      "cpu": [
+        "ia32"
+      ],
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
     }
   }
 }
diff --git a/benchmark/frontend/prisma/schema.prisma b/benchmark/frontend/prisma/schema.prisma
deleted file mode 100644
index 86993eae..00000000
--- a/benchmark/frontend/prisma/schema.prisma
+++ /dev/null
@@ -1,81 +0,0 @@
-// This is your Prisma schema file,
-// learn more about it in the docs: https://pris.ly/d/prisma-schema
-
-generator client {
-    provider = "prisma-client-js"
-}
-
-datasource db {
-    provider = "sqlite"
-    url      = env("DATABASE_URL")
-}
-
-model Metrics {
-    id             Int     @id @default(autoincrement())
-    difficulty     String
-    success        Boolean
-    successPercent Float
-    runTime        String?
-    failReason     String?
-    Test           Test[]
-}
-
-model MetricsOverall {
-    id                Int         @id @default(autoincrement())
-    runTime           String
-    highestDifficulty String
-    percentage        Float?
-    SuiteTest         SuiteTest[]
-    Report            Report[]
-}
-
-model Test {
-    id            Int       @id @default(autoincrement())
-    dataPath      String
-    isRegression  Boolean
-    answer        String
-    description   String
-    metricsId     Int
-    metrics       Metrics   @relation(fields: [metricsId], references: [id])
-    categoryId    Int?
-    category      Category? @relation(fields: [categoryId], references: [id])
-    task          String?
-    reachedCutoff Boolean?
-}
-
-model SuiteTest {
-    id               Int            @id @default(autoincrement())
-    dataPath         String
-    metricsOverallId Int
-    metricsOverall   MetricsOverall @relation(fields: [metricsOverallId], references: [id])
-    categoryId       Int?
-    category         Category?      @relation(fields: [categoryId], references: [id])
-    task             String?
-    reachedCutoff    Boolean?
-}
-
-model Category {
-    id         Int         @id @default(autoincrement())
-    name       String      @unique
-    tests      Test[]
-    suiteTests SuiteTest[]
-}
-
-model Report {
-    id                 Int            @id @default(autoincrement())
-    command            String
-    completionTime     String
-    benchmarkStartTime String
-    metricsOverallId   Int
-    metricsOverall     MetricsOverall @relation(fields: [metricsOverallId], references: [id])
-    configKey          String
-    configValue        String
-    agentId            Int
-    agent              Agent          @relation(fields: [agentId], references: [id])
-}
-
-model Agent {
-    id      Int      @id @default(autoincrement())
-    name    String   @unique
-    reports Report[]
-}
diff --git a/benchmark/frontend/src/env.mjs b/benchmark/frontend/src/env.mjs
index 67fa7674..22b22581 100644
--- a/benchmark/frontend/src/env.mjs
+++ b/benchmark/frontend/src/env.mjs
@@ -7,7 +7,7 @@ export const env = createEnv({
    * isn't built with invalid env vars.
    */
   server: {
-    DATABASE_URL: z.string().url(),
+    // DATABASE_URL: z.string().url(),
     NODE_ENV: z.enum(["development", "test", "production"]),
   },
 
@@ -25,7 +25,7 @@ export const env = createEnv({
    * middlewares) or client-side so we need to destruct manually.
    */
   runtimeEnv: {
-    DATABASE_URL: process.env.DATABASE_URL,
+    // DATABASE_URL: process.env.DATABASE_URL,
     NODE_ENV: process.env.NODE_ENV,
     // NEXT_PUBLIC_CLIENTVAR: process.env.NEXT_PUBLIC_CLIENTVAR,
   },
diff --git a/benchmark/paper/combined_data.ipynb b/benchmark/paper/combined_data.ipynb
index 1cc98baf..4544c69e 100644
--- a/benchmark/paper/combined_data.ipynb
+++ b/benchmark/paper/combined_data.ipynb
@@ -51,7 +51,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df.dropna(subset=['benchmark_start_time', 'response', 'model'], inplace=True)"
+    "df.dropna(subset=['benchmark_start_time', 'response', 'model', 'agent'], inplace=True)"
    ]
   },
   {
@@ -684,7 +684,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 67,
+   "execution_count": 121,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -708,8 +708,8 @@
     "    return x\n",
     "\n",
     "challenge = \"TestRememberMultipleIds\"\n",
-    "agent_array = ['beebot'] # df['agent'].unique()\n",
-    "request_type = 'request' # 'request' or 'response'\n",
+    "agent_array = df['agent'].unique()\n",
+    "request_type = 'response' # 'request' or 'response'\n",
     "\n",
     "# Loop through unique agents\n",
     "for agent in agent_array:\n",
@@ -1494,7 +1494,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1533,7 +1533,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
diff --git a/benchmark/reports/match_records.py b/benchmark/reports/match_records.py
index cf037823..bd6ce573 100644
--- a/benchmark/reports/match_records.py
+++ b/benchmark/reports/match_records.py
@@ -6,7 +6,55 @@ from gql.transport.aiohttp import AIOHTTPTransport
 from gql import gql, Client
 import os
 
-from agbenchmark.reports.processing.report_types import Report, SuiteTest
+# from agbenchmark.reports.processing.report_types import Report, SuiteTest
+
+from typing import Dict, List, Optional, Union
+
+from pydantic import BaseModel, Field
+
+
+class Metrics(BaseModel):
+    difficulty: str
+    success: bool
+    success_percent: float = Field(..., alias="success_%")
+    run_time: Optional[str] = None
+    fail_reason: Optional[str] = None
+    attempted: Optional[bool] = None
+
+
+class MetricsOverall(BaseModel):
+    run_time: str
+    highest_difficulty: str
+    percentage: Optional[float] = None
+
+
+class Test(BaseModel):
+    data_path: str
+    is_regression: bool
+    answer: str
+    description: str
+    metrics: Metrics
+    category: List[str]
+    task: Optional[str] = None
+    reached_cutoff: Optional[bool] = None
+
+
+class SuiteTest(BaseModel):
+    data_path: str
+    metrics: MetricsOverall
+    tests: Dict[str, Test]
+    category: Optional[List[str]] = None
+    task: Optional[str] = None
+    reached_cutoff: Optional[bool] = None
+
+
+class Report(BaseModel):
+    command: str
+    completion_time: str
+    benchmark_start_time: str
+    metrics: MetricsOverall
+    tests: Dict[str, Union[Test, SuiteTest]]
+    config: Dict[str, str | dict[str, str]]
 
 
 def get_reports():
@@ -31,13 +79,21 @@
         # Check if the item is a directory (an agent directory)
         if os.path.isdir(agent_dir):
             # Construct the path to the report.json file
-            # Use glob to find all run directories in the agent_dir
+            # Get all directories and files, but note that this will also include any file, not just directories.
             run_dirs = glob.glob(os.path.join(agent_dir, "*"))
+            # Get all json files starting with 'file'
+            # old_report_files = glob.glob(os.path.join(agent_dir, "file*.json"))
 
+            # For each run directory, add the report.json to the end
+            # Only include the path if it's actually a directory
             report_files = [
-                os.path.join(run_dir, "report.json") for run_dir in run_dirs
+                os.path.join(run_dir, "report.json")
+                for run_dir in run_dirs
+                if os.path.isdir(run_dir)
             ]
+            # old_report_files already contains the full paths, so no need to join again
+            # report_files = report_files + old_report_files
 
             for report_file in report_files:
                 # Check if the report.json file exists
                 if os.path.isfile(report_file):
@@ -45,6 +101,7 @@
                     with open(report_file, "r") as f:
                         # Load the JSON data from the file
                         json_data = json.load(f)
+                        print(f"Processing {report_file}")
                         report = Report.parse_obj(json_data)
 
                         for test_name, test_data in report.tests.items():
@@ -265,7 +322,7 @@ df = pd.merge(
     helicone_df,
     reports_df,
     on=["benchmark_start_time", "agent", "challenge"],
-    how="left",
+    how="inner",
 )
 
 df.to_pickle("df.pkl")