diff --git a/benchmark/agbenchmark/__init__.py b/benchmark/agbenchmark/__init__.py
index 3a720e44..9cb4af9f 100644
--- a/benchmark/agbenchmark/__init__.py
+++ b/benchmark/agbenchmark/__init__.py
@@ -1,10 +1,12 @@
-from pathlib import Path
-
 import json
+from datetime import datetime, timezone
+from pathlib import Path
 
 from .reports.ReportManager import ReportManager
 from .utils.data_types import AgentBenchmarkConfig
 
+BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
+
 
 def get_agent_benchmark_config() -> AgentBenchmarkConfig:
     agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
@@ -24,18 +26,19 @@ def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
     agent_benchmark_config = get_agent_benchmark_config()
     # tests that consistently pass are considered regression tests
     REGRESSION_MANAGER = ReportManager(
-        agent_benchmark_config.get_regression_reports_path()
+        agent_benchmark_config.get_regression_reports_path(), BENCHMARK_START_TIME
     )
 
     # print(f"Using {REPORTS_PATH} for reports")
     # user facing reporting information
     INFO_MANAGER = ReportManager(
-        str(agent_benchmark_config.get_reports_path() / "report.json")
+        str(agent_benchmark_config.get_reports_path() / "report.json"),
+        BENCHMARK_START_TIME,
     )
 
     # internal db step in replacement track pass/fail rate
     INTERNAL_INFO_MANAGER = ReportManager(
-        agent_benchmark_config.get_success_rate_path()
+        agent_benchmark_config.get_success_rate_path(), BENCHMARK_START_TIME
     )
 
     return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
diff --git a/benchmark/agbenchmark/__main__.py b/benchmark/agbenchmark/__main__.py
index 8f8a8372..3b1f4374 100644
--- a/benchmark/agbenchmark/__main__.py
+++ b/benchmark/agbenchmark/__main__.py
@@ -11,10 +11,9 @@ import pytest
 import toml
 from helicone.lock import HeliconeLockManager
 
+from agbenchmark import BENCHMARK_START_TIME
 from agbenchmark.utils.data_types import AgentBenchmarkConfig
 
-BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
-
 if os.environ.get("HELICONE_API_KEY"):
     HeliconeLockManager.write_custom_property(
         "benchmark_start_time", BENCHMARK_START_TIME
     )
@@ -58,6 +57,7 @@ def run_benchmark(
     mock: bool = False,
     no_dep: bool = False,
     nc: bool = False,
+    keep_answers: bool = False,
     category: Optional[list[str]] = None,
     skip_category: Optional[list[str]] = None,
     test: Optional[str] = None,
@@ -98,6 +98,9 @@ def run_benchmark(
         print(f"{key}: {value}")
 
     pytest_args = ["-vs"]
+    if keep_answers:
+        pytest_args.append("--keep-answers")
+
     if test:
         print("Running specific test:", test)
         pytest_args.extend(["-k", test, "--test"])
@@ -187,6 +190,7 @@ def cli() -> None:
     help="Run without dependencies",
 )
 @click.option("--nc", is_flag=True, help="Run without cutoff")
+@click.option("--keep-answers", is_flag=True, help="Keep answers")
 @click.option("--cutoff", help="Set or override tests cutoff (seconds)")
 def start(
     maintain: bool,
@@ -195,6 +199,7 @@ def start(
     mock: bool,
     no_dep: bool,
     nc: bool,
+    keep_answers: bool,
     category: Optional[list[str]] = None,
     skip_category: Optional[list[str]] = None,
     test: Optional[str] = None,
@@ -215,6 +220,7 @@ def start(
         mock=mock,
         no_dep=no_dep,
         nc=nc,
+        keep_answers=keep_answers,
         category=category,
         skip_category=skip_category,
         test=test,
@@ -231,6 +237,7 @@ def start(
         mock=mock,
         no_dep=no_dep,
         nc=nc,
+        keep_answers=keep_answers,
         category=category,
         skip_category=skip_category,
         test=test,
diff --git a/benchmark/agbenchmark/conftest.py b/benchmark/agbenchmark/conftest.py
index 03520b9b..8edb38b5 100644
--- a/benchmark/agbenchmark/conftest.py
+++ b/benchmark/agbenchmark/conftest.py
@@ -186,6 +186,7 @@ def pytest_addoption(parser: Any) -> None:
    The "--explore" option is used to run the tests in exploration mode.
    The "--test" option is used to run a specific test.
    The "--no_dep" option is used to run the tests without dependencies.
+   The "--keep-answers" option is used to keep the answers of the tests.

    Args:
        parser (Any): The parser object to which the command-line options are added.
@@ -201,6 +202,7 @@ def pytest_addoption(parser: Any) -> None:
     parser.addoption("--improve", action="store_true", default=False)
     parser.addoption("--maintain", action="store_true", default=False)
     parser.addoption("--explore", action="store_true", default=False)
+    parser.addoption("--keep-answers", action="store_true", default=False)
 
 
 @pytest.fixture(autouse=True)
@@ -313,7 +315,7 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
         )
 
     if call.when == "call":
-        answers = getattr(item, 'answers', None)
+        answers = getattr(item, "answers", None)
         generate_single_call_report(item, call, challenge_data, answers)
 
     if call.when == "teardown":
diff --git a/benchmark/agbenchmark/generate_test.py b/benchmark/agbenchmark/generate_test.py
index c0701024..9e8390e1 100644
--- a/benchmark/agbenchmark/generate_test.py
+++ b/benchmark/agbenchmark/generate_test.py
@@ -77,7 +77,9 @@ def create_single_test(
         await self.setup_challenge(config, timeout)
 
         scores = self.get_scores(config)
-        request.node.answers = scores["answers"]  # store answers in request.node
+        request.node.answers = (
+            scores["answers"] if "--keep-answers" in sys.argv else None
+        )
         del scores["answers"]  # remove answers from scores
         request.node.scores = scores  # store scores in request.node
         assert 1 in scores["values"]
diff --git a/benchmark/agbenchmark/reports/ReportManager.py b/benchmark/agbenchmark/reports/ReportManager.py
index 4c96af52..642cfcea 100644
--- a/benchmark/agbenchmark/reports/ReportManager.py
+++ b/benchmark/agbenchmark/reports/ReportManager.py
@@ -4,7 +4,6 @@ import sys
 import time
 from datetime import datetime, timezone
 
-from agbenchmark.__main__ import BENCHMARK_START_TIME
 from agbenchmark.reports.processing.graphs import save_single_radar_chart
 from agbenchmark.reports.processing.process_report import get_agent_category
 from agbenchmark.reports.processing.report_types import Report
@@ -15,9 +14,11 @@ from agbenchmark.utils.utils import get_highest_success_difficulty
 class ReportManager:
     """Abstracts interaction with the regression tests file"""
 
-    def __init__(self, filename: str):
+    def __init__(self, filename: str, benchmark_start_time: str):
         self.filename = filename
         self.start_time = time.time()
+        self.benchmark_start_time = benchmark_start_time
+
         self.load()
 
     def load(self) -> None:
@@ -70,7 +71,7 @@ class ReportManager:
             "completion_time": datetime.now(timezone.utc).strftime(
                 "%Y-%m-%dT%H:%M:%S+00:00"
             ),
-            "benchmark_start_time": BENCHMARK_START_TIME,
+            "benchmark_start_time": self.benchmark_start_time,
             "metrics": {
                 "run_time": str(round(time.time() - self.start_time, 2)) + " seconds",
                 "highest_difficulty": get_highest_success_difficulty(self.tests),
diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py
index 13fe4328..d607d222 100644
--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/agbenchmark/reports/reports.py
@@ -89,7 +89,7 @@ def generate_single_call_report(
     }
     if answers:
         info_details["answers"] = answers
-
+
     if "metadata" in challenge_data:
         info_details["metadata"] = challenge_data["metadata"]
 
diff --git a/benchmark/agbenchmark/utils/data_types.py b/benchmark/agbenchmark/utils/data_types.py
index 5421c0a1..c01d3c1c 100644
--- a/benchmark/agbenchmark/utils/data_types.py
+++ b/benchmark/agbenchmark/utils/data_types.py
@@ -17,9 +17,12 @@ class DifficultyLevel(Enum):
     expert = "expert"
     human = "human"
 
+
 class Workspace(BaseModel):
     input: str
     output: str
+
+
 # map from enum to difficulty level (numeric)
 DIFFICULTY_MAP = {
     DifficultyLevel.interface: 1,
diff --git a/benchmark/agbenchmark/utils/get_data_from_helicone.py b/benchmark/agbenchmark/utils/get_data_from_helicone.py
index 792fa995..f99a49c6 100644
--- a/benchmark/agbenchmark/utils/get_data_from_helicone.py
+++ b/benchmark/agbenchmark/utils/get_data_from_helicone.py
@@ -4,6 +4,7 @@ from typing import Optional
 
 import requests
 
+from agbenchmark import BENCHMARK_START_TIME
 from agbenchmark.agent_interface import HELICONE_GRAPHQL_LOGS
 
 
@@ -30,7 +31,7 @@ query ExampleQuery($properties: [PropertyFilter!]){
             "name": "agent",
         },
         {
-            "value": {"equals": agbenchmark.start_agbenchmark.BENCHMARK_START_TIME},
+            "value": {"equals": BENCHMARK_START_TIME},
             "name": "benchmark_start_time",
         },
         {"value": {"equals": challenge}, "name": "challenge"},