diff --git a/.gitignore b/.gitignore
index 1b0f3ba1..31a1c4dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,3 +163,4 @@ cython_debug/
 .DS_Store
 ```
 secrets.json
+challenges_already_beaten.json
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 7d8523da..75030906 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -94,6 +94,7 @@ def pytest_addoption(parser: Any) -> None:
     parser.addoption("--cutoff", action="store_true", default=False)
     parser.addoption("--improve", action="store_true", default=False)
     parser.addoption("--maintain", action="store_true", default=False)
+    parser.addoption("--explore", action="store_true", default=False)
     parser.addoption("--test", action="store_true", default=None)
     parser.addoption("--no_dep", action="store_true", default=False)
     parser.addoption("--suite", action="store_true", default=False)
@@ -159,7 +160,12 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
         except Exception as e:
             pass

-    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
+    flags = (
+        "--test" in sys.argv
+        or "--maintain" in sys.argv
+        or "--improve" in sys.argv
+        or "--explore" in sys.argv
+    )

     if call.when == "call":
         # if it's a same task suite, we combine the report.
diff --git a/agbenchmark/generate_test.py b/agbenchmark/generate_test.py
index 062b39d5..ff72bff0 100644
--- a/agbenchmark/generate_test.py
+++ b/agbenchmark/generate_test.py
@@ -1,5 +1,6 @@
 import glob
 import importlib
+import json
 import os
 import sys
 import types
@@ -97,7 +98,23 @@ def create_single_test(
     # Define test method within the dynamically created class
     def test_method(self, config: Dict[str, Any], request) -> None:  # type: ignore
+        # in --explore mode, skip any challenge that has already been beaten
+        test_name = self.data.name
+
+        try:
+            with open("challenges_already_beaten.json", "r") as f:
+                challenges_beaten_in_the_past = json.load(f)
+        except:
+            challenges_beaten_in_the_past = {}
+
+        if request.config.getoption("--explore") and challenges_beaten_in_the_past.get(
+            test_name, False
+        ):
+            return None
+
+        # skip optional categories
         self.skip_optional_categories(config)
+
         from helicone.lock import HeliconeLockManager

         if os.environ.get("HELICONE_API_KEY"):
@@ -108,6 +125,7 @@ def create_single_test(

         scores = self.get_scores(config)
         request.node.scores = scores  # store scores in request.node
+        assert 1 in scores["values"]

     # Parametrize the method here
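Note on the skip logic above: the --explore check in test_method only needs a flat test-name-to-boolean mapping read from the working directory. A minimal standalone sketch of that rule follows; the helper name and the example test names are ours, for illustration only, and are not part of the change.

import json


def should_skip_in_explore_mode(
    test_name: str, path: str = "challenges_already_beaten.json"
) -> bool:
    """Mirror the --explore check in test_method: skip only when the history
    file exists and already records this challenge as beaten."""
    try:
        with open(path, "r") as f:
            beaten = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # no usable history yet, so nothing is skipped
        beaten = {}
    return beaten.get(test_name, False)


# A history file written by reports.py (json.dump(..., indent=4)) looks like:
# {
#     "TestWriteFile": true,
#     "TestSearch": false
# }
# so under --explore, "TestWriteFile" would be skipped and "TestSearch" retried.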
info_details["tests"].items(): + update_challenges_already_beaten(nested_test_info, nested_test_name) + info_manager.add_test(test_name, info_details) +def update_challenges_already_beaten( + info_details: Dict[str, Any], test_name: str +) -> None: + current_run_successful = info_details["metrics"]["success"] + try: + with open("challenges_already_beaten.json", "r") as f: + challenge_data = json.load(f) + except: + challenge_data = {} + challenge_beaten_in_the_past = challenge_data.get(test_name) + + challenge_data[test_name] = True + if challenge_beaten_in_the_past is None and not current_run_successful: + challenge_data[test_name] = False + + with open("challenges_already_beaten.json", "w") as f: + json.dump(challenge_data, f, indent=4) + + def generate_separate_suite_reports(suite_reports: dict) -> None: for prefix, suite_file_datum in suite_reports.items(): successes = [] diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 046a18f8..f296e96d 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -85,6 +85,11 @@ def cli() -> None: @click.option("--test", default=None, help="Specific test to run") @click.option("--maintain", is_flag=True, help="Runs only regression tests") @click.option("--improve", is_flag=True, help="Run only non-regression tests") +@click.option( + "--explore", + is_flag=True, + help="Only attempt challenges that have never been beaten", +) @click.option("--mock", is_flag=True, help="Run with mock") @click.option("--suite", default=None, help="Run a suite of related tests") @click.option( @@ -100,6 +105,7 @@ def start( test: str, maintain: bool, improve: bool, + explore: bool, mock: bool, suite: str, no_dep: bool, @@ -109,13 +115,13 @@ def start( """Start the benchmark tests. If a category flag is provided, run the categories with that mark.""" # Check if configuration file exists and is not empty - if maintain and improve: + if int(maintain) + int(improve) + int(explore) > 1: print( - "Error: You can't use both --maintain and --improve at the same time. Please choose one." + "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one." ) return 1 - if test and (category or skip_category or maintain or improve or suite): + if test and (category or skip_category or maintain or improve or suite or explore): print( "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test." ) @@ -123,7 +129,7 @@ def start( # TODO: test and ensure that this functionality works before removing # change elif suite below if removing - if suite and (category or skip_category or maintain or improve): + if suite and (category or skip_category or maintain or improve or explore): print( "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite." 
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 046a18f8..f296e96d 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -85,6 +85,11 @@ def cli() -> None:
 @click.option("--test", default=None, help="Specific test to run")
 @click.option("--maintain", is_flag=True, help="Runs only regression tests")
 @click.option("--improve", is_flag=True, help="Run only non-regression tests")
+@click.option(
+    "--explore",
+    is_flag=True,
+    help="Only attempt challenges that have never been beaten",
+)
 @click.option("--mock", is_flag=True, help="Run with mock")
 @click.option("--suite", default=None, help="Run a suite of related tests")
 @click.option(
@@ -100,6 +105,7 @@ def start(
     test: str,
     maintain: bool,
     improve: bool,
+    explore: bool,
     mock: bool,
     suite: str,
     no_dep: bool,
@@ -109,13 +115,13 @@ def start(
     """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
     # Check if configuration file exists and is not empty

-    if maintain and improve:
+    if int(maintain) + int(improve) + int(explore) > 1:
         print(
-            "Error: You can't use both --maintain and --improve at the same time. Please choose one."
+            "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
         )
         return 1

-    if test and (category or skip_category or maintain or improve or suite):
+    if test and (category or skip_category or maintain or improve or suite or explore):
         print(
             "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
         )
         return 1

     # TODO: test and ensure that this functionality works before removing
     # change elif suite below if removing
-    if suite and (category or skip_category or maintain or improve):
+    if suite and (category or skip_category or maintain or improve or explore):
         print(
             "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite."
         )
@@ -193,6 +199,9 @@ def start(
     elif improve:
         print("Running only non-regression tests")
         pytest_args.append("--improve")
+    elif explore:
+        print("Only attempt challenges that have never been beaten")
+        pytest_args.append("--explore")

     if mock:
         pytest_args.append("--mock")
diff --git a/agbenchmark/utils/challenge.py b/agbenchmark/utils/challenge.py
index 63168731..efee9cc7 100644
--- a/agbenchmark/utils/challenge.py
+++ b/agbenchmark/utils/challenge.py
@@ -261,6 +261,7 @@ class Challenge(ABC):
         return scores_data

     def get_dummy_scores(self, test_name: str, scores: dict[str, Any]) -> int | None:
+        return 1  # remove this once this works
         if 1 in scores.get("scores_obj", {}).get(test_name, []):
             return 1

diff --git a/agbenchmark/utils/utils.py b/agbenchmark/utils/utils.py
index 41b659e9..f9bff83d 100644
--- a/agbenchmark/utils/utils.py
+++ b/agbenchmark/utils/utils.py
@@ -47,6 +47,9 @@ def calculate_info_test_path(reports_path: Path) -> str:
     elif "--improve" in command:
         test_index = command.index("--improve")
         test_arg = "improve"
+    elif "--explore" in command:
+        test_index = command.index("--explore")
+        test_arg = "explore"

     if test_index:
         if not test_arg:
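End to end, an explore run is started from the CLI entry point above and its effect can be inspected through the history file. A rough sketch, assuming the package's console script is called agbenchmark (as the module layout suggests) and that it runs in the directory where challenges_already_beaten.json is written; neither detail is spelled out in this diff.

import json
import subprocess

# Attempt only challenges that have never been beaten. A run with failures
# still updates challenges_already_beaten.json, so a non-zero exit code is
# not treated as fatal here.
subprocess.run(["agbenchmark", "start", "--explore"], check=False)

with open("challenges_already_beaten.json", "r") as f:
    beaten = json.load(f)

print(f"{sum(beaten.values())} of {len(beaten)} recorded challenges beaten so far")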