Mirror of https://github.com/aljazceru/Auto-GPT.git (synced 2025-12-29 03:44:28 +01:00)
Implement the 'explore' mode (#284)
.gitignore (vendored), 1 line changed
@@ -163,3 +163,4 @@ cython_debug/
.DS_Store
```
secrets.json
challenges_already_beaten.json
@@ -94,6 +94,7 @@ def pytest_addoption(parser: Any) -> None:
    parser.addoption("--cutoff", action="store_true", default=False)
    parser.addoption("--improve", action="store_true", default=False)
    parser.addoption("--maintain", action="store_true", default=False)
    parser.addoption("--explore", action="store_true", default=False)
    parser.addoption("--test", action="store_true", default=None)
    parser.addoption("--no_dep", action="store_true", default=False)
    parser.addoption("--suite", action="store_true", default=False)
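
For context on how this new flag is consumed later in the diff, the following is a minimal, hypothetical sketch (not the project's files) of the two halves of the mechanism: a pytest_addoption hook that registers a store_true flag, and a test that reads it back through the built-in request fixture via request.config.getoption, which is the call the explore mode relies on.

from typing import Any


# conftest.py (sketch): register the flag; pytest stores False unless it is passed.
def pytest_addoption(parser: Any) -> None:
    parser.addoption("--explore", action="store_true", default=False)


# test_example.py (sketch): any test can read the parsed value back through the
# built-in `request` fixture.
def test_reads_flag(request: Any) -> None:
    explore_mode = request.config.getoption("--explore")
    assert explore_mode in (True, False)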
@@ -159,7 +160,12 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
    except Exception as e:
        pass

    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
    flags = (
        "--test" in sys.argv
        or "--maintain" in sys.argv
        or "--improve" in sys.argv
        or "--explore" in sys.argv
    )

    if call.when == "call":
        # if it's a same task suite, we combine the report.
@@ -1,5 +1,6 @@
import glob
import importlib
import json
import os
import sys
import types
@@ -97,7 +98,23 @@ def create_single_test(

    # Define test method within the dynamically created class
    def test_method(self, config: Dict[str, Any], request) -> None:  # type: ignore
        # create a random number between 0 and 1
        test_name = self.data.name

        try:
            with open("challenges_already_beaten.json", "r") as f:
                challenges_beaten_in_the_past = json.load(f)
        except:
            challenges_beaten_in_the_past = {}

        if request.config.getoption("--explore") and challenges_beaten_in_the_past.get(
            test_name, False
        ):
            return None

        # skip optional categories
        self.skip_optional_categories(config)

        from helicone.lock import HeliconeLockManager

        if os.environ.get("HELICONE_API_KEY"):
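
The skip decision above depends only on the shape of challenges_already_beaten.json, which (per the writer further down in this diff) is a flat JSON object mapping test names to booleans. A minimal sketch of that check in isolation, using made-up test names purely for illustration:

import json

# Hypothetical contents of challenges_already_beaten.json (illustrative names only).
challenges_beaten_in_the_past = json.loads(
    '{"TestWriteFile": true, "TestDebugSimpleTypoWithGuidance": false}'
)


def should_skip(test_name: str, explore: bool) -> bool:
    # In explore mode, anything already recorded as beaten is skipped;
    # unknown or never-beaten challenges (missing key or false) still run.
    return explore and challenges_beaten_in_the_past.get(test_name, False)


assert should_skip("TestWriteFile", explore=True) is True
assert should_skip("TestDebugSimpleTypoWithGuidance", explore=True) is False
assert should_skip("TestWriteFile", explore=False) is False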
@@ -108,6 +125,7 @@ def create_single_test(

        scores = self.get_scores(config)
        request.node.scores = scores  # store scores in request.node

        assert 1 in scores["values"]

    # Parametrize the method here
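
For context on the assertion just above: get_scores appears to return a mapping with at least a "values" list (and, per get_dummy_scores further down in this diff, a "scores_obj" mapping), so the check passes as soon as any sub-score equals 1. A hypothetical illustration of that shape, not taken from the project:

# Hypothetical shape of the scores mapping (illustrative values only).
scores = {
    "values": [0, 1],
    "scores_obj": {"TestExample": [0, 1]},
}

# The test passes as soon as any recorded value is a full score of 1.
assert 1 in scores["values"]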
@@ -2,7 +2,7 @@ import json
import os
import sys
from pathlib import Path
from typing import Any
from typing import Any, Dict

from agbenchmark.agent_interface import MOCK_FLAG
from agbenchmark.reports.ReportManager import ReportManager
@@ -144,7 +144,11 @@ def update_regression_tests(
def generate_single_call_report(
    item: Any, call: Any, challenge_data: dict[str, Any]
) -> None:
    difficulty = challenge_data["info"]["difficulty"]

    try:
        difficulty = challenge_data["info"]["difficulty"]
    except KeyError:
        return None

    if isinstance(difficulty, DifficultyLevel):
        difficulty = difficulty.value
@@ -222,9 +226,33 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:

    info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]

    update_challenges_already_beaten(info_details, test_name)
    if info_details.get("tests") is not None:
        for nested_test_name, nested_test_info in info_details["tests"].items():
            update_challenges_already_beaten(nested_test_info, nested_test_name)

    info_manager.add_test(test_name, info_details)


def update_challenges_already_beaten(
    info_details: Dict[str, Any], test_name: str
) -> None:
    current_run_successful = info_details["metrics"]["success"]
    try:
        with open("challenges_already_beaten.json", "r") as f:
            challenge_data = json.load(f)
    except:
        challenge_data = {}
    challenge_beaten_in_the_past = challenge_data.get(test_name)

    challenge_data[test_name] = True
    if challenge_beaten_in_the_past is None and not current_run_successful:
        challenge_data[test_name] = False

    with open("challenges_already_beaten.json", "w") as f:
        json.dump(challenge_data, f, indent=4)


def generate_separate_suite_reports(suite_reports: dict) -> None:
    for prefix, suite_file_datum in suite_reports.items():
        successes = []
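
The bookkeeping rule in update_challenges_already_beaten can be summarized without the file I/O: a challenge is recorded as beaten unless this is the first time it appears and the current run failed. Below is a hypothetical in-memory sketch of that rule (the helper name and test name are made up), with two representative cases:

from typing import Dict, Optional


def record_result(history: Dict[str, bool], test_name: str, success: bool) -> None:
    # Mirrors the update logic above, minus the JSON file round-trip.
    seen_before: Optional[bool] = history.get(test_name)
    history[test_name] = True
    if seen_before is None and not success:
        history[test_name] = False


history: Dict[str, bool] = {}
record_result(history, "TestExample", success=False)  # first attempt fails
assert history == {"TestExample": False}
record_result(history, "TestExample", success=True)  # later attempt succeeds
assert history == {"TestExample": True}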
@@ -85,6 +85,11 @@ def cli() -> None:
@click.option("--test", default=None, help="Specific test to run")
@click.option("--maintain", is_flag=True, help="Runs only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option(
    "--explore",
    is_flag=True,
    help="Only attempt challenges that have never been beaten",
)
@click.option("--mock", is_flag=True, help="Run with mock")
@click.option("--suite", default=None, help="Run a suite of related tests")
@click.option(
@@ -100,6 +105,7 @@ def start(
    test: str,
    maintain: bool,
    improve: bool,
    explore: bool,
    mock: bool,
    suite: str,
    no_dep: bool,
@@ -109,13 +115,13 @@ def start(
    """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
    # Check if configuration file exists and is not empty

    if maintain and improve:
    if int(maintain) + int(improve) + int(explore) > 1:
        print(
            "Error: You can't use both --maintain and --improve at the same time. Please choose one."
            "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
        )
        return 1

    if test and (category or skip_category or maintain or improve or suite):
    if test and (category or skip_category or maintain or improve or suite or explore):
        print(
            "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
        )
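
Because click passes is_flag options through as plain booleans, summing their int() values is a compact way to enforce that at most one mode flag is set (int(True) == 1, int(False) == 0). The following is a standalone toy command illustrating the same guard; it is not the benchmark's actual CLI, and the command and option names are only examples:

import click


@click.command()
@click.option("--maintain", is_flag=True)
@click.option("--improve", is_flag=True)
@click.option("--explore", is_flag=True)
def run(maintain: bool, improve: bool, explore: bool) -> None:
    # The sum counts how many mutually exclusive mode flags were passed.
    if int(maintain) + int(improve) + int(explore) > 1:
        raise click.UsageError("Pass at most one of --maintain, --improve, --explore.")
    click.echo(f"maintain={maintain} improve={improve} explore={explore}")


if __name__ == "__main__":
    run()  # e.g. `python toy_cli.py --explore`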
@@ -123,7 +129,7 @@ def start(

    # TODO: test and ensure that this functionality works before removing
    # change elif suite below if removing
    if suite and (category or skip_category or maintain or improve):
    if suite and (category or skip_category or maintain or improve or explore):
        print(
            "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite."
        )
@@ -193,6 +199,9 @@ def start(
    elif improve:
        print("Running only non-regression tests")
        pytest_args.append("--improve")
    elif explore:
        print("Only attempt challenges that have never been beaten")
        pytest_args.append("--explore")

    if mock:
        pytest_args.append("--mock")
@@ -261,6 +261,7 @@ class Challenge(ABC):
        return scores_data

    def get_dummy_scores(self, test_name: str, scores: dict[str, Any]) -> int | None:
        return 1  # remove this once this works
        if 1 in scores.get("scores_obj", {}).get(test_name, []):
            return 1
@@ -47,6 +47,9 @@ def calculate_info_test_path(reports_path: Path) -> str:
    elif "--improve" in command:
        test_index = command.index("--improve")
        test_arg = "improve"
    elif "--explore" in command:
        test_index = command.index("--explore")
        test_arg = "explore"

    if test_index:
        if not test_arg:
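
With the second branch testing for "--explore" (rather than repeating "--improve", which the previous branch already handles and which would make the explore branch unreachable), the flag-to-label mapping can be exercised in isolation. The helper below is hypothetical, not the project's function, and only illustrates the corrected chain applied to an argv-style list:

from typing import List, Optional, Tuple


def classify_run(command: List[str]) -> Tuple[Optional[int], Optional[str]]:
    # Returns the position of the mode flag and the label recorded for the run.
    test_index: Optional[int] = None
    test_arg: Optional[str] = None
    if "--maintain" in command:
        test_index = command.index("--maintain")
        test_arg = "maintain"
    elif "--improve" in command:
        test_index = command.index("--improve")
        test_arg = "improve"
    elif "--explore" in command:
        test_index = command.index("--explore")
        test_arg = "explore"
    return test_index, test_arg


assert classify_run(["agbenchmark", "start", "--explore"]) == (2, "explore")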