Implement the 'explore' mode (#284)

merwanehamadi
2023-08-09 17:59:48 -07:00
committed by GitHub
parent 582c3e06a4
commit 1b20e45ec1
7 changed files with 73 additions and 7 deletions
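
This change adds an `--explore` run mode to the benchmark: when the flag is set, only challenges that have never been beaten are attempted, tracked per test in a new (git-ignored) `challenges_already_beaten.json` file.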

.gitignore
View File

@@ -163,3 +163,4 @@ cython_debug/
 .DS_Store
 secrets.json
+challenges_already_beaten.json

View File

@@ -94,6 +94,7 @@ def pytest_addoption(parser: Any) -> None:
parser.addoption("--cutoff", action="store_true", default=False)
parser.addoption("--improve", action="store_true", default=False)
parser.addoption("--maintain", action="store_true", default=False)
parser.addoption("--explore", action="store_true", default=False)
parser.addoption("--test", action="store_true", default=None)
parser.addoption("--no_dep", action="store_true", default=False)
parser.addoption("--suite", action="store_true", default=False)
@@ -159,7 +160,12 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
     except Exception as e:
         pass
-    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
+    flags = (
+        "--test" in sys.argv
+        or "--maintain" in sys.argv
+        or "--improve" in sys.argv
+        or "--explore" in sys.argv
+    )
     if call.when == "call":
         # if it's a same task suite, we combine the report.
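
Taken together, these two hunks register the new pytest flag and fold it into the same mode detection already used by `--test`, `--maintain`, and `--improve`. A minimal standalone sketch of that pattern (the helper name is hypothetical; the real logic lives in the benchmark's conftest):

```python
import sys
from typing import Any


def pytest_addoption(parser: Any) -> None:
    # Register --explore alongside the existing mode flags.
    parser.addoption("--explore", action="store_true", default=False)


def mode_flag_present() -> bool:
    # Mirrors the rewritten `flags = (...)` expression above: any one of
    # the mode-selecting flags marks this run as a single-mode run.
    return any(
        flag in sys.argv
        for flag in ("--test", "--maintain", "--improve", "--explore")
    )
```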

View File

@@ -1,5 +1,6 @@
 import glob
 import importlib
+import json
 import os
 import sys
 import types
@@ -97,7 +98,23 @@ def create_single_test(
     # Define test method within the dynamically created class
     def test_method(self, config: Dict[str, Any], request) -> None:  # type: ignore
         # create a random number between 0 and 1
+        test_name = self.data.name
+        try:
+            with open("challenges_already_beaten.json", "r") as f:
+                challenges_beaten_in_the_past = json.load(f)
+        except:
+            challenges_beaten_in_the_past = {}
+        if request.config.getoption("--explore") and challenges_beaten_in_the_past.get(
+            test_name, False
+        ):
+            return None
+
         # skip optional categories
         self.skip_optional_categories(config)
         from helicone.lock import HeliconeLockManager

         if os.environ.get("HELICONE_API_KEY"):
@@ -108,6 +125,7 @@ def create_single_test(
         scores = self.get_scores(config)
         request.node.scores = scores  # store scores in request.node
         assert 1 in scores["values"]

     # Parametrize the method here
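
The early return added to `test_method` is the core of explore mode: when `--explore` is set and the challenge is already recorded as beaten, the test exits before the agent run even starts. The same decision, pulled out into plain functions (a sketch; `should_skip_in_explore_mode` is not part of this commit):

```python
import json


def load_beaten_challenges(path: str = "challenges_already_beaten.json") -> dict:
    # A missing or unreadable file means "nothing beaten yet", matching
    # the broad except in the hunk above.
    try:
        with open(path, "r") as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        return {}


def should_skip_in_explore_mode(explore: bool, test_name: str) -> bool:
    # Skip only when --explore is active AND the challenge has a True
    # entry recorded from a past run.
    return explore and load_beaten_challenges().get(test_name, False)
```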

View File

@@ -2,7 +2,7 @@ import json
 import os
 import sys
 from pathlib import Path
-from typing import Any
+from typing import Any, Dict

 from agbenchmark.agent_interface import MOCK_FLAG
 from agbenchmark.reports.ReportManager import ReportManager
@@ -144,7 +144,11 @@ def update_regression_tests(
 def generate_single_call_report(
     item: Any, call: Any, challenge_data: dict[str, Any]
 ) -> None:
-    difficulty = challenge_data["info"]["difficulty"]
+    try:
+        difficulty = challenge_data["info"]["difficulty"]
+    except KeyError:
+        return None

     if isinstance(difficulty, DifficultyLevel):
         difficulty = difficulty.value
@@ -222,9 +226,33 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:
info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]
update_challenges_already_beaten(info_details, test_name)
if info_details.get("tests") is not None:
for nested_test_name, nested_test_info in info_details["tests"].items():
update_challenges_already_beaten(nested_test_info, nested_test_name)
info_manager.add_test(test_name, info_details)
def update_challenges_already_beaten(
info_details: Dict[str, Any], test_name: str
) -> None:
current_run_successful = info_details["metrics"]["success"]
try:
with open("challenges_already_beaten.json", "r") as f:
challenge_data = json.load(f)
except:
challenge_data = {}
challenge_beaten_in_the_past = challenge_data.get(test_name)
challenge_data[test_name] = True
if challenge_beaten_in_the_past is None and not current_run_successful:
challenge_data[test_name] = False
with open("challenges_already_beaten.json", "w") as f:
json.dump(challenge_data, f, indent=4)
def generate_separate_suite_reports(suite_reports: dict) -> None:
for prefix, suite_file_datum in suite_reports.items():
successes = []
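
The update rule in `update_challenges_already_beaten` reads most easily as a transition table: a challenge with no existing entry is recorded as `True` on success and `False` on failure, while a challenge that already has an entry (either value) is set back to `True` on its next recorded run, whatever the result. A pure-function sketch of that rule (`next_state` is a hypothetical helper, separate from the file-backed function above):

```python
from typing import Optional


def next_state(previous: Optional[bool], success: bool) -> bool:
    # Pure version of the update rule: only a never-seen challenge that
    # fails is recorded as False; every other combination records True.
    if previous is None and not success:
        return False
    return True


# Print the transition table implied by the diff.
for previous in (None, False, True):
    for success in (False, True):
        print(f"previous={previous!s:<5} success={success!s:<5} -> {next_state(previous, success)}")
```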

View File

@@ -85,6 +85,11 @@ def cli() -> None:
@click.option("--test", default=None, help="Specific test to run")
@click.option("--maintain", is_flag=True, help="Runs only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option(
"--explore",
is_flag=True,
help="Only attempt challenges that have never been beaten",
)
@click.option("--mock", is_flag=True, help="Run with mock")
@click.option("--suite", default=None, help="Run a suite of related tests")
@click.option(
@@ -100,6 +105,7 @@ def start(
     test: str,
     maintain: bool,
     improve: bool,
+    explore: bool,
     mock: bool,
     suite: str,
     no_dep: bool,
@@ -109,13 +115,13 @@ def start(
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
# Check if configuration file exists and is not empty
if maintain and improve:
if int(maintain) + int(improve) + int(explore) > 1:
print(
"Error: You can't use both --maintain and --improve at the same time. Please choose one."
"Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
)
return 1
if test and (category or skip_category or maintain or improve or suite):
if test and (category or skip_category or maintain or improve or suite or explore):
print(
"Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
)
@@ -123,7 +129,7 @@ def start(
     # TODO: test and ensure that this functionality works before removing
     # change elif suite below if removing
-    if suite and (category or skip_category or maintain or improve):
+    if suite and (category or skip_category or maintain or improve or explore):
         print(
             "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite."
         )
@@ -193,6 +199,9 @@ def start(
     elif improve:
         print("Running only non-regression tests")
         pytest_args.append("--improve")
+    elif explore:
+        print("Only attempt challenges that have never been beaten")
+        pytest_args.append("--explore")

     if mock:
         pytest_args.append("--mock")

View File

@@ -261,6 +261,7 @@ class Challenge(ABC):
         return scores_data

     def get_dummy_scores(self, test_name: str, scores: dict[str, Any]) -> int | None:
+        return 1  # remove this once this works
         if 1 in scores.get("scores_obj", {}).get(test_name, []):
             return 1

View File

@@ -47,6 +47,9 @@ def calculate_info_test_path(reports_path: Path) -> str:
elif "--improve" in command:
test_index = command.index("--improve")
test_arg = "improve"
elif "--improve" in command:
test_index = command.index("--explore")
test_arg = "explore"
if test_index:
if not test_arg:
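
This last hunk extends the flag scan that names the report directory. A table-driven equivalent of the growing elif chain (a sketch; `find_mode_flag` and the exact flag order are assumptions, not part of the commit):

```python
from typing import Optional, Tuple

MODE_FLAGS = ("--test", "--maintain", "--improve", "--explore")


def find_mode_flag(command: list[str]) -> Tuple[Optional[int], Optional[str]]:
    # Return the index of the first mode flag present in the command
    # line and its bare name, or (None, None) when no mode was given.
    for flag in MODE_FLAGS:
        if flag in command:
            return command.index(flag), flag.lstrip("-")
    return None, None


print(find_mode_flag(["agbenchmark", "start", "--explore"]))  # (2, 'explore')
```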