Mirror of https://github.com/aljazceru/Auto-GPT.git (synced 2025-12-29 03:44:28 +01:00)
Implement the 'explore' mode (#284)
.gitignore (vendored), 1 line changed
@@ -163,3 +163,4 @@ cython_debug/
.DS_Store
```
secrets.json
challenges_already_beaten.json
@@ -94,6 +94,7 @@ def pytest_addoption(parser: Any) -> None:
    parser.addoption("--cutoff", action="store_true", default=False)
    parser.addoption("--improve", action="store_true", default=False)
    parser.addoption("--maintain", action="store_true", default=False)
    parser.addoption("--explore", action="store_true", default=False)
    parser.addoption("--test", action="store_true", default=None)
    parser.addoption("--no_dep", action="store_true", default=False)
    parser.addoption("--suite", action="store_true", default=False)
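
For context on how this new flag is consumed later in the diff, the following is a minimal, hypothetical sketch (not the project's files) of the two halves of the mechanism: a pytest_addoption hook that registers a store_true flag, and a test that reads it back through the built-in request fixture via request.config.getoption, which is the call the explore mode relies on.

from typing import Any


# conftest.py (sketch): register the flag; pytest stores False unless it is passed.
def pytest_addoption(parser: Any) -> None:
    parser.addoption("--explore", action="store_true", default=False)


# test_example.py (sketch): any test can read the parsed value back through the
# built-in `request` fixture.
def test_reads_flag(request: Any) -> None:
    explore_mode = request.config.getoption("--explore")
    assert explore_mode in (True, False)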
@@ -159,7 +160,12 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
    except Exception as e:
        pass

    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
    flags = (
        "--test" in sys.argv
        or "--maintain" in sys.argv
        or "--improve" in sys.argv
        or "--explore" in sys.argv
    )

    if call.when == "call":
        # if it's a same task suite, we combine the report.
@@ -1,5 +1,6 @@
import glob
import importlib
import json
import os
import sys
import types
@@ -97,7 +98,23 @@ def create_single_test(

    # Define test method within the dynamically created class
    def test_method(self, config: Dict[str, Any], request) -> None:  # type: ignore
        # create a random number between 0 and 1
        test_name = self.data.name

        try:
            with open("challenges_already_beaten.json", "r") as f:
                challenges_beaten_in_the_past = json.load(f)
        except:
            challenges_beaten_in_the_past = {}

        if request.config.getoption("--explore") and challenges_beaten_in_the_past.get(
            test_name, False
        ):
            return None

        # skip optional categories
        self.skip_optional_categories(config)

        from helicone.lock import HeliconeLockManager

        if os.environ.get("HELICONE_API_KEY"):
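
The skip decision above depends only on the shape of challenges_already_beaten.json, which (per the writer further down in this diff) is a flat JSON object mapping test names to booleans. A minimal sketch of that check in isolation, using made-up test names purely for illustration:

import json

# Hypothetical contents of challenges_already_beaten.json (illustrative names only).
challenges_beaten_in_the_past = json.loads(
    '{"TestWriteFile": true, "TestDebugSimpleTypoWithGuidance": false}'
)


def should_skip(test_name: str, explore: bool) -> bool:
    # In explore mode, anything already recorded as beaten is skipped;
    # unknown or never-beaten challenges (missing key or false) still run.
    return explore and challenges_beaten_in_the_past.get(test_name, False)


assert should_skip("TestWriteFile", explore=True) is True
assert should_skip("TestDebugSimpleTypoWithGuidance", explore=True) is False
assert should_skip("TestWriteFile", explore=False) is False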
@@ -108,6 +125,7 @@ def create_single_test(

        scores = self.get_scores(config)
        request.node.scores = scores  # store scores in request.node

        assert 1 in scores["values"]

    # Parametrize the method here
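
For context on the assertion just above: get_scores appears to return a mapping with at least a "values" list (and, per get_dummy_scores further down in this diff, a "scores_obj" mapping), so the check passes as soon as any sub-score equals 1. A hypothetical illustration of that shape, not taken from the project:

# Hypothetical shape of the scores mapping (illustrative values only).
scores = {
    "values": [0, 1],
    "scores_obj": {"TestExample": [0, 1]},
}

# The test passes as soon as any recorded value is a full score of 1.
assert 1 in scores["values"]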
@@ -2,7 +2,7 @@ import json
import os
import sys
from pathlib import Path
from typing import Any
from typing import Any, Dict

from agbenchmark.agent_interface import MOCK_FLAG
from agbenchmark.reports.ReportManager import ReportManager
@@ -144,7 +144,11 @@ def update_regression_tests(
def generate_single_call_report(
    item: Any, call: Any, challenge_data: dict[str, Any]
) -> None:
    difficulty = challenge_data["info"]["difficulty"]

    try:
        difficulty = challenge_data["info"]["difficulty"]
    except KeyError:
        return None

    if isinstance(difficulty, DifficultyLevel):
        difficulty = difficulty.value
@@ -222,9 +226,33 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:

    info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]

    update_challenges_already_beaten(info_details, test_name)
    if info_details.get("tests") is not None:
        for nested_test_name, nested_test_info in info_details["tests"].items():
            update_challenges_already_beaten(nested_test_info, nested_test_name)

    info_manager.add_test(test_name, info_details)


def update_challenges_already_beaten(
    info_details: Dict[str, Any], test_name: str
) -> None:
    current_run_successful = info_details["metrics"]["success"]
    try:
        with open("challenges_already_beaten.json", "r") as f:
            challenge_data = json.load(f)
    except:
        challenge_data = {}
    challenge_beaten_in_the_past = challenge_data.get(test_name)

    challenge_data[test_name] = True
    if challenge_beaten_in_the_past is None and not current_run_successful:
        challenge_data[test_name] = False

    with open("challenges_already_beaten.json", "w") as f:
        json.dump(challenge_data, f, indent=4)


def generate_separate_suite_reports(suite_reports: dict) -> None:
    for prefix, suite_file_datum in suite_reports.items():
        successes = []
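
The bookkeeping rule in update_challenges_already_beaten can be summarized without the file I/O: a challenge is recorded as beaten unless this is the first time it appears and the current run failed. Below is a hypothetical in-memory sketch of that rule (the helper name and test name are made up), with two representative cases:

from typing import Dict, Optional


def record_result(history: Dict[str, bool], test_name: str, success: bool) -> None:
    # Mirrors the update logic above, minus the JSON file round-trip.
    seen_before: Optional[bool] = history.get(test_name)
    history[test_name] = True
    if seen_before is None and not success:
        history[test_name] = False


history: Dict[str, bool] = {}
record_result(history, "TestExample", success=False)  # first attempt fails
assert history == {"TestExample": False}
record_result(history, "TestExample", success=True)  # later attempt succeeds
assert history == {"TestExample": True}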
@@ -85,6 +85,11 @@ def cli() -> None:
@click.option("--test", default=None, help="Specific test to run")
@click.option("--maintain", is_flag=True, help="Runs only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option(
    "--explore",
    is_flag=True,
    help="Only attempt challenges that have never been beaten",
)
@click.option("--mock", is_flag=True, help="Run with mock")
@click.option("--suite", default=None, help="Run a suite of related tests")
@click.option(
@@ -100,6 +105,7 @@ def start(
    test: str,
    maintain: bool,
    improve: bool,
    explore: bool,
    mock: bool,
    suite: str,
    no_dep: bool,
@@ -109,13 +115,13 @@ def start(
    """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
    # Check if configuration file exists and is not empty

    if maintain and improve:
    if int(maintain) + int(improve) + int(explore) > 1:
        print(
            "Error: You can't use both --maintain and --improve at the same time. Please choose one."
            "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
        )
        return 1

    if test and (category or skip_category or maintain or improve or suite):
    if test and (category or skip_category or maintain or improve or suite or explore):
        print(
            "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
        )
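
Because click passes is_flag options through as plain booleans, summing their int() values is a compact way to enforce that at most one mode flag is set (int(True) == 1, int(False) == 0). The following is a standalone toy command illustrating the same guard; it is not the benchmark's actual CLI, and the command and option names are only examples:

import click


@click.command()
@click.option("--maintain", is_flag=True)
@click.option("--improve", is_flag=True)
@click.option("--explore", is_flag=True)
def run(maintain: bool, improve: bool, explore: bool) -> None:
    # The sum counts how many mutually exclusive mode flags were passed.
    if int(maintain) + int(improve) + int(explore) > 1:
        raise click.UsageError("Pass at most one of --maintain, --improve, --explore.")
    click.echo(f"maintain={maintain} improve={improve} explore={explore}")


if __name__ == "__main__":
    run()  # e.g. `python toy_cli.py --explore`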
@@ -123,7 +129,7 @@ def start(

    # TODO: test and ensure that this functionality works before removing
    # change elif suite below if removing
    if suite and (category or skip_category or maintain or improve):
    if suite and (category or skip_category or maintain or improve or explore):
        print(
            "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite."
        )
@@ -193,6 +199,9 @@ def start(
    elif improve:
        print("Running only non-regression tests")
        pytest_args.append("--improve")
    elif explore:
        print("Only attempt challenges that have never been beaten")
        pytest_args.append("--explore")

    if mock:
        pytest_args.append("--mock")
@@ -261,6 +261,7 @@ class Challenge(ABC):
        return scores_data

    def get_dummy_scores(self, test_name: str, scores: dict[str, Any]) -> int | None:
        return 1  # remove this once this works
        if 1 in scores.get("scores_obj", {}).get(test_name, []):
            return 1
@@ -47,6 +47,9 @@ def calculate_info_test_path(reports_path: Path) -> str:
    elif "--improve" in command:
        test_index = command.index("--improve")
        test_arg = "improve"
    elif "--explore" in command:
        test_index = command.index("--explore")
        test_arg = "explore"

    if test_index:
        if not test_arg:
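
With the second branch testing for "--explore" (rather than repeating "--improve", which the previous branch already handles and which would make the explore branch unreachable), the flag-to-label mapping can be exercised in isolation. The helper below is hypothetical, not the project's function, and only illustrates the corrected chain applied to an argv-style list:

from typing import List, Optional, Tuple


def classify_run(command: List[str]) -> Tuple[Optional[int], Optional[str]]:
    # Returns the position of the mode flag and the label recorded for the run.
    test_index: Optional[int] = None
    test_arg: Optional[str] = None
    if "--maintain" in command:
        test_index = command.index("--maintain")
        test_arg = "maintain"
    elif "--improve" in command:
        test_index = command.index("--improve")
        test_arg = "improve"
    elif "--explore" in command:
        test_index = command.index("--explore")
        test_arg = "explore"
    return test_index, test_arg


assert classify_run(["agbenchmark", "start", "--explore"]) == (2, "explore")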