mirror of
https://github.com/aljazceru/Auto-GPT.git
synced 2025-12-17 05:54:26 +01:00
Fixing benchmarks
@@ -1,303 +0,0 @@
import glob
import importlib
import json
import os
import sys
import types
from collections import deque
from pathlib import Path
from typing import Any, Callable, Dict, Optional

import pytest

import agbenchmark.start_benchmark
from agbenchmark.utils.challenge import Challenge
from agbenchmark.utils.data_types import ChallengeData, SuiteConfig
from agbenchmark.utils.utils import get_test_path

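# generate_tests() below globs every challenge data.json, builds a test class for
# each one at import time, and attaches it to this module so pytest can collect
# it; no hand-written test files are involved.

# Maps each generated test's name to its primary category (first entry of the
# challenge's "category" list); assumed to be read by agbenchmark's reporting code.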
DATA_CATEGORY = {}


def setup_dummy_dependencies(
    file_datum: list[dict[str, Any]],
    challenge_class: Any,
    challenge_data: ChallengeData,
) -> None:
    """Sets up the dependencies if it's a suite. Creates tests that pass
    based on the main test run."""

    def create_test_func(test_name: str) -> Callable[[Any, dict[str, Any]], None]:
        # Factory that returns a dummy test for one suite member: it only reads
        # the member's score from the main test's results.
        def setup_dependency_test(self: Any, scores: dict[str, Any]) -> None:
            scores = self.get_dummy_scores(test_name, scores)
            assert scores == 1

        return setup_dependency_test

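    # Each suite member gets a stand-in test. The depends marker (pytest-depends)
    # ties it to the main suite test by name, and parametrize(..., indirect=True)
    # routes the unused challenge_data argument through the challenge_data fixture
    # (assumed to be defined in the package's conftest).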
    for datum in file_datum:
        DATA_CATEGORY[datum["name"]] = challenge_data.category[0]
        test_func = create_test_func(datum["name"])
        # TODO: replace this once I figure out actual dependencies
        test_func = pytest.mark.depends(on=[challenge_data.name], name=datum["name"])(
            test_func
        )
        test_func = pytest.mark.parametrize(
            "challenge_data",
            [None],
            indirect=True,
        )(test_func)

        # Add category markers
        for category in challenge_data.category:
            test_func = getattr(pytest.mark, category)(test_func)

        test_func = pytest.mark.usefixtures("scores")(test_func)
        setattr(challenge_class, f"test_{datum['name']}", test_func)


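# Rough shape of a challenge's data.json as consumed in this module (illustrative;
# only the fields accessed here are shown, real files carry more):
# {
#     "name": "TestSomeChallenge",
#     "category": ["interface"],
#     "dependencies": [],
#     "cutoff": 60
# }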
def create_single_test(
    data: Dict[str, Any] | ChallengeData,
    challenge_location: str,
    file_datum: Optional[list[dict[str, Any]]] = None,
) -> None:
    challenge_data = None
    artifacts_location = None
    if isinstance(data, ChallengeData):
        challenge_data = data
        data = data.get_data()

    DATA_CATEGORY[data["name"]] = data["category"][0]

    # Define test class dynamically
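    # types.new_class creates a Challenge subclass named after the challenge's
    # "name" field; pytest only sees it once it is attached to this module at the
    # end of this function.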
    challenge_class = types.new_class(data["name"], (Challenge,))

    clean_challenge_location = get_test_path(challenge_location)
    setattr(challenge_class, "CHALLENGE_LOCATION", clean_challenge_location)

    # in the case of a suite
    if isinstance(challenge_data, ChallengeData):
        if file_datum:  # same task suite
            setup_dummy_dependencies(file_datum, challenge_class, challenge_data)

        artifacts_location = str(Path(challenge_location).resolve())
        if "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv:
            artifacts_location = str(Path(challenge_location).resolve().parent.parent)
        setattr(
            challenge_class,
            "_data_cache",
            {clean_challenge_location: challenge_data},
        )

    setattr(
        challenge_class,
        "ARTIFACTS_LOCATION",
        artifacts_location or str(Path(challenge_location).resolve().parent),
    )

    # Define the test method within the dynamically created class
    @pytest.mark.asyncio
    async def test_method(self, config: Dict[str, Any], request) -> None:  # type: ignore
        test_name = self.data.name

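        # challenges_already_beaten.json records challenges that passed in earlier
        # runs; under --explore those are skipped so the run focuses on unbeaten
        # ones. A missing or unreadable file simply means nothing is recorded yet.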
        try:
            with open("challenges_already_beaten.json", "r") as f:
                challenges_beaten_in_the_past = json.load(f)
        except Exception:
            challenges_beaten_in_the_past = {}

        if request.config.getoption("--explore") and challenges_beaten_in_the_past.get(
            test_name, False
        ):
            return None

        # skip optional categories
        self.skip_optional_categories(config)

        from helicone.lock import HeliconeLockManager

        if os.environ.get("HELICONE_API_KEY"):
            HeliconeLockManager.write_custom_property("challenge", self.data.name)

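        # Each challenge may declare its own cutoff (60 if unset); --nc effectively
        # removes the limit by using a huge timeout, and --cutoff overrides it with
        # the value that follows the flag on the command line.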
        cutoff = self.data.cutoff or 60

        timeout = cutoff
        if "--nc" in sys.argv:
            timeout = 100000
        if "--cutoff" in sys.argv:
            timeout = int(sys.argv[sys.argv.index("--cutoff") + 1])

        await self.setup_challenge(config, timeout)

        scores = self.get_scores(config)
        request.node.scores = scores  # store scores in request.node
        assert 1 in scores["values"]

    # Parametrize the method here
    test_method = pytest.mark.parametrize(
        "challenge_data",
        [data],
        indirect=True,
    )(test_method)

    setattr(challenge_class, "test_method", test_method)

    # Attach the new class to this module so it can be discovered by pytest
    module = importlib.import_module(__name__)
    setattr(module, data["name"], challenge_class)


def create_single_suite_challenge(challenge_data: ChallengeData, path: Path) -> None:
    create_single_test(challenge_data, str(path))


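# Turns one data.json (plus, for suites, its sibling files) into generated tests
# and returns the working deque so the caller knows which files were consumed.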
def create_challenge(
    data: Dict[str, Any],
    json_file: str,
    suite_config: SuiteConfig | None,
    json_files: deque,
) -> deque:
    path = Path(json_file).resolve()
    if suite_config is not None:
        grandparent_dir = path.parent.parent

        # if it's a single test run, we don't care about the suite
        if "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv:
            challenge_data = suite_config.challenge_from_test_data(data)
            create_single_suite_challenge(challenge_data, path)
            return json_files

        # Get all data.json files within the grandparent directory
        suite_files = suite_config.get_data_paths(grandparent_dir)

        # Drop the suite's data.json files (and the current one) from the remaining queue
        json_files = deque(
            file
            for file in json_files
            if file not in suite_files
            and Path(file).resolve() != Path(json_file).resolve()
        )

        suite_file_datum = [
            ChallengeData.get_json_from_path(suite_file)
            for suite_file in suite_files
            if suite_file != json_file
        ]

        file_datum = [data, *suite_file_datum]

        if suite_config.same_task:
            challenge_data = suite_config.challenge_from_datum(file_datum)

            create_single_test(
                challenge_data, str(grandparent_dir), file_datum=file_datum
            )
        else:
            reverse = suite_config.reverse_order

            # TODO: reversing doesn't work, for the same reason the ordering of dummy tests doesn't work
            if reverse:
                paired_data = list(reversed(list(zip(file_datum, suite_files))))
            else:
                paired_data = list(zip(file_datum, suite_files))

            for file_data, file_path in paired_data:
                # if we're running in reverse, we don't want dependencies to get in the way
                if reverse:
                    file_data["dependencies"] = []
                create_single_test(file_data, file_path)

    else:
        create_single_test(data, str(path))

    return json_files


# if there are any suite.json files with that prefix


def generate_tests() -> None:  # sourcery skip: invert-any-all
    print("Generating tests...")

    json_files = deque(
        glob.glob(
            f"{agbenchmark.start_benchmark.CHALLENGES_PATH}/**/data.json",
            recursive=True,
        )
    )
    regression_tests = agbenchmark.start_benchmark.get_regression_data()

    # A deque is used so that suite handling can tell which files have already
    # been consumed while generating tests.
    # Dynamic class creation
    while json_files:
        json_file = (
            json_files.popleft()
        )  # Take and remove the first element from json_files
        if challenge_should_be_ignored(json_file):
            continue
        data = ChallengeData.get_json_from_path(json_file)
        suite_config = SuiteConfig.suite_data_if_suite(Path(json_file))

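        # Flags are read straight from sys.argv rather than from a parsed-args
        # object; e.g. an invocation like
        #   agbenchmark --category interface --improve
        # (illustrative command line; entry-point name assumed) drives the filters
        # below.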
        commands = sys.argv
        # --category flag
        if "--category" in commands:
            categories = data.get("category", [])
            commands_set = set(commands)

            # Add the shared category if the conditions are met
            if suite_config and suite_config.same_task:
                # the same_task == False case is handled in data_types
                categories += suite_config.shared_category  # type: ignore

            # Convert the combined list to a set
            categories_set = set(categories)

            # If there's no overlap with commands, skip this challenge
            if not categories_set.intersection(commands_set):
                continue

        # --test flag: only run the test if it's the exact one specified
        test_flag = "--test" in commands
        if test_flag and data["name"] not in commands:
            continue

        # --maintain and --improve flags
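        # --maintain runs only challenges already in the regression list, while
        # --improve runs only those not yet in it; maintain_flag / improve_flag are
        # True when the current challenge should be skipped under the given flag.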
        in_regression = regression_tests.get(data["name"], None)
        improve_flag = in_regression and "--improve" in commands
        maintain_flag = not in_regression and "--maintain" in commands
        if "--maintain" in commands and maintain_flag:
            continue
        elif "--improve" in commands and improve_flag:
            continue

        # --suite flag
        if "--suite" in commands:
            if not suite_config:
                # not a test from a suite
                continue
            elif not any(command in data["name"] for command in commands):
                continue

            # elif (
            #     not any(command in data["name"] for command in commands)
            #     and suite_config.prefix not in data["name"]
            # ):
            #     # a part of the suite but not the one specified
            #     continue

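        # create_challenge may fold a suite's sibling data.json files into a single
        # generated test and removes them from the deque, so the (possibly smaller)
        # deque is threaded back in here to avoid generating those siblings again.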
        json_files = create_challenge(data, json_file, suite_config, json_files)

        if suite_config and not (test_flag or maintain_flag or improve_flag):
            print(f"Generated suite for {suite_config.prefix}.")
        else:
            print(f"Generated test for {data['name']}.")


def challenge_should_be_ignored(json_file: str) -> bool:
    return "challenges/deprecated" in json_file or "challenges/library" in json_file


generate_tests()