import glob
import importlib
import json
import os
import sys
import types
from collections import deque
from pathlib import Path
from typing import Any, Dict, Optional

import pytest

from agbenchmark.utils.challenge import Challenge
from agbenchmark.utils.data_types import AgentBenchmarkConfig, ChallengeData

DATA_CATEGORY = {}


def create_single_test(
    data: Dict[str, Any] | ChallengeData,
    challenge_location: str,
    file_datum: Optional[list[dict[str, Any]]] = None,
) -> None:
    """Create a Challenge subclass for a single data.json entry and register it
    on this module so pytest can collect it."""
    challenge_data = None
    artifacts_location = None
    if isinstance(data, ChallengeData):
        challenge_data = data
        data = data.get_data()

    DATA_CATEGORY[data["name"]] = data["category"][0]

    # Define the test class dynamically
    challenge_class = types.new_class(data["name"], (Challenge,))
    print(challenge_location)
    # clean_challenge_location = get_test_path(challenge_location)
    setattr(challenge_class, "CHALLENGE_LOCATION", challenge_location)

    setattr(
        challenge_class,
        "ARTIFACTS_LOCATION",
        artifacts_location or str(Path(challenge_location).resolve().parent),
    )

    # Define the test method within the dynamically created class
    @pytest.mark.asyncio
    async def test_method(self, config: Dict[str, Any], request) -> None:  # type: ignore
        test_name = self.data.name

        # Load the record of challenges already beaten, if one exists
        try:
            with open("challenges_already_beaten.json", "r") as f:
                challenges_beaten_in_the_past = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            challenges_beaten_in_the_past = {}
        if request.config.getoption("--explore") and challenges_beaten_in_the_past.get(
            test_name, False
        ):
            return None

        # Skip optional categories
        self.skip_optional_categories(config)

        from helicone.lock import HeliconeLockManager

        if os.environ.get("HELICONE_API_KEY"):
            HeliconeLockManager.write_custom_property("challenge", self.data.name)

        cutoff = self.data.cutoff or 60

        timeout = cutoff
        if "--nc" in sys.argv:
            timeout = 100000
        if "--cutoff" in sys.argv:
            timeout = int(sys.argv[sys.argv.index("--cutoff") + 1])

        await self.setup_challenge(config, timeout)

        scores = self.get_scores(config)
        request.node.answers = (
            scores["answers"] if "--keep-answers" in sys.argv else None
        )
        del scores["answers"]  # remove answers from scores
        request.node.scores = scores  # store scores in request.node
        assert 1 in scores["values"]

    # Parametrize the method here
    test_method = pytest.mark.parametrize(
        "challenge_data",
        [data],
        indirect=True,
    )(test_method)

    setattr(challenge_class, "test_method", test_method)

    # Attach the new class to this module so it can be discovered by pytest
    module = importlib.import_module(__name__)
    setattr(module, data["name"], challenge_class)


def create_single_suite_challenge(challenge_data: ChallengeData, path: Path) -> None:
    create_single_test(challenge_data, str(path))


def create_challenge(
    data: Dict[str, Any],
    json_file: str,
    json_files: deque,
) -> deque:
    path = Path(json_file).resolve()
    print("Creating challenge for", path)

    create_single_test(data, str(path))
    print("Creation complete for", path)

    return json_files


def generate_tests() -> None:  # sourcery skip: invert-any-all
    """Discover every challenges/**/data.json file and generate a test for each,
    honoring the command-line filters (--category, --test, --maintain, --improve)."""
    print("Generating tests...")

    challenges_path = os.path.join(os.path.dirname(__file__), "challenges")
    print(f"Looking for challenges in {challenges_path}...")

    json_files = deque(
        glob.glob(
            f"{challenges_path}/**/data.json",
            recursive=True,
        )
    )

    print(f"Found {len(json_files)} challenges.")
    print(f"Sample path: {json_files[0]}")

    agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
    try:
        with open(agent_benchmark_config_path, "r") as f:
            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
            agent_benchmark_config.agent_benchmark_config_path = (
                agent_benchmark_config_path
            )
    except json.JSONDecodeError:
        print(f"Error: {agent_benchmark_config_path} is not a valid JSON file.")
        raise

    regression_reports_path = agent_benchmark_config.get_regression_reports_path()
    if regression_reports_path and os.path.exists(regression_reports_path):
        with open(regression_reports_path, "r") as f:
            regression_tests = json.load(f)
    else:
        regression_tests = {}

    while json_files:
        # Take and remove the first element from json_files
        json_file = json_files.popleft()

        if challenge_should_be_ignored(json_file):
            continue

        data = ChallengeData.get_json_from_path(json_file)

        commands = sys.argv

        # --category flag: only keep challenges whose categories overlap the
        # categories passed on the command line
        if "--category" in commands:
            categories = data.get("category", [])
            commands_set = set(commands)
            categories_set = set(categories)

            # If there's no overlap with the command-line arguments, skip it
            if not categories_set.intersection(commands_set):
                continue

        # --test flag: only run the test if it's the exact one specified
        test_flag = "--test" in commands
        if test_flag and data["name"] not in commands:
            continue

        # --maintain and --improve flags: --maintain runs only challenges already in
        # the regression report, --improve runs only those that are not
        in_regression = regression_tests.get(data["name"], None)
        improve_flag = in_regression and "--improve" in commands
        maintain_flag = not in_regression and "--maintain" in commands
        if "--maintain" in commands and maintain_flag:
            continue
        elif "--improve" in commands and improve_flag:
            continue

        json_files = create_challenge(data, json_file, json_files)

        print(f"Generated test for {data['name']}.")
    print("Test generation complete.")


def challenge_should_be_ignored(json_file: str) -> bool:
    return "challenges/deprecated" in json_file or "challenges/library" in json_file


generate_tests()
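
# Usage note (a sketch, not something defined in this module): the classes generated
# above are collected by pytest like any other tests, and the flags referenced in the
# code (--category, --test, --cutoff, --nc, --keep-answers, --explore, --maintain,
# --improve) are assumed to be registered by agbenchmark's accompanying conftest.
# A hypothetical invocation from an agent workspace containing
# agbenchmark_config/config.json might look like:
#
#   pytest -s <path-to-this-module> --category retrieval --cutoff 90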