mirror of
https://github.com/aljazceru/Auto-GPT.git
synced 2025-12-17 05:54:26 +01:00
Fixing benchmarks
@@ -1,303 +0,0 @@
import glob
import importlib
import json
import os
import sys
import types
from collections import deque
from pathlib import Path
from typing import Any, Callable, Dict, Optional

import pytest

import agbenchmark.start_benchmark
from agbenchmark.utils.challenge import Challenge
from agbenchmark.utils.data_types import ChallengeData, SuiteConfig
from agbenchmark.utils.utils import get_test_path

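# generate_tests() below globs every challenge data.json, builds a test class for
# each one at import time, and attaches it to this module so pytest can collect
# it; no hand-written test files are involved.

# Maps each generated test's name to its primary category (first entry of the
# challenge's "category" list); assumed to be read by agbenchmark's reporting code.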
DATA_CATEGORY = {}


def setup_dummy_dependencies(
    file_datum: list[dict[str, Any]],
    challenge_class: Any,
    challenge_data: ChallengeData,
) -> None:
    """Sets up the dependencies if it's a suite. Creates tests that pass
    based on the main test run."""

    def create_test_func(test_name: str) -> Callable[[Any, dict[str, Any]], None]:
        # Factory that returns a dummy test for one suite member: it only reads
        # the member's score from the main test's results.
        def setup_dependency_test(self: Any, scores: dict[str, Any]) -> None:
            scores = self.get_dummy_scores(test_name, scores)
            assert scores == 1

        return setup_dependency_test

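    # Each suite member gets a stand-in test. The depends marker (pytest-depends)
    # ties it to the main suite test by name, and parametrize(..., indirect=True)
    # routes the unused challenge_data argument through the challenge_data fixture
    # (assumed to be defined in the package's conftest).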
    for datum in file_datum:
        DATA_CATEGORY[datum["name"]] = challenge_data.category[0]
        test_func = create_test_func(datum["name"])
        # TODO: replace this once I figure out actual dependencies
        test_func = pytest.mark.depends(on=[challenge_data.name], name=datum["name"])(
            test_func
        )
        test_func = pytest.mark.parametrize(
            "challenge_data",
            [None],
            indirect=True,
        )(test_func)

        # Add category markers
        for category in challenge_data.category:
            test_func = getattr(pytest.mark, category)(test_func)

        test_func = pytest.mark.usefixtures("scores")(test_func)
        setattr(challenge_class, f"test_{datum['name']}", test_func)


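# Rough shape of a challenge's data.json as consumed in this module (illustrative;
# only the fields accessed here are shown, real files carry more):
# {
#     "name": "TestSomeChallenge",
#     "category": ["interface"],
#     "dependencies": [],
#     "cutoff": 60
# }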
def create_single_test(
    data: Dict[str, Any] | ChallengeData,
    challenge_location: str,
    file_datum: Optional[list[dict[str, Any]]] = None,
) -> None:
    challenge_data = None
    artifacts_location = None
    if isinstance(data, ChallengeData):
        challenge_data = data
        data = data.get_data()

    DATA_CATEGORY[data["name"]] = data["category"][0]

    # Define test class dynamically
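    # types.new_class creates a Challenge subclass named after the challenge's
    # "name" field; pytest only sees it once it is attached to this module at the
    # end of this function.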
    challenge_class = types.new_class(data["name"], (Challenge,))

    clean_challenge_location = get_test_path(challenge_location)
    setattr(challenge_class, "CHALLENGE_LOCATION", clean_challenge_location)

    # in the case of a suite
    if isinstance(challenge_data, ChallengeData):
        if file_datum:  # same task suite
            setup_dummy_dependencies(file_datum, challenge_class, challenge_data)

        artifacts_location = str(Path(challenge_location).resolve())
        if "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv:
            artifacts_location = str(Path(challenge_location).resolve().parent.parent)
        setattr(
            challenge_class,
            "_data_cache",
            {clean_challenge_location: challenge_data},
        )

    setattr(
        challenge_class,
        "ARTIFACTS_LOCATION",
        artifacts_location or str(Path(challenge_location).resolve().parent),
    )

    # Define the test method within the dynamically created class
    @pytest.mark.asyncio
    async def test_method(self, config: Dict[str, Any], request) -> None:  # type: ignore
        test_name = self.data.name

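        # challenges_already_beaten.json records challenges that passed in earlier
        # runs; under --explore those are skipped so the run focuses on unbeaten
        # ones. A missing or unreadable file simply means nothing is recorded yet.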
        try:
            with open("challenges_already_beaten.json", "r") as f:
                challenges_beaten_in_the_past = json.load(f)
        except Exception:
            challenges_beaten_in_the_past = {}

        if request.config.getoption("--explore") and challenges_beaten_in_the_past.get(
            test_name, False
        ):
            return None

        # skip optional categories
        self.skip_optional_categories(config)

        from helicone.lock import HeliconeLockManager

        if os.environ.get("HELICONE_API_KEY"):
            HeliconeLockManager.write_custom_property("challenge", self.data.name)

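        # Each challenge may declare its own cutoff (60 if unset); --nc effectively
        # removes the limit by using a huge timeout, and --cutoff overrides it with
        # the value that follows the flag on the command line.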
        cutoff = self.data.cutoff or 60

        timeout = cutoff
        if "--nc" in sys.argv:
            timeout = 100000
        if "--cutoff" in sys.argv:
            timeout = int(sys.argv[sys.argv.index("--cutoff") + 1])

        await self.setup_challenge(config, timeout)

        scores = self.get_scores(config)
        request.node.scores = scores  # store scores in request.node
        assert 1 in scores["values"]

    # Parametrize the method here
    test_method = pytest.mark.parametrize(
        "challenge_data",
        [data],
        indirect=True,
    )(test_method)

    setattr(challenge_class, "test_method", test_method)

    # Attach the new class to this module so it can be discovered by pytest
    module = importlib.import_module(__name__)
    setattr(module, data["name"], challenge_class)


def create_single_suite_challenge(challenge_data: ChallengeData, path: Path) -> None:
    create_single_test(challenge_data, str(path))


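# Turns one data.json (plus, for suites, its sibling files) into generated tests
# and returns the working deque so the caller knows which files were consumed.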
def create_challenge(
    data: Dict[str, Any],
    json_file: str,
    suite_config: SuiteConfig | None,
    json_files: deque,
) -> deque:
    path = Path(json_file).resolve()
    if suite_config is not None:
        grandparent_dir = path.parent.parent

        # if it's a single test run, we don't care about the suite
        if "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv:
            challenge_data = suite_config.challenge_from_test_data(data)
            create_single_suite_challenge(challenge_data, path)
            return json_files

        # Get all data.json files within the grandparent directory
        suite_files = suite_config.get_data_paths(grandparent_dir)

        # Drop the suite's data.json files (and the current one) from the remaining queue
        json_files = deque(
            file
            for file in json_files
            if file not in suite_files
            and Path(file).resolve() != Path(json_file).resolve()
        )

        suite_file_datum = [
            ChallengeData.get_json_from_path(suite_file)
            for suite_file in suite_files
            if suite_file != json_file
        ]

        file_datum = [data, *suite_file_datum]

        if suite_config.same_task:
            challenge_data = suite_config.challenge_from_datum(file_datum)

            create_single_test(
                challenge_data, str(grandparent_dir), file_datum=file_datum
            )
        else:
            reverse = suite_config.reverse_order

            # TODO: reversing doesn't work, for the same reason the ordering of dummy tests doesn't work
            if reverse:
                paired_data = list(reversed(list(zip(file_datum, suite_files))))
            else:
                paired_data = list(zip(file_datum, suite_files))

            for file_data, file_path in paired_data:
                # if we're running in reverse, we don't want dependencies to get in the way
                if reverse:
                    file_data["dependencies"] = []
                create_single_test(file_data, file_path)

    else:
        create_single_test(data, str(path))

    return json_files


# if there are any suite.json files with that prefix


def generate_tests() -> None:  # sourcery skip: invert-any-all
    print("Generating tests...")

    json_files = deque(
        glob.glob(
            f"{agbenchmark.start_benchmark.CHALLENGES_PATH}/**/data.json",
            recursive=True,
        )
    )
    regression_tests = agbenchmark.start_benchmark.get_regression_data()

    # A deque is used so that suite handling can tell which files have already
    # been consumed while generating tests.
    # Dynamic class creation
    while json_files:
        json_file = (
            json_files.popleft()
        )  # Take and remove the first element from json_files
        if challenge_should_be_ignored(json_file):
            continue
        data = ChallengeData.get_json_from_path(json_file)
        suite_config = SuiteConfig.suite_data_if_suite(Path(json_file))

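        # Flags are read straight from sys.argv rather than from a parsed-args
        # object; e.g. an invocation like
        #   agbenchmark --category interface --improve
        # (illustrative command line; entry-point name assumed) drives the filters
        # below.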
        commands = sys.argv
        # --category flag
        if "--category" in commands:
            categories = data.get("category", [])
            commands_set = set(commands)

            # Add the shared category if the conditions are met
            if suite_config and suite_config.same_task:
                # the same_task == False case is handled in data_types
                categories += suite_config.shared_category  # type: ignore

            # Convert the combined list to a set
            categories_set = set(categories)

            # If there's no overlap with commands, skip this challenge
            if not categories_set.intersection(commands_set):
                continue

        # --test flag: only run the test if it's the exact one specified
        test_flag = "--test" in commands
        if test_flag and data["name"] not in commands:
            continue

        # --maintain and --improve flags
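        # --maintain runs only challenges already in the regression list, while
        # --improve runs only those not yet in it; maintain_flag / improve_flag are
        # True when the current challenge should be skipped under the given flag.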
        in_regression = regression_tests.get(data["name"], None)
        improve_flag = in_regression and "--improve" in commands
        maintain_flag = not in_regression and "--maintain" in commands
        if "--maintain" in commands and maintain_flag:
            continue
        elif "--improve" in commands and improve_flag:
            continue

        # --suite flag
        if "--suite" in commands:
            if not suite_config:
                # not a test from a suite
                continue
            elif not any(command in data["name"] for command in commands):
                continue

            # elif (
            #     not any(command in data["name"] for command in commands)
            #     and suite_config.prefix not in data["name"]
            # ):
            #     # a part of the suite but not the one specified
            #     continue

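        # create_challenge may fold a suite's sibling data.json files into a single
        # generated test and removes them from the deque, so the (possibly smaller)
        # deque is threaded back in here to avoid generating those siblings again.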
        json_files = create_challenge(data, json_file, suite_config, json_files)

        if suite_config and not (test_flag or maintain_flag or improve_flag):
            print(f"Generated suite for {suite_config.prefix}.")
        else:
            print(f"Generated test for {data['name']}.")


def challenge_should_be_ignored(json_file: str) -> bool:
    return "challenges/deprecated" in json_file or "challenges/library" in json_file


generate_tests()