Fixing benchmarks

Author: SwiftyOS
Date: 2023-09-11 17:23:38 +02:00
Committed by: Merwane Hamadi
Parent: bce4bd6755
Commit: c73e90c4e6
273 changed files with 580 additions and 144 deletions

View File

@@ -0,0 +1,5 @@
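# Remote-debugging hook: uncommenting the lines below attaches this process to a
# PyCharm (pydevd) debug server listening on localhost:9739, mirroring stdout and
# stderr to the debugger.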
# import pydevd_pycharm
# pydevd_pycharm.settrace(
# "localhost", port=9739, stdoutToServer=True, stderrToServer=True
# )

View File

@@ -0,0 +1,274 @@
import glob
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
import toml
import click
import pytest
from helicone.lock import HeliconeLockManager
from benchmark.utils.data_types import AgentBenchmarkConfig
BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
if os.environ.get("HELICONE_API_KEY"):
HeliconeLockManager.write_custom_property(
"benchmark_start_time", BENCHMARK_START_TIME
)
with open(
Path(__file__).resolve().parent / "challenges" / "optional_categories.json"
) as f:
OPTIONAL_CATEGORIES = json.load(f)["optional_categories"]
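# Illustrative example (not the actual file contents): if optional_categories.json
# held {"optional_categories": ["coding", "memory"]}, OPTIONAL_CATEGORIES would be
# the list ["coding", "memory"].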
def get_unique_categories() -> set[str]:
"""Find all data.json files in the directory relative to this file and its subdirectories,
read the "category" field from each file, and return a set of unique categories."""
categories = set()
# Get the directory of this file and use it as the base for the glob pattern
this_dir = os.path.dirname(os.path.abspath(__file__))
glob_path = os.path.join(this_dir, "./challenges/**/data.json")
for data_file in glob.glob(glob_path, recursive=True):
with open(data_file, "r") as f:
try:
data = json.load(f)
categories.update(data.get("category", []))
except json.JSONDecodeError:
print(f"Error: {data_file} is not a valid JSON file.")
continue
except IOError:
print(f"IOError: file could not be read: {data_file}")
continue
return categories
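# Illustrative sketch of the input this function expects: each challenge's data.json
# carries a "category" list, e.g. {"name": "TestWriteFile", "category": ["interface"]}
# (hypothetical values). Every such list is folded into the returned set, so a
# category shared by several challenges appears only once.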
def run_benchmark(
agent_benchmark_config_path: str,
maintain: bool = False,
improve: bool = False,
explore: bool = False,
mock: bool = False,
no_dep: bool = False,
nc: bool = False,
category: Optional[list[str]] = None,
skip_category: Optional[list[str]] = None,
test: Optional[str] = None,
suite: Optional[str] = None,
cutoff: Optional[int] = None,
server: bool = False,
) -> int:
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
# Check if configuration file exists and is not empty
try:
with open(agent_benchmark_config_path, "r") as f:
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
except json.JSONDecodeError:
print("Error: benchmark_config.json is not a valid JSON file.")
return 1
if maintain and improve and explore:
print(
"Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
)
return 1
if test and (category or skip_category or maintain or improve or suite or explore):
print(
"Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
)
return 1
# TODO: test and ensure that this functionality works before removing
# change elif suite below if removing
if suite and (category or skip_category or maintain or improve or explore):
print(
"Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite."
)
return 1
assert not(agent_benchmark_config.api_mode and not agent_benchmark_config.host), \
"Error: host needs to be added to the config if api_mode is set to True."
print("Current configuration:")
for key, value in vars(agent_benchmark_config).items():
print(f"{key}: {value}")
pytest_args = ["-vs"]
pytest_args.extend(["--agent_config_path", agent_benchmark_config_path])
if test:
print("Running specific test:", test)
pytest_args.extend(["-k", test, "--test"])
elif suite:
print("Running specific suite:", suite)
pytest_args.extend(["--suite"])
else:
# Categories that are used in the challenges
categories = get_unique_categories()
if category:
invalid_categories = set(category) - categories
assert (
not invalid_categories
), f"Invalid categories: {invalid_categories}. Valid categories are: {categories}"
if category:
categories_to_run = set(category)
if skip_category:
categories_to_run = categories_to_run.difference(set(skip_category))
assert categories_to_run, "Error: You can't skip all categories"
pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
print("Running tests of category:", categories_to_run)
elif skip_category:
categories_to_run = categories - set(skip_category)
assert categories_to_run, "Error: You can't skip all categories"
pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
print("Running tests of category:", categories_to_run)
else:
print("Running all categories")
if maintain:
print("Running only regression tests")
pytest_args.append("--maintain")
elif improve:
print("Running only non-regression tests")
pytest_args.append("--improve")
elif explore:
print("Only attempt challenges that have never been beaten")
pytest_args.append("--explore")
if mock:
pytest_args.append("--mock")
if no_dep:
pytest_args.append("--no_dep")
if nc and cutoff:
print(
"Error: You can't use both --nc and --cutoff at the same time. Please choose one."
)
return 1
if nc:
pytest_args.append("--nc")
if cutoff:
pytest_args.append("--cutoff")
print(f"Setting cuttoff override to {cutoff} seconds.")
current_dir = Path(__file__).resolve().parent
print(f"Current directory: {current_dir}")
pytest_args.extend((str(current_dir), "--cache-clear"))
return pytest.main(pytest_args)
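# Worked example (hypothetical path and values, assuming the config file exists and
# "code" is one of the discovered categories):
#   run_benchmark("agent/benchmark_config.json", category=["code"], mock=True)
# builds pytest arguments roughly equivalent to:
#   pytest -vs --agent_config_path agent/benchmark_config.json \
#          -m code --category --mock <this directory> --cache-clear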
@click.group()
def cli() -> None:
pass
@cli.command()
@click.option("--backend", is_flag=True, help="If it's being run from the cli")
@click.option("-c", "--category", multiple=True, help="Specific category to run")
@click.option(
"-s",
"--skip-category",
multiple=True,
help="Skips preventing the tests from this category from running",
)
@click.option("--test", help="Specific test to run")
@click.option("--maintain", is_flag=True, help="Runs only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option(
"--explore",
is_flag=True,
help="Only attempt challenges that have never been beaten",
)
@click.option("--mock", is_flag=True, help="Run with mock")
@click.option("--suite", help="Run a suite of related tests")
@click.option(
"--no_dep",
is_flag=True,
help="Run without dependencies (can be useful for a suite run)",
)
@click.option("--nc", is_flag=True, help="Run without cutoff")
@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
@click.option("--agent-config", type=click.Path(exists=True), help="Path to the agent benchmark_config.json file,", required=True)
def start(
maintain: bool,
improve: bool,
explore: bool,
mock: bool,
no_dep: bool,
nc: bool,
agent_config: click.Path,
category: Optional[list[str]] = None,
skip_category: Optional[list[str]] = None,
test: Optional[str] = None,
suite: Optional[str] = None,
cutoff: Optional[int] = None,
backend: Optional[bool] = False,
) -> Any:
# Redirect stdout if backend is True
original_stdout = sys.stdout # Save the original standard output
exit_code = None
assert "benchmark_config.json" in agent_config, "benchmark_config.json must be provided"
if backend:
with open("backend/backend_stdout.txt", "w") as f:
sys.stdout = f
exit_code = run_benchmark(
agent_benchmark_config_path=agent_config,
maintain=maintain,
improve=improve,
explore=explore,
mock=mock,
no_dep=no_dep,
nc=nc,
category=category,
skip_category=skip_category,
test=test,
suite=suite,
cutoff=cutoff,
)
sys.stdout = original_stdout
else:
exit_code = run_benchmark(
agent_benchmark_config_path=agent_config,
maintain=maintain,
improve=improve,
explore=explore,
mock=mock,
no_dep=no_dep,
nc=nc,
category=category,
skip_category=skip_category,
test=test,
suite=suite,
cutoff=cutoff,
)
sys.exit(exit_code)
@cli.command()
def version():
"""Print the version of the benchmark tool."""
current_directory = Path(__file__).resolve().parent
version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"]["version"]
print(f"Benchmark Tool Version {version}")
if __name__ == "__main__":
cli()
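A minimal usage sketch, assuming this module is importable (the import path below is hypothetical); it drives the Click group in-process with click's test runner instead of spawning a subprocess:

from click.testing import CliRunner

# Hypothetical import path for the module above; adjust to its real location.
from benchmark.__main__ import cli

runner = CliRunner()
# Print the CLI's help text for the start command without launching a benchmark run.
result = runner.invoke(cli, ["start", "--help"])
print(result.output)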

View File

@@ -5,8 +5,8 @@ from typing import Any, Dict, Optional
from agent_protocol_client import AgentApi, ApiClient, Configuration, TaskRequestBody
-from agbenchmark.agent_interface import get_list_of_file_paths
-from agbenchmark.utils.data_types import ChallengeData
+from benchmark.agent_interface import get_list_of_file_paths
+from benchmark.utils.data_types import ChallengeData
async def run_api_agent(

View File

@@ -12,7 +12,6 @@ from typing import Any, List
import psutil
from dotenv import load_dotenv
-import agbenchmark.start_benchmark
load_dotenv()
@@ -77,7 +76,7 @@ def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
def run_agent(task: str, timeout: int) -> None:
"""Calling to get a response"""
entry_path = "agbenchmark.benchmarks"
entry_path = "benchmark.benchmarks"
print(f"Running '{entry_path}' with timeout {timeout}")
@@ -87,7 +86,7 @@ def run_agent(task: str, timeout: int) -> None:
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
-cwd=agbenchmark.start_benchmark.HOME_DIRECTORY,
+cwd=benchmark.start_benchmark.HOME_DIRECTORY,
bufsize=1,
)
@@ -109,7 +108,7 @@ def get_list_of_file_paths(
) -> List[str]:
# this file is at agbenchmark\agent_interface.py
source_dir = os.path.join(
-agbenchmark.start_benchmark.CURRENT_DIRECTORY,
+benchmark.start_benchmark.CURRENT_DIRECTORY,
"..",
challenge_dir_path,
artifact_folder_name,

Some files were not shown because too many files have changed in this diff.