Fixing benchmarks
benchmark/benchmark/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
# import pydevd_pycharm

# pydevd_pycharm.settrace(
#     "localhost", port=9739, stdoutToServer=True, stderrToServer=True
# )
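The commented-out block above is a PyCharm remote-debugging hook. A hedged sketch of how it could be made opt-in via an environment variable (the variable name AGBENCHMARK_DEBUG is an assumption for illustration, not something this commit defines):

# Hypothetical opt-in variant of the debug hook; the env var name is illustrative.
import os

if os.environ.get("AGBENCHMARK_DEBUG"):
    import pydevd_pycharm

    # Attach to a PyCharm debug server listening on localhost:9739
    pydevd_pycharm.settrace(
        "localhost", port=9739, stdoutToServer=True, stderrToServer=True
    )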
benchmark/benchmark/__main__.py (new file, 274 lines)
@@ -0,0 +1,274 @@
import glob
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

import toml

import click
import pytest
from helicone.lock import HeliconeLockManager

from benchmark.utils.data_types import AgentBenchmarkConfig

BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")

if os.environ.get("HELICONE_API_KEY"):
    HeliconeLockManager.write_custom_property(
        "benchmark_start_time", BENCHMARK_START_TIME
    )

with open(
    Path(__file__).resolve().parent / "challenges" / "optional_categories.json"
) as f:
    OPTIONAL_CATEGORIES = json.load(f)["optional_categories"]


def get_unique_categories() -> set[str]:
    """Find all data.json files in the directory relative to this file and its
    subdirectories, read the "category" field from each file, and return a set
    of unique categories."""
    categories = set()

    # Get the directory of this file and use it as the base for the glob pattern
    this_dir = os.path.dirname(os.path.abspath(__file__))

    glob_path = os.path.join(this_dir, "./challenges/**/data.json")
    for data_file in glob.glob(glob_path, recursive=True):
        with open(data_file, "r") as f:
            try:
                data = json.load(f)
                categories.update(data.get("category", []))
            except json.JSONDecodeError:
                print(f"Error: {data_file} is not a valid JSON file.")
                continue
            except IOError:
                print(f"IOError: file could not be read: {data_file}")
                continue

    return categories


def run_benchmark(
    agent_benchmark_config_path: str,
    maintain: bool = False,
    improve: bool = False,
    explore: bool = False,
    mock: bool = False,
    no_dep: bool = False,
    nc: bool = False,
    category: Optional[list[str]] = None,
    skip_category: Optional[list[str]] = None,
    test: Optional[str] = None,
    suite: Optional[str] = None,
    cutoff: Optional[int] = None,
    server: bool = False,
) -> int:
    """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
    # Check that the configuration file exists and is valid JSON
    try:
        with open(agent_benchmark_config_path, "r") as f:
            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
    except json.JSONDecodeError:
        print("Error: benchmark_config.json is not a valid JSON file.")
        return 1

    if maintain and improve and explore:
        print(
            "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
        )
        return 1

    if test and (category or skip_category or maintain or improve or suite or explore):
        print(
            "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
        )
        return 1

    # TODO: test and ensure that this functionality works before removing
    # change elif suite below if removing
    if suite and (category or skip_category or maintain or improve or explore):
        print(
            "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite."
        )
        return 1

    assert not (agent_benchmark_config.api_mode and not agent_benchmark_config.host), \
        "Error: host needs to be added to the config if api_mode is set to True."

    print("Current configuration:")
    for key, value in vars(agent_benchmark_config).items():
        print(f"{key}: {value}")

    pytest_args = ["-vs"]
    pytest_args.extend(["--agent_config_path", agent_benchmark_config_path])
    if test:
        print("Running specific test:", test)
        pytest_args.extend(["-k", test, "--test"])
    elif suite:
        print("Running specific suite:", suite)
        pytest_args.extend(["--suite"])
    else:
        # Categories that are used in the challenges
        categories = get_unique_categories()
        if category:
            invalid_categories = set(category) - categories
            assert (
                not invalid_categories
            ), f"Invalid categories: {invalid_categories}. Valid categories are: {categories}"

        if category:
            categories_to_run = set(category)
            if skip_category:
                categories_to_run = categories_to_run.difference(set(skip_category))
                assert categories_to_run, "Error: You can't skip all categories"
            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
            print("Running tests of category:", categories_to_run)
        elif skip_category:
            categories_to_run = categories - set(skip_category)
            assert categories_to_run, "Error: You can't skip all categories"
            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
            print("Running tests of category:", categories_to_run)
        else:
            print("Running all categories")

        if maintain:
            print("Running only regression tests")
            pytest_args.append("--maintain")
        elif improve:
            print("Running only non-regression tests")
            pytest_args.append("--improve")
        elif explore:
            print("Only attempt challenges that have never been beaten")
            pytest_args.append("--explore")

    if mock:
        pytest_args.append("--mock")

    if no_dep:
        pytest_args.append("--no_dep")

    if nc and cutoff:
        print(
            "Error: You can't use both --nc and --cutoff at the same time. Please choose one."
        )
        return 1

    if nc:
        pytest_args.append("--nc")
    if cutoff:
        pytest_args.append("--cutoff")
        print(f"Setting cutoff override to {cutoff} seconds.")
    current_dir = Path(__file__).resolve().parent
    print(f"Current directory: {current_dir}")
    pytest_args.extend((str(current_dir), "--cache-clear"))
    return pytest.main(pytest_args)


@click.group()
def cli() -> None:
    pass


@cli.command()
@click.option("--backend", is_flag=True, help="If it's being run from the cli")
@click.option("-c", "--category", multiple=True, help="Specific category to run")
@click.option(
    "-s",
    "--skip-category",
    multiple=True,
    help="Skip tests from this category",
)
@click.option("--test", help="Specific test to run")
@click.option("--maintain", is_flag=True, help="Run only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option(
    "--explore",
    is_flag=True,
    help="Only attempt challenges that have never been beaten",
)
@click.option("--mock", is_flag=True, help="Run with mock")
@click.option("--suite", help="Run a suite of related tests")
@click.option(
    "--no_dep",
    is_flag=True,
    help="Run without dependencies (can be useful for a suite run)",
)
@click.option("--nc", is_flag=True, help="Run without cutoff")
@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
@click.option(
    "--agent-config",
    type=click.Path(exists=True),
    help="Path to the agent's benchmark_config.json file",
    required=True,
)
def start(
    maintain: bool,
    improve: bool,
    explore: bool,
    mock: bool,
    no_dep: bool,
    nc: bool,
    agent_config: click.Path,
    category: Optional[list[str]] = None,
    skip_category: Optional[list[str]] = None,
    test: Optional[str] = None,
    suite: Optional[str] = None,
    cutoff: Optional[int] = None,
    backend: Optional[bool] = False,
) -> Any:
    # Redirect stdout if backend is True
    original_stdout = sys.stdout  # Save the original standard output
    exit_code = None

    assert "benchmark_config.json" in agent_config, "benchmark_config.json must be provided"

    if backend:
        with open("backend/backend_stdout.txt", "w") as f:
            sys.stdout = f
            exit_code = run_benchmark(
                agent_benchmark_config_path=agent_config,
                maintain=maintain,
                improve=improve,
                explore=explore,
                mock=mock,
                no_dep=no_dep,
                nc=nc,
                category=category,
                skip_category=skip_category,
                test=test,
                suite=suite,
                cutoff=cutoff,
            )

        sys.stdout = original_stdout

    else:
        exit_code = run_benchmark(
            agent_benchmark_config_path=agent_config,
            maintain=maintain,
            improve=improve,
            explore=explore,
            mock=mock,
            no_dep=no_dep,
            nc=nc,
            category=category,
            skip_category=skip_category,
            test=test,
            suite=suite,
            cutoff=cutoff,
        )

    sys.exit(exit_code)


@cli.command()
def version():
    """Print the version of the benchmark tool."""
    current_directory = Path(__file__).resolve().parent
    version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"]["version"]
    print(f"Benchmark Tool Version {version}")


if __name__ == "__main__":
    cli()
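Taken together, run_benchmark validates the flag combinations, loads the agent's benchmark_config.json into an AgentBenchmarkConfig, assembles pytest arguments, and hands them to pytest.main. A minimal sketch of driving it without the CLI wrapper, assuming the benchmark package is installed; the config path is hypothetical, and only "api_mode" and "host" are known from the attribute accesses above, so AgentBenchmarkConfig may well require additional fields:

# Illustrative sketch only, not code from this commit.
import json
from pathlib import Path

from benchmark.__main__ import run_benchmark

config_path = Path("agent/benchmark_config.json")  # hypothetical location
config_path.parent.mkdir(parents=True, exist_ok=True)
config_path.write_text(json.dumps({"api_mode": False, "host": None}))

# Roughly equivalent to:
#   python -m benchmark start --agent-config agent/benchmark_config.json --mock
exit_code = run_benchmark(
    agent_benchmark_config_path=str(config_path),
    mock=True,
)
print("benchmark exit code:", exit_code)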
@@ -5,8 +5,8 @@ from typing import Any, Dict, Optional
 
 from agent_protocol_client import AgentApi, ApiClient, Configuration, TaskRequestBody
 
-from agbenchmark.agent_interface import get_list_of_file_paths
-from agbenchmark.utils.data_types import ChallengeData
+from benchmark.agent_interface import get_list_of_file_paths
+from benchmark.utils.data_types import ChallengeData
 
 
 async def run_api_agent(
@@ -12,7 +12,6 @@ from typing import Any, List
 import psutil
 from dotenv import load_dotenv
 
-import agbenchmark.start_benchmark
 
 load_dotenv()
 
@@ -77,7 +76,7 @@ def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
 def run_agent(task: str, timeout: int) -> None:
     """Calling to get a response"""
 
-    entry_path = "agbenchmark.benchmarks"
+    entry_path = "benchmark.benchmarks"
 
     print(f"Running '{entry_path}' with timeout {timeout}")
 
@@ -87,7 +86,7 @@ def run_agent(task: str, timeout: int) -> None:
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
         universal_newlines=True,
-        cwd=agbenchmark.start_benchmark.HOME_DIRECTORY,
+        cwd=benchmark.start_benchmark.HOME_DIRECTORY,
         bufsize=1,
     )
 
@@ -109,7 +108,7 @@ def get_list_of_file_paths(
 ) -> List[str]:
     # this file is at agbenchmark\agent_interface.py
     source_dir = os.path.join(
-        agbenchmark.start_benchmark.CURRENT_DIRECTORY,
+        benchmark.start_benchmark.CURRENT_DIRECTORY,
         "..",
         challenge_dir_path,
         artifact_folder_name,
Some files were not shown because too many files have changed in this diff.
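Every hunk above applies the same rename, agbenchmark to benchmark, to imports and module references. For downstream code that has to run against either package name, one possible compatibility shim (illustrative only, not part of this commit):

# Prefer the new package name, fall back to the old one if it isn't installed.
try:
    from benchmark.utils.data_types import ChallengeData
except ImportError:  # older installs still expose the agbenchmark package
    from agbenchmark.utils.data_types import ChallengeData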