From c73e90c4e6ceef0d5b6e69ba8af2a30bb14dddc9 Mon Sep 17 00:00:00 2001 From: SwiftyOS Date: Mon, 11 Sep 2023 17:23:38 +0200 Subject: [PATCH] Fixing benchmarks --- .../{agbenchmark => benchmark}/README.md | 0 benchmark/benchmark/__init__.py | 5 + benchmark/benchmark/__main__.py | 274 ++++++++++++++++++ .../agent_api_interface.py | 4 +- .../agent_interface.py | 7 +- benchmark/{agbenchmark => benchmark}/app.py | 0 .../challenges/CHALLENGE.md | 0 .../challenges/README.md | 0 .../challenges/SUITES.md | 0 .../challenges/__init__.py | 0 .../read_file/artifacts_in/file_to_read.txt | 0 .../read_file/artifacts_out/file_to_check.txt | 0 .../read_file/artifacts_out/output.txt | 0 .../challenges/abilities/read_file/data.json | 0 .../write_file/artifacts_out/random_file.txt | 0 .../challenges/abilities/write_file/data.json | 0 .../artifacts_in/instructions.txt | 0 .../1_distraction/artifacts_out/goal.txt | 0 .../goal_loss/1_distraction/data.json | 0 .../2_injection/artifacts_in/instructions.txt | 0 .../artifacts_in/instructions_2.txt | 0 .../2_injection/artifacts_out/goal.txt | 0 .../alignment/goal_loss/2_injection/data.json | 0 .../challenges/alignment/goal_loss/suite.json | 0 .../a1_debug/artifacts_in/__init__.py | 0 .../a1_debug/artifacts_in/sample_code.py | 0 .../a1_debug/artifacts_in/test.py | 0 .../a1_debug/artifacts_out/__init__.py | 0 .../a1_debug/artifacts_out/sample_code.py | 0 .../a1_debug/artifacts_out/test.py | 0 .../adapatability/a1_debug/data.json | 0 .../artifacts_out/random_file.txt | 0 .../adapatability/a2_tesla_revenue/data.json | 0 .../artifacts_out/random_file.txt | 0 .../adapatability/a3_book_price/data.json | 0 .../1_return/artifacts_in/__init__.py | 0 .../1_return/artifacts_in/sample_code.py | 0 .../1_return/artifacts_in/test.py | 0 .../1_return/artifacts_out/__init__.py | 0 .../1_return/artifacts_out/sample_code.py | 0 .../1_return/artifacts_out/test.py | 0 .../c1_writing_suite_1/1_return/data.json | 0 .../2_write/artifacts_in/__init__.py | 0 
.../2_write/artifacts_in/sample_code.py | 0 .../2_write/artifacts_in/test.py | 0 .../2_write/artifacts_out/__init__.py | 0 .../2_write/artifacts_out/sample_code.py | 0 .../2_write/artifacts_out/test.py | 0 .../code/c1_writing_suite_1/2_write/data.json | 0 .../3_modify/artifacts_in/__init__.py | 0 .../3_modify/artifacts_in/sample_code.py | 0 .../3_modify/artifacts_in/test.py | 0 .../3_modify/artifacts_out/__init__.py | 0 .../3_modify/artifacts_out/sample_code.py | 0 .../3_modify/artifacts_out/test.py | 0 .../c1_writing_suite_1/3_modify/data.json | 0 .../4_tests/artifacts_in/__init__.py | 0 .../4_tests/artifacts_in/sample_code.py | 0 .../4_tests/artifacts_in/testfile.py | 0 .../4_tests/artifacts_out/__init__.py | 0 .../4_tests/artifacts_out/sample_code.py | 0 .../4_tests/artifacts_out/testfile.py | 0 .../4_tests/custom_python/test.py | 0 .../code/c1_writing_suite_1/4_tests/data.json | 0 .../code/c1_writing_suite_1/suite.json | 0 .../d2.1_guided/artifacts_in/__init__.py | 0 .../d2.1_guided/artifacts_in/sample_code.py | 0 .../d2.1_guided/artifacts_in/test.py | 0 .../d2.1_guided/artifacts_out/__init__.py | 0 .../d2.1_guided/artifacts_out/sample_code.py | 0 .../d2.1_guided/artifacts_out/test.py | 0 .../code/c2_debug_suite/d2.1_guided/data.json | 0 .../d2.2_vague/artifacts_in/__init__.py | 0 .../d2.2_vague/artifacts_in/sample_code.py | 0 .../d2.2_vague/artifacts_in/test.py | 0 .../d2.2_vague/artifacts_out/__init__.py | 0 .../d2.2_vague/artifacts_out/sample_code.py | 0 .../d2.2_vague/artifacts_out/test.py | 0 .../code/c2_debug_suite/d2.2_vague/data.json | 0 .../d2.3_import/artifacts_in/__init__.py | 0 .../d2.3_import/artifacts_in/sample_code.py | 0 .../d2.3_import/artifacts_in/test.py | 0 .../d2.3_import/artifacts_out/__init__.py | 0 .../d2.3_import/artifacts_out/sample_code.py | 0 .../d2.3_import/artifacts_out/test.py | 0 .../code/c2_debug_suite/d2.3_import/data.json | 0 .../d3.1_three_sum/artifacts_out/__init__.py | 0 .../artifacts_out/sample_code.py | 0 
.../d3.1_three_sum/custom_python/test.py | 0 .../d3.1_three_sum/data.json | 0 .../d3_two_sum/artifacts_out/__init__.py | 0 .../d3_two_sum/artifacts_out/sample_code.py | 0 .../d3_two_sum/custom_python/test.py | 0 .../c3_writing_suite_2/d3_two_sum/data.json | 0 .../artifacts_out/__init__.py | 0 .../artifacts_out/password_generator.py | 0 .../custom_python/test.py | 0 .../1_password_generator/data.json | 0 .../artifacts_out/__init__.py | 0 .../artifacts_out/organize_files.py | 0 .../2_file_organizer/custom_python/test.py | 0 .../2_file_organizer/data.json | 0 .../code/c4_writing_cli_suite_3/suite.json | 0 .../artifacts_out/animal_list.html | 0 .../1_list_animals/custom_python/test.py | 0 .../c5_web_app_suite/1_list_animals/data.json | 0 .../code/c5_web_app_suite/suite.json | 0 .../2_plan/artifacts_out/output.txt | 0 .../deprecated/content_gen/2_plan/data.json | 0 .../d2.1_guided/artifacts_in/__init__.py | 0 .../d2.1_guided/artifacts_in/sample_code.py | 0 .../d2.1_guided/artifacts_in/test.py | 0 .../d2.1_guided/artifacts_out/__init__.py | 0 .../d2.1_guided/artifacts_out/sample_code.py | 0 .../d2.1_guided/artifacts_out/test.py | 0 .../deprecated/d2.1_guided/data.json | 0 .../read_file/artifacts_in/file_to_read.txt | 0 .../read_file/artifacts_out/file_to_check.txt | 0 .../read_file/artifacts_out/output.txt | 0 .../deprecated/interface/read_file/data.json | 0 .../search/artifacts_out/random_file.txt | 0 .../deprecated/interface/search/data.json | 0 .../write_file/artifacts_out/random_file.txt | 0 .../deprecated/interface/write_file/data.json | 0 .../m1_id/artifacts_in/instructions_1.txt | 0 .../m1_id/artifacts_in/instructions_2.txt | 0 .../m1_id/artifacts_in/instructions_3.txt | 0 .../m1_id/artifacts_in/instructions_4.txt | 0 .../m1_id/artifacts_in/instructions_5.txt | 0 .../memory/m1_id/artifacts_out/result.txt | 0 .../deprecated/memory/m1_id/data.json | 0 .../artifacts_in/instructions_1.txt | 0 .../artifacts_in/instructions_2.txt | 0 .../artifacts_in/instructions_3.txt 
| 0 .../artifacts_in/instructions_4.txt | 0 .../artifacts_in/instructions_5.txt | 0 .../m2_multiple/artifacts_out/result.txt | 0 .../deprecated/memory/m2_multiple/data.json | 0 .../m3_noise/artifacts_in/instructions_1.txt | 0 .../m3_noise/artifacts_in/instructions_2.txt | 0 .../m3_noise/artifacts_in/instructions_3.txt | 0 .../m3_noise/artifacts_in/instructions_4.txt | 0 .../m3_noise/artifacts_in/instructions_5.txt | 0 .../memory/m3_noise/artifacts_out/result.txt | 0 .../deprecated/memory/m3_noise/data.json | 0 .../artifacts_in/instructions_1.txt | 0 .../artifacts_in/instructions_2.txt | 0 .../artifacts_in/instructions_3.txt | 0 .../artifacts_in/instructions_4.txt | 0 .../artifacts_in/instructions_5.txt | 0 .../m4_phrases/artifacts_out/result.txt | 0 .../deprecated/memory/m4_phrases/data.json | 0 .../artifacts_out/random_file.txt | 0 .../retrieval/r1_book_price/data.json | 0 .../1_tesla_revenue/data.json | 0 .../r2_search_suite_1/2_specific/data.json | 0 .../r2_search_suite_1/3_formatting/data.json | 0 .../artifacts_out/random_file.txt | 0 .../retrieval/r2_search_suite_1/suite.json | 0 .../r3/artifacts_out/random_file.txt | 0 .../deprecated/retrieval/r3/data.json | 0 .../1_simple/artifacts_in/instructions.txt | 0 .../1_simple/artifacts_out/goal.txt | 0 .../safety/s1_loss_suite_1/1_simple/data.json | 0 .../2_medium/artifacts_in/instructions.txt | 0 .../2_medium/artifacts_out/goal.txt | 0 .../safety/s1_loss_suite_1/2_medium/data.json | 0 .../3_advanced/artifacts_in/instructions.txt | 0 .../artifacts_in/instructions_2.txt | 0 .../3_advanced/artifacts_out/goal.txt | 0 .../s1_loss_suite_1/3_advanced/data.json | 0 .../4_hard/artifacts_in/instructions.txt | 0 .../4_hard/artifacts_in/instructions_2.txt | 0 .../4_hard/artifacts_out/goal.txt | 0 .../safety/s1_loss_suite_1/4_hard/data.json | 0 .../safety/s2_divergence/artifacts_out/1.txt | 0 .../safety/s2_divergence/artifacts_out/2.txt | 0 .../safety/s2_divergence/artifacts_out/3.txt | 0 
.../safety/s2_divergence/artifacts_out/4.txt | 0 .../safety/s2_divergence/artifacts_out/5.txt | 0 .../s2_divergence/custom_python/test.py | 0 .../deprecated/safety/s2_divergence/data.json | 0 .../safety/s2_divergence/data_draft.json | 0 .../s3_instructions/artifacts_out/1.txt | 0 .../s3_instructions/artifacts_out/2.txt | 0 .../s3_instructions/artifacts_out/3.txt | 0 .../s3_instructions/artifacts_out/4.txt | 0 .../s3_instructions/artifacts_out/5.txt | 0 .../s3_instructions/artifacts_out/6.txt | 0 .../s3_instructions/custom_python/test.py | 0 .../safety/s3_instructions/data.json | 0 .../safety/s3_instructions/data_draft.json | 0 .../challenges/library/README.md | 0 .../check_price/artifacts_in/__init__.py | 0 .../check_price/artifacts_in/sample_code.py | 0 .../ethereum/check_price/artifacts_in/test.py | 0 .../check_price/artifacts_out/__init__.py | 0 .../check_price/artifacts_out/sample_code.py | 0 .../check_price/artifacts_out/test.py | 0 .../library/ethereum/check_price/data.json} | 0 .../ethereum/check_price/data_draft.json | 21 ++ .../challenges/optional_categories.json | 0 .../1_three_sum/artifacts_out/__init__.py | 0 .../1_three_sum/artifacts_out/sample_code.py | 0 .../code/1_three_sum/custom_python/test.py | 0 .../verticals/code/1_three_sum/data.json | 0 .../artifacts_out/__init__.py | 0 .../artifacts_out/password_generator.py | 0 .../custom_python/test.py | 0 .../code/2_password_generator/data.json | 0 .../artifacts_out/__init__.py | 0 .../artifacts_out/organize_files.py | 0 .../3_file_organizer/custom_python/test.py | 0 .../verticals/code/3_file_organizer/data.json | 0 .../4_url_shortener/artifacts_out/__init__.py | 0 .../4_url_shortener/artifacts_out/test.py | 0 .../artifacts_out/url_shortener.py | 0 .../verticals/code/4_url_shortener/data.json | 0 .../5_tic_tac_toe/artifacts_out/__init__.py | 0 .../artifacts_out/tic_tac_toe.py | 0 .../code/5_tic_tac_toe/custom_python/test.py | 0 .../code/5_tic_tac_toe/data_draft.json | 0 
.../6_battleship/artifacts_in/__init__.py | 0 .../artifacts_in/abstract_class.py | 0 .../6_battleship/artifacts_in/conftest.py | 0 .../artifacts_in/product_requirements.txt | 0 .../artifacts_in/test_negative.py | 0 .../artifacts_in/test_positive.py | 0 .../artifacts_in/user_stories.txt | 0 .../6_battleship/artifacts_out/__init__.py | 0 .../artifacts_out/abstract_class.py | 0 .../6_battleship/artifacts_out/battleship.py | 0 .../6_battleship/artifacts_out/conftest.py | 0 .../artifacts_out/test_negative.py | 0 .../artifacts_out/test_positive.py | 0 .../code/6_battleship/data_draft.json | 0 .../basic/artifacts_out/random_file.txt | 0 .../verticals/scraping/basic/data.json | 0 .../artifacts_out/random_file.txt | 0 .../scraping/r1_book_price/data.json | 0 .../1_summary/artifacts_in/challenges.txt | 0 .../1_summary/artifacts_in/companies.txt | 0 .../1_summary/artifacts_out/output.txt | 0 .../synthesize/1_summary/data_draft.json | 0 .../1_tesla_revenue/data.json | 0 .../r2_search_suite_1/2_specific/data.json | 0 .../r2_search_suite_1/3_formatting/data.json | 0 .../artifacts_out/random_file.txt | 0 .../synthesize/r2_search_suite_1/suite.json | 0 .../r3/artifacts_out/random_file.txt | 0 .../verticals/synthesize/r3/data.json | 0 .../{agbenchmark => benchmark}/conftest.py | 64 ++-- .../generate_test.py | 36 ++- .../reports/ReportManager.py | 30 +- .../reports/processing/gen_combined_chart.py | 4 +- .../reports/processing/get_files.py | 0 .../reports/processing/graphs.py | 0 .../reports/processing/process_report.py | 6 +- .../reports/processing/report_types.py | 0 .../reports/reports.py | 76 +++-- .../start_benchmark.py | 21 +- .../utils/challenge.py | 20 +- .../utils/data_types.py | 77 ++++- .../utils/dependencies/__init__.py | 1 - .../utils/dependencies/constants.py | 0 .../utils/dependencies/graphs.py | 4 +- .../utils/dependencies/main.py | 0 .../utils/dependencies/util.py | 0 .../utils/get_data_from_helicone.py | 5 +- .../utils/prompts.py | 0 .../{agbenchmark => 
benchmark}/utils/utils.py | 67 +---- benchmark/pyproject.toml | 2 - benchmark/run.sh | 0 273 files changed, 580 insertions(+), 144 deletions(-) rename benchmark/{agbenchmark => benchmark}/README.md (100%) create mode 100644 benchmark/benchmark/__init__.py create mode 100644 benchmark/benchmark/__main__.py rename benchmark/{agbenchmark => benchmark}/agent_api_interface.py (95%) rename benchmark/{agbenchmark => benchmark}/agent_interface.py (95%) rename benchmark/{agbenchmark => benchmark}/app.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/CHALLENGE.md (100%) rename benchmark/{agbenchmark => benchmark}/challenges/README.md (100%) rename benchmark/{agbenchmark => benchmark}/challenges/SUITES.md (100%) rename benchmark/{agbenchmark => benchmark}/challenges/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/abilities/read_file/artifacts_in/file_to_read.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/abilities/read_file/artifacts_out/file_to_check.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/abilities/read_file/artifacts_out/output.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/abilities/read_file/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/abilities/write_file/artifacts_out/random_file.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/abilities/write_file/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/1_distraction/artifacts_in/instructions.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/1_distraction/artifacts_out/goal.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/1_distraction/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions.txt (100%) rename benchmark/{agbenchmark => 
benchmark}/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions_2.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/2_injection/artifacts_out/goal.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/2_injection/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/suite.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/artifacts_in/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/artifacts_in/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/artifacts_in/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/artifacts_out/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a2_tesla_revenue/artifacts_out/random_file.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a2_tesla_revenue/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a3_book_price/artifacts_out/random_file.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a3_book_price/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py (100%) 
rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/test.py (100%) rename benchmark/{agbenchmark => 
benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/custom_python/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/suite.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/test.py 
(100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => 
benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/custom_python/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/custom_python/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/password_generator.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/custom_python/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/data.json (100%) rename 
benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/organize_files.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/custom_python/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/suite.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c5_web_app_suite/1_list_animals/artifacts_out/animal_list.html (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c5_web_app_suite/1_list_animals/custom_python/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c5_web_app_suite/1_list_animals/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c5_web_app_suite/suite.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/content_gen/2_plan/artifacts_out/output.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/content_gen/2_plan/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/artifacts_in/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => 
benchmark}/challenges/deprecated/d2.1_guided/artifacts_out/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/read_file/artifacts_out/output.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/read_file/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/search/artifacts_out/random_file.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/search/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/write_file/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/artifacts_in/instructions_1.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/artifacts_in/instructions_2.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/artifacts_in/instructions_3.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/artifacts_in/instructions_4.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/artifacts_in/instructions_5.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/artifacts_out/result.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_1.txt 
(100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_2.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_3.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_4.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_5.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/artifacts_out/result.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_1.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_2.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_3.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_4.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_5.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/artifacts_out/result.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_1.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_2.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_3.txt (100%) rename benchmark/{agbenchmark => 
benchmark}/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_4.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_5.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/artifacts_out/result.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r1_book_price/artifacts_out/random_file.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r1_book_price/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r2_search_suite_1/1_tesla_revenue/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r2_search_suite_1/2_specific/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r2_search_suite_1/3_formatting/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r2_search_suite_1/artifacts_out/random_file.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r2_search_suite_1/suite.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r3/artifacts_out/random_file.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r3/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_in/instructions.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_out/goal.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/1_simple/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_in/instructions.txt 
(100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_out/goal.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/2_medium/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions_2.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_out/goal.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions_2.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_out/goal.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/4_hard/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/artifacts_out/1.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/artifacts_out/2.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/artifacts_out/3.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/artifacts_out/4.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/artifacts_out/5.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/custom_python/test.py (100%) rename benchmark/{agbenchmark => 
benchmark}/challenges/deprecated/safety/s2_divergence/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/data_draft.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/artifacts_out/1.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/artifacts_out/2.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/artifacts_out/3.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/artifacts_out/4.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/artifacts_out/5.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/artifacts_out/6.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/custom_python/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/data_draft.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/library/README.md (100%) rename benchmark/{agbenchmark => benchmark}/challenges/library/ethereum/check_price/artifacts_in/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/library/ethereum/check_price/artifacts_in/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/library/ethereum/check_price/artifacts_in/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/library/ethereum/check_price/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/library/ethereum/check_price/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/library/ethereum/check_price/artifacts_out/test.py (100%) rename 
benchmark/{agbenchmark/challenges/library/ethereum/check_price/data_draft.json => benchmark/challenges/library/ethereum/check_price/data.json} (100%) create mode 100644 benchmark/benchmark/challenges/library/ethereum/check_price/data_draft.json rename benchmark/{agbenchmark => benchmark}/challenges/optional_categories.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/1_three_sum/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/1_three_sum/artifacts_out/sample_code.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/1_three_sum/custom_python/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/1_three_sum/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/2_password_generator/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/2_password_generator/custom_python/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/2_password_generator/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/3_file_organizer/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/3_file_organizer/artifacts_out/organize_files.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/3_file_organizer/custom_python/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/3_file_organizer/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/4_url_shortener/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/4_url_shortener/artifacts_out/test.py (100%) rename benchmark/{agbenchmark => 
benchmark}/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/4_url_shortener/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/5_tic_tac_toe/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/5_tic_tac_toe/data_draft.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/abstract_class.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/conftest.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/product_requirements.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/test_positive.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/user_stories.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_out/__init__.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_out/abstract_class.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_out/battleship.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_out/conftest.py (100%) rename benchmark/{agbenchmark => 
benchmark}/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_out/test_positive.py (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/data_draft.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/scraping/basic/artifacts_out/random_file.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/scraping/basic/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/scraping/r1_book_price/artifacts_out/random_file.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/scraping/r1_book_price/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/1_summary/artifacts_in/challenges.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/1_summary/artifacts_in/companies.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/1_summary/artifacts_out/output.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/1_summary/data_draft.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r2_search_suite_1/1_tesla_revenue/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r2_search_suite_1/2_specific/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r2_search_suite_1/3_formatting/data.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r2_search_suite_1/artifacts_out/random_file.txt (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r2_search_suite_1/suite.json (100%) rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r3/artifacts_out/random_file.txt (100%) rename benchmark/{agbenchmark => 
benchmark}/challenges/verticals/synthesize/r3/data.json (100%) rename benchmark/{agbenchmark => benchmark}/conftest.py (78%) rename benchmark/{agbenchmark => benchmark}/generate_test.py (88%) rename benchmark/{agbenchmark => benchmark}/reports/ReportManager.py (74%) rename benchmark/{agbenchmark => benchmark}/reports/processing/gen_combined_chart.py (91%) rename benchmark/{agbenchmark => benchmark}/reports/processing/get_files.py (100%) rename benchmark/{agbenchmark => benchmark}/reports/processing/graphs.py (100%) rename benchmark/{agbenchmark => benchmark}/reports/processing/process_report.py (91%) rename benchmark/{agbenchmark => benchmark}/reports/processing/report_types.py (100%) rename benchmark/{agbenchmark => benchmark}/reports/reports.py (78%) rename benchmark/{agbenchmark => benchmark}/start_benchmark.py (95%) rename benchmark/{agbenchmark => benchmark}/utils/challenge.py (95%) rename benchmark/{agbenchmark => benchmark}/utils/data_types.py (73%) rename benchmark/{agbenchmark => benchmark}/utils/dependencies/__init__.py (99%) rename benchmark/{agbenchmark => benchmark}/utils/dependencies/constants.py (100%) rename benchmark/{agbenchmark => benchmark}/utils/dependencies/graphs.py (98%) rename benchmark/{agbenchmark => benchmark}/utils/dependencies/main.py (100%) rename benchmark/{agbenchmark => benchmark}/utils/dependencies/util.py (100%) rename benchmark/{agbenchmark => benchmark}/utils/get_data_from_helicone.py (92%) rename benchmark/{agbenchmark => benchmark}/utils/prompts.py (100%) rename benchmark/{agbenchmark => benchmark}/utils/utils.py (80%) mode change 100644 => 100755 benchmark/run.sh diff --git a/benchmark/agbenchmark/README.md b/benchmark/benchmark/README.md similarity index 100% rename from benchmark/agbenchmark/README.md rename to benchmark/benchmark/README.md diff --git a/benchmark/benchmark/__init__.py b/benchmark/benchmark/__init__.py new file mode 100644 index 00000000..e8b22704 --- /dev/null +++ b/benchmark/benchmark/__init__.py @@ -0,0 
+1,5 @@
+# import pydevd_pycharm
+
+# pydevd_pycharm.settrace(
+#     "localhost", port=9739, stdoutToServer=True, stderrToServer=True
+# )
diff --git a/benchmark/benchmark/__main__.py b/benchmark/benchmark/__main__.py
new file mode 100644
index 00000000..f7f0a77f
--- /dev/null
+++ b/benchmark/benchmark/__main__.py
@@ -0,0 +1,277 @@
+import contextlib
+import glob
+import json
+import os
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Optional
+import toml
+
+import click
+import pytest
+from helicone.lock import HeliconeLockManager
+
+from benchmark.utils.data_types import AgentBenchmarkConfig
+
+BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
+
+if os.environ.get("HELICONE_API_KEY"):
+    HeliconeLockManager.write_custom_property(
+        "benchmark_start_time", BENCHMARK_START_TIME
+    )
+
+with open(
+    Path(__file__).resolve().parent / "challenges" / "optional_categories.json"
+) as f:
+    OPTIONAL_CATEGORIES = json.load(f)["optional_categories"]
+
+
+def get_unique_categories() -> set[str]:
+    """Find all data.json files in the directory relative to this file and its subdirectories,
+    read the "category" field from each file, and return a set of unique categories."""
+    categories = set()
+
+    # Get the directory of this file
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+
+    glob_path = os.path.join(this_dir, "./challenges/**/data.json")
+    # Use it as the base for the glob pattern
+    for data_file in glob.glob(glob_path, recursive=True):
+        # Open inside the try so an unreadable file is reported and skipped
+        # rather than aborting the whole scan.
+        try:
+            with open(data_file, "r") as f:
+                data = json.load(f)
+            categories.update(data.get("category", []))
+        except json.JSONDecodeError:
+            print(f"Error: {data_file} is not a valid JSON file.")
+            continue
+        except IOError:
+            print(f"IOError: file could not be read: {data_file}")
+            continue
+
+    return categories
+
+
+def run_benchmark(
+    agent_benchmark_config_path: str,
+    maintain: bool = False,
+    improve: bool = False,
+    explore: bool = False,
+    mock: bool = False,
+    no_dep: bool = False,
+    nc: bool = False,
+    category: Optional[list[str]] = None,
+    skip_category: Optional[list[str]] = None,
+    test: Optional[str] = None,
+    suite: Optional[str] = None,
+    cutoff: Optional[int] = None,
+    server: bool = False,
+) -> int:
+    """Start the benchmark tests. If a category flag is provided, run the categories with that mark.
+
+    Returns the pytest exit code, or 1 when the config file is not valid JSON
+    or the selected options conflict. `server` is accepted but not used here.
+    """
+    # Check if configuration file exists and is not empty
+
+    try:
+        with open(agent_benchmark_config_path, "r") as f:
+            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
+            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+    except json.JSONDecodeError:
+        print("Error: benchmark_config.json is not a valid JSON file.")
+        return 1
+
+    # The three modes are mutually exclusive, so reject any two of them together.
+    if sum((maintain, improve, explore)) > 1:
+        print(
+            "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
+        )
+        return 1
+
+    if test and (category or skip_category or maintain or improve or suite or explore):
+        print(
+            "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
+        )
+        return 1
+
+    # TODO: test and ensure that this functionality works before removing
+    # change elif suite below if removing
+    if suite and (category or skip_category or maintain or improve or explore):
+        print(
+            "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite."
+        )
+        return 1
+
+    # Explicit checks instead of assert: asserts are stripped under `python -O`.
+    if agent_benchmark_config.api_mode and not agent_benchmark_config.host:
+        print("Error: host needs to be added to the config if api_mode is set to True.")
+        return 1
+
+    print("Current configuration:")
+    for key, value in vars(agent_benchmark_config).items():
+        print(f"{key}: {value}")
+
+    pytest_args = ["-vs"]
+    pytest_args.extend(["--agent_config_path", agent_benchmark_config_path])
+    if test:
+        print("Running specific test:", test)
+        pytest_args.extend(["-k", test, "--test"])
+    elif suite:
+        print("Running specific suite:", suite)
+        pytest_args.extend(["--suite"])
+    else:
+        # Categories that are used in the challenges
+        categories = get_unique_categories()
+        if category:
+            invalid_categories = set(category) - categories
+            if invalid_categories:
+                print(
+                    f"Invalid categories: {invalid_categories}. Valid categories are: {categories}"
+                )
+                return 1
+
+        if category:
+            categories_to_run = set(category)
+            if skip_category:
+                categories_to_run = categories_to_run.difference(set(skip_category))
+                if not categories_to_run:
+                    print("Error: You can't skip all categories")
+                    return 1
+            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
+            print("Running tests of category:", categories_to_run)
+        elif skip_category:
+            categories_to_run = categories - set(skip_category)
+            if not categories_to_run:
+                print("Error: You can't skip all categories")
+                return 1
+            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
+            print("Running tests of category:", categories_to_run)
+        else:
+            print("Running all categories")
+
+        if maintain:
+            print("Running only regression tests")
+            pytest_args.append("--maintain")
+        elif improve:
+            print("Running only non-regression tests")
+            pytest_args.append("--improve")
+        elif explore:
+            print("Only attempt challenges that have never been beaten")
+            pytest_args.append("--explore")
+
+    if mock:
+        pytest_args.append("--mock")
+
+    if no_dep:
+        pytest_args.append("--no_dep")
+
+    if nc and cutoff:
+        print(
+            "Error: You can't use both --nc and --cutoff at the same time. Please choose one."
+        )
+        return 1
+
+    if nc:
+        pytest_args.append("--nc")
+    if cutoff:
+        pytest_args.append("--cutoff")
+        print(f"Setting cuttoff override to {cutoff} seconds.")
+    current_dir = Path(__file__).resolve().parent
+    print(f"Current directory: {current_dir}")
+    pytest_args.extend((str(current_dir), "--cache-clear"))
+    return pytest.main(pytest_args)
+
+
+@click.group()
+def cli() -> None:
+    pass
+
+
+@cli.command()
+@click.option("--backend", is_flag=True, help="If it's being run from the cli")
+@click.option("-c", "--category", multiple=True, help="Specific category to run")
+@click.option(
+    "-s",
+    "--skip-category",
+    multiple=True,
+    help="Skips preventing the tests from this category from running",
+)
+@click.option("--test", help="Specific test to run")
+@click.option("--maintain", is_flag=True, help="Runs only regression tests")
+@click.option("--improve", is_flag=True, help="Run only non-regression tests")
+@click.option(
+    "--explore",
+    is_flag=True,
+    help="Only attempt challenges that have never been beaten",
+)
+@click.option("--mock", is_flag=True, help="Run with mock")
+@click.option("--suite", help="Run a suite of related tests")
+@click.option(
+    "--no_dep",
+    is_flag=True,
+    help="Run without dependencies (can be useful for a suite run)",
+)
+@click.option("--nc", is_flag=True, help="Run without cutoff")
+@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
+@click.option("--agent-config", type=click.Path(exists=True), help="Path to the agent benchmark_config.json file,", required=True)
+def start(
+    maintain: bool,
+    improve: bool,
+    explore: bool,
+    mock: bool,
+    no_dep: bool,
+    nc: bool,
+    agent_config: str,
+    category: Optional[list[str]] = None,
+    skip_category: Optional[list[str]] = None,
+    test: Optional[str] = None,
+    suite: Optional[str] = None,
+    cutoff: Optional[int] = None,
+    backend: Optional[bool] = False,
+) -> Any:
+    """Run the benchmark and exit with its status code."""
+    if "benchmark_config.json" not in agent_config:
+        raise click.BadParameter(
+            "benchmark_config.json must be provided", param_hint="--agent-config"
+        )
+
+    def _run() -> int:
+        return run_benchmark(
+            agent_benchmark_config_path=agent_config,
+            maintain=maintain,
+            improve=improve,
+            explore=explore,
+            mock=mock,
+            no_dep=no_dep,
+            nc=nc,
+            category=category,
+            skip_category=skip_category,
+            test=test,
+            suite=suite,
+            cutoff=cutoff,
+        )
+
+    if backend:
+        # redirect_stdout puts sys.stdout back even when the run raises.
+        with open("backend/backend_stdout.txt", "w") as f:
+            with contextlib.redirect_stdout(f):
+                exit_code = _run()
+    else:
+        exit_code = _run()
+
+    sys.exit(exit_code)
+
+
+@cli.command()
+def version() -> None:
+    """Print the version of the benchmark tool."""
+    current_directory = Path(__file__).resolve().parent
+    tool_version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"]["version"]
+    print(f"Benchmark Tool Version {tool_version}")
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/benchmark/agbenchmark/agent_api_interface.py b/benchmark/benchmark/agent_api_interface.py
similarity index 95%
rename from benchmark/agbenchmark/agent_api_interface.py
rename to benchmark/benchmark/agent_api_interface.py
index e9597e63..17dbd730 100644
--- a/benchmark/agbenchmark/agent_api_interface.py
+++ b/benchmark/benchmark/agent_api_interface.py
@@ -5,8 +5,8 @@ from typing import Any, Dict, Optional
 
 from agent_protocol_client import AgentApi, ApiClient, Configuration, TaskRequestBody
 
-from agbenchmark.agent_interface import get_list_of_file_paths
-from agbenchmark.utils.data_types import ChallengeData
+from benchmark.agent_interface import get_list_of_file_paths
+from benchmark.utils.data_types import ChallengeData
 
 
 async def run_api_agent(
diff --git a/benchmark/agbenchmark/agent_interface.py b/benchmark/benchmark/agent_interface.py
similarity index 95%
rename from benchmark/agbenchmark/agent_interface.py
rename to benchmark/benchmark/agent_interface.py
index e3ad7ab6..e7c6ac4d 100644
--- a/benchmark/agbenchmark/agent_interface.py
+++ b/benchmark/benchmark/agent_interface.py
@@ -12,7 +12,7 @@ from typing import Any, List
 
 import psutil
 from dotenv import load_dotenv
-import agbenchmark.start_benchmark
+import benchmark.start_benchmark
 
 load_dotenv()
 
@@ -77,7 +77,7 @@ def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
 def run_agent(task: str, timeout: int) -> None:
     """Calling to get a response"""
 
-    entry_path = "agbenchmark.benchmarks"
+    entry_path = "benchmark.benchmarks"
 
     print(f"Running '{entry_path}' with timeout {timeout}")
 
@@ -87,7 +87,7 @@ def run_agent(task: str, timeout: int) -> None:
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
         universal_newlines=True,
-        cwd=agbenchmark.start_benchmark.HOME_DIRECTORY,
+        cwd=benchmark.start_benchmark.HOME_DIRECTORY,
         bufsize=1,
     )
 
@@ -109,7 +109,7 @@ def get_list_of_file_paths(
 )
-> List[str]: # this file is at agbenchmark\agent_interface.py source_dir = os.path.join( - agbenchmark.start_benchmark.CURRENT_DIRECTORY, + benchmark.start_benchmark.CURRENT_DIRECTORY, "..", challenge_dir_path, artifact_folder_name, diff --git a/benchmark/agbenchmark/app.py b/benchmark/benchmark/app.py similarity index 100% rename from benchmark/agbenchmark/app.py rename to benchmark/benchmark/app.py diff --git a/benchmark/agbenchmark/challenges/CHALLENGE.md b/benchmark/benchmark/challenges/CHALLENGE.md similarity index 100% rename from benchmark/agbenchmark/challenges/CHALLENGE.md rename to benchmark/benchmark/challenges/CHALLENGE.md diff --git a/benchmark/agbenchmark/challenges/README.md b/benchmark/benchmark/challenges/README.md similarity index 100% rename from benchmark/agbenchmark/challenges/README.md rename to benchmark/benchmark/challenges/README.md diff --git a/benchmark/agbenchmark/challenges/SUITES.md b/benchmark/benchmark/challenges/SUITES.md similarity index 100% rename from benchmark/agbenchmark/challenges/SUITES.md rename to benchmark/benchmark/challenges/SUITES.md diff --git a/benchmark/agbenchmark/challenges/__init__.py b/benchmark/benchmark/challenges/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/__init__.py rename to benchmark/benchmark/challenges/__init__.py diff --git a/benchmark/agbenchmark/challenges/abilities/read_file/artifacts_in/file_to_read.txt b/benchmark/benchmark/challenges/abilities/read_file/artifacts_in/file_to_read.txt similarity index 100% rename from benchmark/agbenchmark/challenges/abilities/read_file/artifacts_in/file_to_read.txt rename to benchmark/benchmark/challenges/abilities/read_file/artifacts_in/file_to_read.txt diff --git a/benchmark/agbenchmark/challenges/abilities/read_file/artifacts_out/file_to_check.txt b/benchmark/benchmark/challenges/abilities/read_file/artifacts_out/file_to_check.txt similarity index 100% rename from 
benchmark/agbenchmark/challenges/abilities/read_file/artifacts_out/file_to_check.txt rename to benchmark/benchmark/challenges/abilities/read_file/artifacts_out/file_to_check.txt diff --git a/benchmark/agbenchmark/challenges/abilities/read_file/artifacts_out/output.txt b/benchmark/benchmark/challenges/abilities/read_file/artifacts_out/output.txt similarity index 100% rename from benchmark/agbenchmark/challenges/abilities/read_file/artifacts_out/output.txt rename to benchmark/benchmark/challenges/abilities/read_file/artifacts_out/output.txt diff --git a/benchmark/agbenchmark/challenges/abilities/read_file/data.json b/benchmark/benchmark/challenges/abilities/read_file/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/abilities/read_file/data.json rename to benchmark/benchmark/challenges/abilities/read_file/data.json diff --git a/benchmark/agbenchmark/challenges/abilities/write_file/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/abilities/write_file/artifacts_out/random_file.txt similarity index 100% rename from benchmark/agbenchmark/challenges/abilities/write_file/artifacts_out/random_file.txt rename to benchmark/benchmark/challenges/abilities/write_file/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/abilities/write_file/data.json b/benchmark/benchmark/challenges/abilities/write_file/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/abilities/write_file/data.json rename to benchmark/benchmark/challenges/abilities/write_file/data.json diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/artifacts_in/instructions.txt b/benchmark/benchmark/challenges/alignment/goal_loss/1_distraction/artifacts_in/instructions.txt similarity index 100% rename from benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/artifacts_in/instructions.txt rename to benchmark/benchmark/challenges/alignment/goal_loss/1_distraction/artifacts_in/instructions.txt 
diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/artifacts_out/goal.txt b/benchmark/benchmark/challenges/alignment/goal_loss/1_distraction/artifacts_out/goal.txt similarity index 100% rename from benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/artifacts_out/goal.txt rename to benchmark/benchmark/challenges/alignment/goal_loss/1_distraction/artifacts_out/goal.txt diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/data.json b/benchmark/benchmark/challenges/alignment/goal_loss/1_distraction/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/data.json rename to benchmark/benchmark/challenges/alignment/goal_loss/1_distraction/data.json diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions.txt b/benchmark/benchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions.txt similarity index 100% rename from benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions.txt rename to benchmark/benchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions.txt diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions_2.txt similarity index 100% rename from benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions_2.txt rename to benchmark/benchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions_2.txt diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_out/goal.txt b/benchmark/benchmark/challenges/alignment/goal_loss/2_injection/artifacts_out/goal.txt similarity index 100% rename from benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_out/goal.txt rename to 
benchmark/benchmark/challenges/alignment/goal_loss/2_injection/artifacts_out/goal.txt diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/data.json b/benchmark/benchmark/challenges/alignment/goal_loss/2_injection/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/data.json rename to benchmark/benchmark/challenges/alignment/goal_loss/2_injection/data.json diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/suite.json b/benchmark/benchmark/challenges/alignment/goal_loss/suite.json similarity index 100% rename from benchmark/agbenchmark/challenges/alignment/goal_loss/suite.json rename to benchmark/benchmark/challenges/alignment/goal_loss/suite.json diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/__init__.py rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/sample_code.py rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/test.py rename to 
benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/sample_code.py rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/test.py rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/adapatability/a2_tesla_revenue/artifacts_out/random_file.txt similarity index 100% rename from 
benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/artifacts_out/random_file.txt rename to benchmark/benchmark/challenges/deprecated/adapatability/a2_tesla_revenue/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json b/benchmark/benchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json rename to benchmark/benchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/adapatability/a3_book_price/artifacts_out/random_file.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/artifacts_out/random_file.txt rename to benchmark/benchmark/challenges/deprecated/adapatability/a3_book_price/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json b/benchmark/benchmark/challenges/deprecated/adapatability/a3_book_price/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json rename to benchmark/benchmark/challenges/deprecated/adapatability/a3_book_price/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/__init__.py diff --git 
a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/test.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/test.py 
b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/test.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/data.json b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/data.json rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/test.py similarity index 100% rename from 
benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/test.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/test.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/data.json b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/data.json rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/data.json diff --git 
a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/test.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py 
b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/test.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/data.json b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/data.json rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py similarity index 100% rename from 
benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py rename to 
benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/custom_python/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/custom_python/test.py rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/custom_python/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/data.json b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/data.json rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/suite.json b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/suite.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/suite.json rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/suite.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/sample_code.py 
similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/test.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/test.py rename to 
benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/data.json b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/data.json rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/test.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/__init__.py 
b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/test.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/data.json b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/data.json rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/__init__.py similarity index 100% rename from 
benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/test.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/sample_code.py rename to 
benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/test.py rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/data.json b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/data.json rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/sample_code.py diff --git 
a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/custom_python/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/custom_python/test.py rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/custom_python/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/data.json b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/data.json rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/sample_code.py rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/custom_python/test.py 
b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/custom_python/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/custom_python/test.py rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/custom_python/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/data.json b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/data.json rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/password_generator.py b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/password_generator.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/password_generator.py rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/password_generator.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/custom_python/test.py 
b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/custom_python/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/custom_python/test.py rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/custom_python/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/data.json b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/data.json rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/organize_files.py b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/organize_files.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/organize_files.py rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/organize_files.py diff --git 
a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/custom_python/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/custom_python/test.py rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/custom_python/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/data.json b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/data.json rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/suite.json b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/suite.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/suite.json rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/suite.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/artifacts_out/animal_list.html b/benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/artifacts_out/animal_list.html similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/artifacts_out/animal_list.html rename to benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/artifacts_out/animal_list.html diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/custom_python/test.py 
b/benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/custom_python/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/custom_python/test.py rename to benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/custom_python/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/data.json b/benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/data.json rename to benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/suite.json b/benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/suite.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/suite.json rename to benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/suite.json diff --git a/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/artifacts_out/output.txt b/benchmark/benchmark/challenges/deprecated/content_gen/2_plan/artifacts_out/output.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/artifacts_out/output.txt rename to benchmark/benchmark/challenges/deprecated/content_gen/2_plan/artifacts_out/output.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json b/benchmark/benchmark/challenges/deprecated/content_gen/2_plan/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json rename to benchmark/benchmark/challenges/deprecated/content_gen/2_plan/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py 
b/benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_in/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/test.py rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_in/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_out/test.py similarity 
index 100% rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/test.py rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_out/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json b/benchmark/benchmark/challenges/deprecated/d2.1_guided/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt b/benchmark/benchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt rename to benchmark/benchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt b/benchmark/benchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt rename to benchmark/benchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt b/benchmark/benchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt rename to benchmark/benchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/data.json b/benchmark/benchmark/challenges/deprecated/interface/read_file/data.json similarity index 100% rename from 
benchmark/agbenchmark/challenges/deprecated/interface/read_file/data.json rename to benchmark/benchmark/challenges/deprecated/interface/read_file/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt rename to benchmark/benchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/search/data.json b/benchmark/benchmark/challenges/deprecated/interface/search/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/interface/search/data.json rename to benchmark/benchmark/challenges/deprecated/interface/search/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt rename to benchmark/benchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/write_file/data.json b/benchmark/benchmark/challenges/deprecated/interface/write_file/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/interface/write_file/data.json rename to benchmark/benchmark/challenges/deprecated/interface/write_file/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_1.txt b/benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_1.txt similarity index 100% rename from 
benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_1.txt rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_1.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_2.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_2.txt rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_2.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_3.txt b/benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_3.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_3.txt rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_3.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_4.txt b/benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_4.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_4.txt rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_4.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_5.txt b/benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_5.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_5.txt rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_5.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_out/result.txt 
b/benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_out/result.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_out/result.txt rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_out/result.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/data.json b/benchmark/benchmark/challenges/deprecated/memory/m1_id/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/data.json rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_1.txt b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_1.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_1.txt rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_1.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_2.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_2.txt rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_2.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_3.txt b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_3.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_3.txt rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_3.txt diff --git 
a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_4.txt b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_4.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_4.txt rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_4.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_5.txt b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_5.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_5.txt rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_5.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_out/result.txt b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_out/result.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_out/result.txt rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_out/result.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/data.json b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/data.json rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_1.txt b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_1.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_1.txt rename to 
benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_1.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_2.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_2.txt rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_2.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_3.txt b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_3.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_3.txt rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_3.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_4.txt b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_4.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_4.txt rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_4.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_5.txt b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_5.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_5.txt rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_5.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_out/result.txt b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_out/result.txt similarity index 100% rename from 
benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_out/result.txt rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_out/result.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/data.json b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/data.json rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_1.txt b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_1.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_1.txt rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_1.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_2.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_2.txt rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_2.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_3.txt b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_3.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_3.txt rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_3.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_4.txt 
b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_4.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_4.txt rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_4.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_5.txt b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_5.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_5.txt rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_5.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_out/result.txt b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_out/result.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_out/result.txt rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_out/result.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/data.json b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/data.json rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/retrieval/r1_book_price/artifacts_out/random_file.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/artifacts_out/random_file.txt rename to benchmark/benchmark/challenges/deprecated/retrieval/r1_book_price/artifacts_out/random_file.txt diff --git 
a/benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/data.json b/benchmark/benchmark/challenges/deprecated/retrieval/r1_book_price/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/data.json rename to benchmark/benchmark/challenges/deprecated/retrieval/r1_book_price/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/1_tesla_revenue/data.json b/benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/1_tesla_revenue/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/1_tesla_revenue/data.json rename to benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/1_tesla_revenue/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/2_specific/data.json b/benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/2_specific/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/2_specific/data.json rename to benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/2_specific/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/3_formatting/data.json b/benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/3_formatting/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/3_formatting/data.json rename to benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/3_formatting/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/artifacts_out/random_file.txt similarity index 100% rename from 
benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/artifacts_out/random_file.txt rename to benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/suite.json b/benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/suite.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/suite.json rename to benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/suite.json diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r3/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/retrieval/r3/artifacts_out/random_file.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r3/artifacts_out/random_file.txt rename to benchmark/benchmark/challenges/deprecated/retrieval/r3/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r3/data.json b/benchmark/benchmark/challenges/deprecated/retrieval/r3/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r3/data.json rename to benchmark/benchmark/challenges/deprecated/retrieval/r3/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_in/instructions.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_in/instructions.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_in/instructions.txt rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_in/instructions.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_out/goal.txt 
b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_out/goal.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_out/goal.txt rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_out/goal.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/data.json b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/data.json rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_in/instructions.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_in/instructions.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_in/instructions.txt rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_in/instructions.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_out/goal.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_out/goal.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_out/goal.txt rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_out/goal.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/data.json b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/data.json rename to 
benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions.txt rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions_2.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions_2.txt rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions_2.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_out/goal.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_out/goal.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_out/goal.txt rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_out/goal.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/data.json b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/data.json rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/data.json diff --git 
a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions.txt rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions_2.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions_2.txt rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions_2.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_out/goal.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_out/goal.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_out/goal.txt rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_out/goal.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/data.json b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/data.json rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/1.txt b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/1.txt similarity index 100% rename from 
benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/1.txt rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/1.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/2.txt b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/2.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/2.txt rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/2.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/3.txt b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/3.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/3.txt rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/3.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/4.txt b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/4.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/4.txt rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/4.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/5.txt b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/5.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/5.txt rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/5.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/custom_python/test.py similarity index 100% rename from 
benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/custom_python/test.py rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/custom_python/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data.json b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data.json rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data_draft.json b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/data_draft.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data_draft.json rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/data_draft.json diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/1.txt b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/1.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/1.txt rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/1.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/2.txt b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/2.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/2.txt rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/2.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/3.txt b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/3.txt similarity index 100% rename from 
benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/3.txt rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/3.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/4.txt b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/4.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/4.txt rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/4.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/5.txt b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/5.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/5.txt rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/5.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/6.txt b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/6.txt similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/6.txt rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/6.txt diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/custom_python/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/custom_python/test.py rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/custom_python/test.py diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data.json b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/data.json similarity index 100% rename 
from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data.json rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/data.json diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data_draft.json b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/data_draft.json similarity index 100% rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data_draft.json rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/data_draft.json diff --git a/benchmark/agbenchmark/challenges/library/README.md b/benchmark/benchmark/challenges/library/README.md similarity index 100% rename from benchmark/agbenchmark/challenges/library/README.md rename to benchmark/benchmark/challenges/library/README.md diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_in/__init__.py b/benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_in/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_in/__init__.py rename to benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_in/__init__.py diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_in/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_in/sample_code.py rename to benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_in/sample_code.py diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_in/test.py b/benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_in/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_in/test.py rename to 
benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_in/test.py diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_out/__init__.py b/benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_out/sample_code.py rename to benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_out/sample_code.py diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_out/test.py b/benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_out/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_out/test.py rename to benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_out/test.py diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/data_draft.json b/benchmark/benchmark/challenges/library/ethereum/check_price/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/data_draft.json rename to benchmark/benchmark/challenges/library/ethereum/check_price/data.json diff --git a/benchmark/benchmark/challenges/library/ethereum/check_price/data_draft.json b/benchmark/benchmark/challenges/library/ethereum/check_price/data_draft.json new file mode 100644 index 00000000..fa4a4af9 --- /dev/null +++ b/benchmark/benchmark/challenges/library/ethereum/check_price/data_draft.json @@ -0,0 +1,21 @@ +{ + "name": 
"TestGetEthereumGasPrice", + "category": ["ethereum"], + "task": "Get the price of Ethereum in usd and save its digits to a file named 'output.txt'", + "dependencies": ["TestWriteFile"], + "cutoff": 75, + "ground": { + "answer": "Should be a number within a 50 USD range of the actual price according to coingecko.", + "should_contain": ["Matches"], + "should_not_contain": ["Text or letters"], + "files": ["output.txt"], + "eval": { + "type": "python" + } + }, + "info": { + "difficulty": "basic", + "description": "Getting the price of Ethereum Gas in gwei and saving it as a file", + "side_effects": [] + } +} diff --git a/benchmark/agbenchmark/challenges/optional_categories.json b/benchmark/benchmark/challenges/optional_categories.json similarity index 100% rename from benchmark/agbenchmark/challenges/optional_categories.json rename to benchmark/benchmark/challenges/optional_categories.json diff --git a/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/artifacts_out/__init__.py b/benchmark/benchmark/challenges/verticals/code/1_three_sum/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/1_three_sum/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/verticals/code/1_three_sum/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/verticals/code/1_three_sum/artifacts_out/sample_code.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/1_three_sum/artifacts_out/sample_code.py rename to benchmark/benchmark/challenges/verticals/code/1_three_sum/artifacts_out/sample_code.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/custom_python/test.py b/benchmark/benchmark/challenges/verticals/code/1_three_sum/custom_python/test.py similarity index 100% rename from 
benchmark/agbenchmark/challenges/verticals/code/1_three_sum/custom_python/test.py rename to benchmark/benchmark/challenges/verticals/code/1_three_sum/custom_python/test.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/data.json b/benchmark/benchmark/challenges/verticals/code/1_three_sum/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/1_three_sum/data.json rename to benchmark/benchmark/challenges/verticals/code/1_three_sum/data.json diff --git a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/artifacts_out/__init__.py b/benchmark/benchmark/challenges/verticals/code/2_password_generator/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/2_password_generator/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/verticals/code/2_password_generator/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py b/benchmark/benchmark/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py rename to benchmark/benchmark/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/custom_python/test.py b/benchmark/benchmark/challenges/verticals/code/2_password_generator/custom_python/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/2_password_generator/custom_python/test.py rename to benchmark/benchmark/challenges/verticals/code/2_password_generator/custom_python/test.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json 
b/benchmark/benchmark/challenges/verticals/code/2_password_generator/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json rename to benchmark/benchmark/challenges/verticals/code/2_password_generator/data.json diff --git a/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/artifacts_out/__init__.py b/benchmark/benchmark/challenges/verticals/code/3_file_organizer/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/verticals/code/3_file_organizer/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/artifacts_out/organize_files.py b/benchmark/benchmark/challenges/verticals/code/3_file_organizer/artifacts_out/organize_files.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/artifacts_out/organize_files.py rename to benchmark/benchmark/challenges/verticals/code/3_file_organizer/artifacts_out/organize_files.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/custom_python/test.py b/benchmark/benchmark/challenges/verticals/code/3_file_organizer/custom_python/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/custom_python/test.py rename to benchmark/benchmark/challenges/verticals/code/3_file_organizer/custom_python/test.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/data.json b/benchmark/benchmark/challenges/verticals/code/3_file_organizer/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/data.json rename to benchmark/benchmark/challenges/verticals/code/3_file_organizer/data.json diff --git 
a/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/artifacts_out/__init__.py b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py rename to benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py rename to benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/data.json b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/data.json rename to benchmark/benchmark/challenges/verticals/code/4_url_shortener/data.json diff --git a/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/__init__.py b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/__init__.py 
diff --git a/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py rename to benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py rename to benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/data_draft.json b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/data_draft.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/data_draft.json rename to benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/data_draft.json diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/__init__.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/__init__.py rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/__init__.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/abstract_class.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/abstract_class.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/abstract_class.py rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/abstract_class.py 
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/product_requirements.txt b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/product_requirements.txt similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/product_requirements.txt rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/product_requirements.txt diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/test_positive.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_positive.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/test_positive.py rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_positive.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/user_stories.txt b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/user_stories.txt similarity index 100% rename from 
benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/user_stories.txt rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/user_stories.txt diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/__init__.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/__init__.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/__init__.py rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/abstract_class.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/abstract_class.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/abstract_class.py rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/abstract_class.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py 
b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/test_positive.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_positive.py similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/test_positive.py rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_positive.py diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/data_draft.json b/benchmark/benchmark/challenges/verticals/code/6_battleship/data_draft.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/data_draft.json rename to benchmark/benchmark/challenges/verticals/code/6_battleship/data_draft.json diff --git a/benchmark/agbenchmark/challenges/verticals/scraping/basic/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/verticals/scraping/basic/artifacts_out/random_file.txt similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/scraping/basic/artifacts_out/random_file.txt rename to benchmark/benchmark/challenges/verticals/scraping/basic/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/verticals/scraping/basic/data.json b/benchmark/benchmark/challenges/verticals/scraping/basic/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/scraping/basic/data.json rename to benchmark/benchmark/challenges/verticals/scraping/basic/data.json diff --git a/benchmark/agbenchmark/challenges/verticals/scraping/r1_book_price/artifacts_out/random_file.txt 
b/benchmark/benchmark/challenges/verticals/scraping/r1_book_price/artifacts_out/random_file.txt similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/scraping/r1_book_price/artifacts_out/random_file.txt rename to benchmark/benchmark/challenges/verticals/scraping/r1_book_price/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/verticals/scraping/r1_book_price/data.json b/benchmark/benchmark/challenges/verticals/scraping/r1_book_price/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/scraping/r1_book_price/data.json rename to benchmark/benchmark/challenges/verticals/scraping/r1_book_price/data.json diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/artifacts_in/challenges.txt b/benchmark/benchmark/challenges/verticals/synthesize/1_summary/artifacts_in/challenges.txt similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/artifacts_in/challenges.txt rename to benchmark/benchmark/challenges/verticals/synthesize/1_summary/artifacts_in/challenges.txt diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/artifacts_in/companies.txt b/benchmark/benchmark/challenges/verticals/synthesize/1_summary/artifacts_in/companies.txt similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/artifacts_in/companies.txt rename to benchmark/benchmark/challenges/verticals/synthesize/1_summary/artifacts_in/companies.txt diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/artifacts_out/output.txt b/benchmark/benchmark/challenges/verticals/synthesize/1_summary/artifacts_out/output.txt similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/artifacts_out/output.txt rename to benchmark/benchmark/challenges/verticals/synthesize/1_summary/artifacts_out/output.txt diff --git 
a/benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/data_draft.json b/benchmark/benchmark/challenges/verticals/synthesize/1_summary/data_draft.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/data_draft.json rename to benchmark/benchmark/challenges/verticals/synthesize/1_summary/data_draft.json diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/1_tesla_revenue/data.json b/benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/1_tesla_revenue/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/1_tesla_revenue/data.json rename to benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/1_tesla_revenue/data.json diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/2_specific/data.json b/benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/2_specific/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/2_specific/data.json rename to benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/2_specific/data.json diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/3_formatting/data.json b/benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/3_formatting/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/3_formatting/data.json rename to benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/3_formatting/data.json diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/artifacts_out/random_file.txt similarity index 100% rename from 
benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/artifacts_out/random_file.txt rename to benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/suite.json b/benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/suite.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/suite.json rename to benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/suite.json diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r3/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/verticals/synthesize/r3/artifacts_out/random_file.txt similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/synthesize/r3/artifacts_out/random_file.txt rename to benchmark/benchmark/challenges/verticals/synthesize/r3/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r3/data.json b/benchmark/benchmark/challenges/verticals/synthesize/r3/data.json similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/synthesize/r3/data.json rename to benchmark/benchmark/challenges/verticals/synthesize/r3/data.json diff --git a/benchmark/agbenchmark/conftest.py b/benchmark/benchmark/conftest.py similarity index 78% rename from benchmark/agbenchmark/conftest.py rename to benchmark/benchmark/conftest.py index d5aded19..f1e6ad8b 100644 --- a/benchmark/agbenchmark/conftest.py +++ b/benchmark/benchmark/conftest.py @@ -10,23 +10,37 @@ from typing import Any, Dict, Generator import pytest -import agbenchmark.start_benchmark -from agbenchmark.reports.reports import ( +from benchmark.reports.reports import ( finalize_reports, generate_combined_suite_report, generate_single_call_report, session_finish, ) -from agbenchmark.utils.data_types import SuiteConfig +from 
benchmark.utils.data_types import SuiteConfig, AgentBenchmarkConfig GLOBAL_TIMEOUT = ( 1500 # The tests will stop after 25 minutes so we can send the reports. ) -pytest_plugins = ["agbenchmark.utils.dependencies"] +pytest_plugins = ["benchmark.utils.dependencies"] collect_ignore = ["challenges"] +def load_config_from_request(request: Any) -> AgentBenchmarkConfig: + agent_benchmark_config_path = request.config.getoption("--agent_config_path") + try: + with open(agent_benchmark_config_path, "r") as f: + agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) + agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path + return agent_benchmark_config + except json.JSONDecodeError: + print("Error: benchmark_config.json is not a valid JSON file.") + raise + + + + + def resolve_workspace(workspace: str) -> str: if workspace.startswith("${") and workspace.endswith("}"): # Extract the string inside ${...} @@ -46,16 +60,21 @@ def resolve_workspace(workspace: str) -> str: @pytest.fixture(scope="module") -def config(request: Any) -> None: - print(f"Config file: {agbenchmark.start_benchmark.CONFIG_PATH}") - with open(agbenchmark.start_benchmark.CONFIG_PATH, "r") as f: - config = json.load(f) +def config(request: Any) -> Any: + agent_benchmark_config_path = request.config.getoption("--agent_config_path") + try: + with open(agent_benchmark_config_path, "r") as f: + agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) + agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path + except json.JSONDecodeError: + print("Error: benchmark_config.json is not a valid JSON file.") + raise if isinstance(config["workspace"], str): - config["workspace"] = resolve_workspace(config["workspace"]) + config["workspace"] = resolve_workspace(agent_benchmark_config.workspace) else: # it's a input output dict - config["workspace"]["input"] = resolve_workspace(config["workspace"]["input"]) - config["workspace"]["output"] = 
resolve_workspace(config["workspace"]["output"]) + config["workspace"]["input"] = resolve_workspace(agent_benchmark_config.workspace / "input") + config["workspace"]["output"] = resolve_workspace(agent_benchmark_config.workspace / "output") return config @@ -89,6 +108,7 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]: def pytest_addoption(parser: Any) -> None: + parser.addoption("--agent_config_path", action="store_true", default=False) parser.addoption("--mock", action="store_true", default=False) parser.addoption("--api_mode", action="store_true", default=False) parser.addoption("--host", action="store_true", default=None) @@ -106,7 +126,9 @@ def pytest_addoption(parser: Any) -> None: @pytest.fixture(autouse=True) def check_regression(request: Any) -> None: test_name = request.node.parent.name - data = agbenchmark.start_benchmark.get_regression_data() + agent_benchmark_config = load_config_from_request(request) + + data = json.loads(agent_benchmark_config.get_regression_reports_path()) # Get the true location of the test challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "") @@ -215,7 +237,15 @@ def scores(request: Any) -> None: # this is adding the dependency marker and category markers automatically from the json def pytest_collection_modifyitems(items: Any, config: Any) -> None: - data = agbenchmark.start_benchmark.get_regression_data() + try: + with open(config.getoption('--agent_config_path'), "r") as f: + agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) + agent_benchmark_config.agent_benchmark_config_path = config.getoption('--agent_config_path') + except json.JSONDecodeError: + print("Error: benchmark_config.json is not a valid JSON file.") + raise + + data = json.loads(agent_benchmark_config.get_regression_reports_path()) for item in items: # Assuming item.cls is your test class @@ -252,17 +282,15 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None: 
@pytest.fixture(scope="session", autouse=True) def run_agent(request: Any) -> Any: - with open(agbenchmark.start_benchmark.CONFIG_PATH, "r") as f: - config = json.load(f) - + agent_benchmark_config_path = request.config.getoption("--agent_config_path") if "--api_mode" not in sys.argv: - command = [sys.executable, "-m", "agbenchmark.benchmarks"] + command = [sys.executable, "-m", "benchmark.benchmarks"] process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, - cwd=agbenchmark.start_benchmark.HOME_DIRECTORY, + cwd=agent_benchmark_config_path.entry_path.parent.parent, ) time.sleep(3) yield diff --git a/benchmark/agbenchmark/generate_test.py b/benchmark/benchmark/generate_test.py similarity index 88% rename from benchmark/agbenchmark/generate_test.py rename to benchmark/benchmark/generate_test.py index b4d6b201..1180119b 100644 --- a/benchmark/agbenchmark/generate_test.py +++ b/benchmark/benchmark/generate_test.py @@ -10,10 +10,9 @@ from typing import Any, Callable, Dict, Optional import pytest -import agbenchmark.start_benchmark -from agbenchmark.utils.challenge import Challenge -from agbenchmark.utils.data_types import ChallengeData, SuiteConfig -from agbenchmark.utils.utils import get_test_path +from benchmark.utils.challenge import Challenge +from benchmark.utils.data_types import ChallengeData, SuiteConfig, AgentBenchmarkConfig +from benchmark.utils.utils import get_test_path DATA_CATEGORY = {} @@ -72,7 +71,7 @@ def create_single_test( # Define test class dynamically challenge_class = types.new_class(data["name"], (Challenge,)) - + print(challenge_location) clean_challenge_location = get_test_path(challenge_location) setattr(challenge_class, "CHALLENGE_LOCATION", clean_challenge_location) @@ -132,6 +131,8 @@ def create_single_test( await self.setup_challenge(config, timeout) scores = self.get_scores(config) + request.node.answers = scores["answers"] # store answers in request.node + del scores["answers"] # 
remove answers from scores request.node.scores = scores # store scores in request.node assert 1 in scores["values"] @@ -221,14 +222,34 @@ def create_challenge( def generate_tests() -> None: # sourcery skip: invert-any-all print("Generating tests...") + challenges_path = os.path.join(os.path.dirname(__file__), 'challenges') + json_files = deque( glob.glob( - f"{agbenchmark.start_benchmark.CHALLENGES_PATH}/**/data.json", + f"{challenges_path}/**/data.json", recursive=True, ) ) - regression_tests = agbenchmark.start_benchmark.get_regression_data() + agent_config_path = None + if "--agent-config" in sys.argv: + agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1] + else: + print(sys.argv) + try: + with open(agent_benchmark_config_path, "r") as f: + agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) + agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path + except json.JSONDecodeError: + print("Error: benchmark_config.json is not a valid JSON file.") + raise + + regression_reports_path = agent_benchmark_config.get_regression_reports_path() + if regression_reports_path and os.path.exists(regression_reports_path): + with open(regression_reports_path, 'r') as f: + regression_tests = json.load(f) + else: + regression_tests = {} # for suites to know if the file has already been used to generate the tests # Dynamic class creation @@ -287,7 +308,6 @@ def generate_tests() -> None: # sourcery skip: invert-any-all # ): # # a part of the suite but not the one specified # continue - json_files = create_challenge(data, json_file, suite_config, json_files) if suite_config and not (test_flag or maintain_flag or improve_flag): diff --git a/benchmark/agbenchmark/reports/ReportManager.py b/benchmark/benchmark/reports/ReportManager.py similarity index 74% rename from benchmark/agbenchmark/reports/ReportManager.py rename to benchmark/benchmark/reports/ReportManager.py index 51feca1c..991dd7cf 100644 --- 
a/benchmark/agbenchmark/reports/ReportManager.py +++ b/benchmark/benchmark/reports/ReportManager.py @@ -6,11 +6,12 @@ from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict -from agbenchmark.reports.processing.graphs import save_single_radar_chart -from agbenchmark.reports.processing.process_report import get_agent_category -from agbenchmark.reports.processing.report_types import Report -from agbenchmark.utils.utils import get_highest_success_difficulty - +from benchmark.reports.processing.graphs import save_single_radar_chart +from benchmark.reports.processing.process_report import get_agent_category +from benchmark.reports.processing.report_types import Report +from benchmark.utils.utils import get_highest_success_difficulty +from benchmark.utils.data_types import AgentBenchmarkConfig +from benchmark.__main__ import BENCHMARK_START_TIME class ReportManager: """Abstracts interaction with the regression tests file""" @@ -21,6 +22,11 @@ class ReportManager: self.load() def load(self) -> None: + if not os.path.exists(self.filename): + os.makedirs(os.path.dirname(self.filename), exist_ok=True) + with open(self.filename, 'w') as f: + pass + try: with open(self.filename, "r") as f: file_content = ( @@ -55,26 +61,25 @@ class ReportManager: self.tests = {} self.save() - def end_info_report(self, config: Dict[str, Any]) -> None: - import agbenchmark.start_benchmark + def end_info_report(self, config: AgentBenchmarkConfig) -> None: command = " ".join(sys.argv) self.tests = { "command": command.split(os.sep)[-1], - "benchmark_git_commit_sha": agbenchmark.start_benchmark.BENCHMARK_GIT_COMMIT_SHA, - "agent_git_commit_sha": agbenchmark.start_benchmark.AGENT_GIT_COMMIT_SHA, + "benchmark_git_commit_sha": '---', + "agent_git_commit_sha": '---', "completion_time": datetime.now(timezone.utc).strftime( "%Y-%m-%dT%H:%M:%S+00:00" ), - "benchmark_start_time": agbenchmark.start_benchmark.BENCHMARK_START_TIME, + "benchmark_start_time": 
BENCHMARK_START_TIME, "metrics": { "run_time": str(round(time.time() - self.start_time, 2)) + " seconds", "highest_difficulty": get_highest_success_difficulty(self.tests), "total_cost": self.get_total_costs(), }, "tests": self.tests, - "config": config, + "config": {k: v for k, v in json.loads(config.json()).items() if v is not None}, } converted_data = Report.parse_obj(self.tests) @@ -83,7 +88,8 @@ class ReportManager: save_single_radar_chart( agent_categories, - Path(agbenchmark.start_benchmark.REPORTS_PATH) / "radar_chart.png", + + config.get_reports_path() / "radar_chart.png", ) self.save() diff --git a/benchmark/agbenchmark/reports/processing/gen_combined_chart.py b/benchmark/benchmark/reports/processing/gen_combined_chart.py similarity index 91% rename from benchmark/agbenchmark/reports/processing/gen_combined_chart.py rename to benchmark/benchmark/reports/processing/gen_combined_chart.py index f7140de6..47d4c05e 100644 --- a/benchmark/agbenchmark/reports/processing/gen_combined_chart.py +++ b/benchmark/benchmark/reports/processing/gen_combined_chart.py @@ -2,11 +2,11 @@ import json import os from pathlib import Path -from agbenchmark.reports.processing.graphs import ( +from benchmark.reports.processing.graphs import ( save_combined_bar_chart, save_combined_radar_chart, ) -from agbenchmark.reports.processing.process_report import ( +from benchmark.reports.processing.process_report import ( all_agent_categories, get_reports_data, ) diff --git a/benchmark/agbenchmark/reports/processing/get_files.py b/benchmark/benchmark/reports/processing/get_files.py similarity index 100% rename from benchmark/agbenchmark/reports/processing/get_files.py rename to benchmark/benchmark/reports/processing/get_files.py diff --git a/benchmark/agbenchmark/reports/processing/graphs.py b/benchmark/benchmark/reports/processing/graphs.py similarity index 100% rename from benchmark/agbenchmark/reports/processing/graphs.py rename to benchmark/benchmark/reports/processing/graphs.py diff 
--git a/benchmark/agbenchmark/reports/processing/process_report.py b/benchmark/benchmark/reports/processing/process_report.py similarity index 91% rename from benchmark/agbenchmark/reports/processing/process_report.py rename to benchmark/benchmark/reports/processing/process_report.py index 25f9303a..a94f76fe 100644 --- a/benchmark/agbenchmark/reports/processing/process_report.py +++ b/benchmark/benchmark/reports/processing/process_report.py @@ -3,11 +3,11 @@ import os from pathlib import Path from typing import Any -from agbenchmark.reports.processing.get_files import ( +from benchmark.reports.processing.get_files import ( get_latest_report_from_agent_directories, ) -from agbenchmark.reports.processing.report_types import Report, SuiteTest, Test -from agbenchmark.utils.data_types import STRING_DIFFICULTY_MAP +from benchmark.reports.processing.report_types import Report, SuiteTest, Test +from benchmark.utils.data_types import STRING_DIFFICULTY_MAP def get_reports_data(report_path: str) -> dict[str, Any]: diff --git a/benchmark/agbenchmark/reports/processing/report_types.py b/benchmark/benchmark/reports/processing/report_types.py similarity index 100% rename from benchmark/agbenchmark/reports/processing/report_types.py rename to benchmark/benchmark/reports/processing/report_types.py diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/benchmark/reports/reports.py similarity index 78% rename from benchmark/agbenchmark/reports/reports.py rename to benchmark/benchmark/reports/reports.py index 1e5ba1e6..1cb81fd3 100644 --- a/benchmark/agbenchmark/reports/reports.py +++ b/benchmark/benchmark/reports/reports.py @@ -4,15 +4,48 @@ import sys from pathlib import Path from typing import Any, Dict -import agbenchmark.start_benchmark -from agbenchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig -from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone -from agbenchmark.utils.utils import ( +from benchmark.utils.data_types 
import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig, AgentBenchmarkConfig +from benchmark.utils.get_data_from_helicone import get_data_from_helicone +from benchmark.utils.utils import ( calculate_success_percentage, get_highest_success_difficulty, get_test_path, replace_backslash, ) +from benchmark.reports.ReportManager import ReportManager + + + +def get_agent_benchmark_config() -> AgentBenchmarkConfig: + if "--agent-config" in sys.argv: + agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1] + else: + print(sys.argv) + try: + with open(agent_benchmark_config_path, "r") as f: + agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) + agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path + return agent_benchmark_config + except json.JSONDecodeError: + print("Error: benchmark_config.json is not a valid JSON file.") + raise + +def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]: + agent_benchmark_config = get_agent_benchmark_config() + # tests that consistently pass are considered regression tests + REGRESSION_MANAGER = ReportManager(agent_benchmark_config.get_regression_reports_path()) + + # print(f"Using {REPORTS_PATH} for reports") + # user facing reporting information + INFO_MANAGER = ReportManager(str(agent_benchmark_config.get_reports_path() / "report.json")) + + # internal db step in replacement track pass/fail rate + INTERNAL_INFO_MANAGER = ReportManager(agent_benchmark_config.get_success_rate_path()) + + return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER + + +(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers() def generate_combined_suite_report( @@ -26,6 +59,7 @@ def generate_combined_suite_report( data_paths = suite_config.get_data_paths(root_path / Path(challenge_location)) scores = getattr(item, "scores", {}) + mock = "--mock" in sys.argv # Check if --mock is in sys.argv tests = {} @@ -65,7 +99,7 @@ def 
generate_combined_suite_report( # add dependency fail here if not mock: # don't remove if it's a mock test - agbenchmark.start_benchmark.REGRESSION_MANAGER.remove_test(test_name) + REGRESSION_MANAGER.remove_test(test_name) prev_test_results: list[bool] = get_previous_test_results( test_name, test_info_details @@ -76,7 +110,7 @@ def generate_combined_suite_report( ) tests[test_name] = test_info_details - + info_details: Any = { "data_path": challenge_location, "task": challenge_data["task"], @@ -98,14 +132,14 @@ def get_previous_test_results( agent_tests: dict[str, list[bool]] = {} mock = "--mock" in sys.argv # Check if --mock is in sys.argv - prev_test_results = agbenchmark.start_benchmark.INTERNAL_INFO_MANAGER.tests.get( + prev_test_results = INTERNAL_INFO_MANAGER.tests.get( test_name, [] ) if not mock: # only add if it's an actual test prev_test_results.append(info_details["metrics"]["success"]) - agbenchmark.start_benchmark.INTERNAL_INFO_MANAGER.add_test( + INTERNAL_INFO_MANAGER.add_test( test_name, prev_test_results ) @@ -126,7 +160,7 @@ def update_regression_tests( if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]: # if the last 3 tests were successful, add to the regression tests info_details["is_regression"] = True - agbenchmark.start_benchmark.REGRESSION_MANAGER.add_test(test_name, test_details) + REGRESSION_MANAGER.add_test(test_name, test_details) def generate_single_call_report( @@ -144,6 +178,7 @@ def generate_single_call_report( challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "") test_name = item.nodeid.split("::")[1] item.test_name = test_name + answers = call.node.answers test_details = { "difficulty": difficulty, @@ -162,7 +197,10 @@ def generate_single_call_report( "success": False, "attempted": True, }, + "answers": answers, } + if 'metadata' in challenge_data: + info_details['metadata'] = challenge_data['metadata'] mock = "--mock" in sys.argv # Check if --mock is in sys.argv @@ -170,7 +208,7 @@ def 
generate_single_call_report( info_details["metrics"]["success"] = True else: if not mock: # don't remove if it's a mock test - agbenchmark.start_benchmark.REGRESSION_MANAGER.remove_test(test_name) + REGRESSION_MANAGER.remove_test(test_name) info_details["metrics"]["fail_reason"] = str(call.excinfo.value) if call.excinfo.typename == "Skipped": info_details["metrics"]["attempted"] = False @@ -221,7 +259,7 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None: nested_test_info, nested_test_name ) - agbenchmark.start_benchmark.INFO_MANAGER.add_test(test_name, info_details) + INFO_MANAGER.add_test(test_name, info_details) def update_challenges_already_beaten( @@ -260,11 +298,11 @@ def generate_separate_suite_reports(suite_reports: dict) -> None: } for name in suite_file_datum: - test_data = agbenchmark.start_benchmark.INFO_MANAGER.tests[ + test_data = INFO_MANAGER.tests[ name ] # get the individual test reports data[name] = test_data # this is for calculating highest difficulty - agbenchmark.start_benchmark.INFO_MANAGER.remove_test(name) + INFO_MANAGER.remove_test(name) successes.append(test_data["metrics"]["success"]) run_time += float(test_data["metrics"]["run_time"].split(" ")[0]) @@ -282,7 +320,7 @@ def generate_separate_suite_reports(suite_reports: dict) -> None: Path(next(iter(data.values()))["data_path"]).resolve().parent.parent ) info_details["data_path"] = get_test_path(suite_path) - agbenchmark.start_benchmark.INFO_MANAGER.add_test(prefix, info_details) + INFO_MANAGER.add_test(prefix, info_details) def session_finish(suite_reports: dict) -> None: @@ -290,9 +328,9 @@ def session_finish(suite_reports: dict) -> None: if not flags: generate_separate_suite_reports(suite_reports) - with open(agbenchmark.start_benchmark.CONFIG_PATH, "r") as f: - config = json.load(f) + agent_benchmark_config = get_agent_benchmark_config() - agbenchmark.start_benchmark.INTERNAL_INFO_MANAGER.save() - agbenchmark.start_benchmark.INFO_MANAGER.end_info_report(config) 
- agbenchmark.start_benchmark.REGRESSION_MANAGER.save() + + INTERNAL_INFO_MANAGER.save() + INFO_MANAGER.end_info_report(agent_benchmark_config) + REGRESSION_MANAGER.save() diff --git a/benchmark/agbenchmark/start_benchmark.py b/benchmark/benchmark/start_benchmark.py similarity index 95% rename from benchmark/agbenchmark/start_benchmark.py rename to benchmark/benchmark/start_benchmark.py index a94ae234..77044b5c 100644 --- a/benchmark/agbenchmark/start_benchmark.py +++ b/benchmark/benchmark/start_benchmark.py @@ -9,12 +9,14 @@ from typing import Any, Optional import click import pytest from helicone.lock import HeliconeLockManager +import sys +sys.path.append('/Users/swifty/dev/Auto-GPT/benchmark') from agbenchmark.reports.ReportManager import ReportManager from agbenchmark.utils.utils import ( AGENT_NAME, calculate_dynamic_paths, - get_git_commit_sha, + # get_git_commit_sha, ) CURRENT_DIRECTORY = Path(__file__).resolve().parent @@ -32,8 +34,8 @@ if os.environ.get("HELICONE_API_KEY"): SUCCESS_RATE_PATH, CHALLENGES_PATH, ) = calculate_dynamic_paths() -BENCHMARK_GIT_COMMIT_SHA = get_git_commit_sha(HOME_DIRECTORY / ".." / "..") -AGENT_GIT_COMMIT_SHA = get_git_commit_sha(HOME_DIRECTORY) +BENCHMARK_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY / ".." / "..") +AGENT_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY) # open a file in the challenges/optional_categories with open( Path(__file__).resolve().parent / "challenges" / "optional_categories.json" @@ -332,6 +334,14 @@ def get_regression_data() -> Any: return data +@cli.command() +def version(): + """Print the version of the benchmark tool.""" + import toml + version = toml.load(CURRENT_DIRECTORY / ".." 
/ "pyproject.toml")["tool"]["poetry"]["version"] + print(f"Benchmark Tool Version {version}") + + # def run_from_backend( # maintain: bool = False, @@ -420,5 +430,10 @@ def get_regression_data() -> Any: # return latest_report +<<<<<<< HEAD:benchmark/agbenchmark/start_benchmark.py # if __name__ == "__main__": # start() +======= +if __name__ == "__main__": + cli() +>>>>>>> 61b4afcb (Fixing benchmarks):benchmark/benchmark/start_benchmark.py diff --git a/benchmark/agbenchmark/utils/challenge.py b/benchmark/benchmark/utils/challenge.py similarity index 95% rename from benchmark/agbenchmark/utils/challenge.py rename to benchmark/benchmark/utils/challenge.py index 72849f51..e1d0c4fe 100644 --- a/benchmark/agbenchmark/utils/challenge.py +++ b/benchmark/benchmark/utils/challenge.py @@ -10,16 +10,15 @@ from typing import Any, Dict, List import openai import pytest -import agbenchmark.start_benchmark -from agbenchmark.agent_api_interface import run_api_agent -from agbenchmark.utils.data_types import ChallengeData, Ground -from agbenchmark.utils.prompts import ( +from benchmark.agent_api_interface import run_api_agent +from benchmark.utils.data_types import ChallengeData, Ground +from benchmark.utils.prompts import ( END_PROMPT, FEW_SHOT_EXAMPLES, PROMPT_MAP, SCORING_MAP, ) -from agbenchmark.utils.utils import agent_eligibible_for_optional_categories +from benchmark.utils.utils import agent_eligibible_for_optional_categories class Challenge(ABC): @@ -48,7 +47,7 @@ class Challenge(ABC): return self.data.dependencies async def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None: - from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent + from benchmark.agent_interface import copy_artifacts_into_workspace, run_agent artifact_paths = [ self.ARTIFACTS_LOCATION, @@ -210,15 +209,16 @@ class Challenge(ABC): scores = [] scores_dict: Any = {} percentage = None - + answers = {} try: if self.data.task == "" and "--mock" in sys.argv: scores = [1.0] + 
answers = {"mock": "This is a mock answer"} elif isinstance(self.data.ground, Ground): files_contents = self.get_artifacts_out( config["workspace"], self.data.ground ) - + answers = {"answer": files_contents} for file_content in files_contents: score = self.scoring(config, file_content, self.data.ground) print("\033[1;32mYour score is:\033[0m", score) @@ -240,6 +240,7 @@ class Challenge(ABC): for ground_key in self.data.ground: ground = self.data.ground[ground_key] files_contents = self.get_artifacts_out(config["workspace"], ground) + answers[ground_key] = files_contents for file_content in files_contents: score = self.scoring(config, file_content, ground) @@ -289,6 +290,7 @@ class Challenge(ABC): "values": scores, "scores_obj": scores_dict, "percentage": percentage, + "answers": answers, } self.scores[self.__class__.__name__] = scores_data @@ -306,7 +308,7 @@ class Challenge(ABC): challenge_category = self.data.category categories = [ category - for category in agbenchmark.start_benchmark.OPTIONAL_CATEGORIES + for category in benchmark.start_benchmark.OPTIONAL_CATEGORIES if category in challenge_category ] if not agent_eligibible_for_optional_categories( diff --git a/benchmark/agbenchmark/utils/data_types.py b/benchmark/benchmark/utils/data_types.py similarity index 73% rename from benchmark/agbenchmark/utils/data_types.py rename to benchmark/benchmark/utils/data_types.py index d40682a1..e5d9e987 100644 --- a/benchmark/agbenchmark/utils/data_types.py +++ b/benchmark/benchmark/utils/data_types.py @@ -3,10 +3,9 @@ import json from enum import Enum from pathlib import Path from typing import Any, Dict, List, Optional - +import sys from pydantic import BaseModel, root_validator, validator - - +from datetime import datetime, timezone class DifficultyLevel(Enum): interface = "interface" basic = "basic" @@ -30,6 +29,77 @@ DIFFICULTY_MAP = { STRING_DIFFICULTY_MAP = {e.value: DIFFICULTY_MAP[e] for e in DifficultyLevel} +def calculate_info_test_path(base_path: Path) -> 
Path: + """ + Calculates the path to the directory where the test report will be saved. + """ + # Ensure the reports path exists + base_path.mkdir(parents=True, exist_ok=True) + + # Get current UTC date-time stamp + date_stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") + + # Default run name + run_name = "full_run" + + # Map command-line arguments to their respective labels + arg_labels = { + "--test": None, + "--suite": None, + "--category": None, + "--maintain": "maintain", + "--improve": "improve", + "--explore": "explore", + } + + # Identify the relevant command-line argument + for arg, label in arg_labels.items(): + if arg in sys.argv: + test_arg = sys.argv[sys.argv.index(arg) + 1] if label is None else None + run_name = arg.strip("--") + if test_arg: + run_name = f"{run_name}_{test_arg}" + break + + # Create the full new directory path with ISO standard UTC date-time stamp + report_path = base_path / f"{date_stamp}_{run_name}" + + # Ensure the new directory is created + report_path.mkdir(exist_ok=True) + + return report_path + +class AgentBenchmarkConfig(BaseModel): + """ + This class represents the configuration for the Agent Benchmark. + It includes the following attributes: + - entry_path: The path to the file that, when run, starts the agent configured for benchmarking, realtive location from the config_file. + - workspace: The path to the workspace where the benchmark will be run. + - reports_folder: The path to the folder where the benchmark reports will be stored. + - api_mode: A boolean indicating whether the benchmark is run in API mode. + - host: The host where the benchmark is run. + """ + agent_benchmark_config_path: Path | None = None + entry_path: Path + workspace: Path + reports_folder: Path | None = None + api_mode: bool = False + host: str | None + + def get_reports_location(self) -> Path: + if not self.reports_folder: + self.reports_folder = (self.agent_benchmark_config_path / self.entry_path.parent / ".." 
/ "reports").resolve() + return self.reports_folder + + def get_reports_path(self) -> Path: + return calculate_info_test_path(self.get_reports_location()) + + def get_regression_reports_path(self) -> Path: + + return self.get_reports_location() / "regression_tests.json" + + def get_success_rate_path(self) -> Path: + return self.get_reports_location() / "success_rate.json" class Info(BaseModel): difficulty: DifficultyLevel @@ -100,6 +170,7 @@ class ChallengeData(BaseModel): cutoff: int ground: Ground | Dict[str, Ground] info: Info | Dict[str, Info] + metadata: Optional[Dict[str, Any]] = None def serialize(self, path: str) -> None: with open(path, "w") as file: diff --git a/benchmark/agbenchmark/utils/dependencies/__init__.py b/benchmark/benchmark/utils/dependencies/__init__.py similarity index 99% rename from benchmark/agbenchmark/utils/dependencies/__init__.py rename to benchmark/benchmark/utils/dependencies/__init__.py index bf2dba18..596c4760 100644 --- a/benchmark/agbenchmark/utils/dependencies/__init__.py +++ b/benchmark/benchmark/utils/dependencies/__init__.py @@ -67,7 +67,6 @@ def pytest_addoption(parser: Parser) -> None: for action in group.options: current_options += action._short_opts + action._long_opts - print(current_options) group = parser.getgroup("depends") diff --git a/benchmark/agbenchmark/utils/dependencies/constants.py b/benchmark/benchmark/utils/dependencies/constants.py similarity index 100% rename from benchmark/agbenchmark/utils/dependencies/constants.py rename to benchmark/benchmark/utils/dependencies/constants.py diff --git a/benchmark/agbenchmark/utils/dependencies/graphs.py b/benchmark/benchmark/utils/dependencies/graphs.py similarity index 98% rename from benchmark/agbenchmark/utils/dependencies/graphs.py rename to benchmark/benchmark/utils/dependencies/graphs.py index cf54f32b..3cb85af2 100644 --- a/benchmark/agbenchmark/utils/dependencies/graphs.py +++ b/benchmark/benchmark/utils/dependencies/graphs.py @@ -9,8 +9,8 @@ import networkx 
as nx import numpy as np from pyvis.network import Network -from agbenchmark.generate_test import DATA_CATEGORY -from agbenchmark.utils.utils import find_absolute_benchmark_path +from benchmark.generate_test import DATA_CATEGORY +from benchmark.utils.utils import find_absolute_benchmark_path def bezier_curve( diff --git a/benchmark/agbenchmark/utils/dependencies/main.py b/benchmark/benchmark/utils/dependencies/main.py similarity index 100% rename from benchmark/agbenchmark/utils/dependencies/main.py rename to benchmark/benchmark/utils/dependencies/main.py diff --git a/benchmark/agbenchmark/utils/dependencies/util.py b/benchmark/benchmark/utils/dependencies/util.py similarity index 100% rename from benchmark/agbenchmark/utils/dependencies/util.py rename to benchmark/benchmark/utils/dependencies/util.py diff --git a/benchmark/agbenchmark/utils/get_data_from_helicone.py b/benchmark/benchmark/utils/get_data_from_helicone.py similarity index 92% rename from benchmark/agbenchmark/utils/get_data_from_helicone.py rename to benchmark/benchmark/utils/get_data_from_helicone.py index 4e18b48e..0d04ed1d 100644 --- a/benchmark/agbenchmark/utils/get_data_from_helicone.py +++ b/benchmark/benchmark/utils/get_data_from_helicone.py @@ -4,8 +4,7 @@ from typing import Optional import requests -import agbenchmark.start_benchmark -from agbenchmark.agent_interface import HELICONE_GRAPHQL_LOGS +from benchmark.agent_interface import HELICONE_GRAPHQL_LOGS def get_data_from_helicone(challenge: str) -> Optional[float]: @@ -31,7 +30,7 @@ query ExampleQuery($properties: [PropertyFilter!]){ "name": "agent", }, { - "value": {"equals": agbenchmark.start_benchmark.BENCHMARK_START_TIME}, + "value": {"equals": benchmark.start_benchmark.BENCHMARK_START_TIME}, "name": "benchmark_start_time", }, {"value": {"equals": challenge}, "name": "challenge"}, diff --git a/benchmark/agbenchmark/utils/prompts.py b/benchmark/benchmark/utils/prompts.py similarity index 100% rename from 
benchmark/agbenchmark/utils/prompts.py rename to benchmark/benchmark/utils/prompts.py diff --git a/benchmark/agbenchmark/utils/utils.py b/benchmark/benchmark/utils/utils.py similarity index 80% rename from benchmark/agbenchmark/utils/utils.py rename to benchmark/benchmark/utils/utils.py index 56996e24..ebfdb030 100644 --- a/benchmark/agbenchmark/utils/utils.py +++ b/benchmark/benchmark/utils/utils.py @@ -10,52 +10,13 @@ import git from dotenv import load_dotenv load_dotenv() - -from agbenchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel +from benchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel AGENT_NAME = os.getenv("AGENT_NAME") REPORT_LOCATION = os.getenv("REPORT_LOCATION", None) -def calculate_info_test_path(base_path: Path) -> str: - """ - Calculates the path to the directory where the test report will be saved. - """ - # Ensure the reports path exists - base_path.mkdir(parents=True, exist_ok=True) - # Get current UTC date-time stamp - date_stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") - - # Default run name - run_name = "full_run" - - # Map command-line arguments to their respective labels - arg_labels = { - "--test": None, - "--suite": None, - "--category": None, - "--maintain": "maintain", - "--improve": "improve", - "--explore": "explore", - } - - # Identify the relevant command-line argument - for arg, label in arg_labels.items(): - if arg in sys.argv: - test_arg = sys.argv[sys.argv.index(arg) + 1] if label is None else None - run_name = arg.strip("--") - if test_arg: - run_name = f"{run_name}_{test_arg}" - break - - # Create the full new directory path with ISO standard UTC date-time stamp - report_path = base_path / f"{date_stamp}_{run_name}" - - # Ensure the new directory is created - report_path.mkdir(exist_ok=True) - - return str(report_path) def replace_backslash(value: Any) -> Any: @@ -88,7 +49,7 @@ def get_test_path(json_file: str | Path) -> str: # Find the index of "agbenchmark" in the path parts try: - 
agbenchmark_index = json_file.parts.index("agbenchmark") + agbenchmark_index = json_file.parts.index("benchmark") except ValueError: raise ValueError("Invalid challenge location.") @@ -249,19 +210,19 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str, str, str]: ) -def get_git_commit_sha(directory: Path) -> Optional[str]: - try: - repo = git.Repo(directory) - remote_url = repo.remotes.origin.url - if remote_url.endswith(".git"): - remote_url = remote_url[:-4] - git_commit_sha = f"{remote_url}/tree/{repo.head.commit.hexsha}" +# def get_git_commit_sha(directory: Path) -> Optional[str]: +# try: +# repo = git.Repo(directory) +# remote_url = repo.remotes.origin.url +# if remote_url.endswith(".git"): +# remote_url = remote_url[:-4] +# git_commit_sha = f"{remote_url}/tree/{repo.head.commit.hexsha}" - # print(f"GIT_COMMIT_SHA: {git_commit_sha}") - return git_commit_sha - except Exception: - # print(f"{directory} is not a git repository!") - return None +# # print(f"GIT_COMMIT_SHA: {git_commit_sha}") +# return git_commit_sha +# except Exception: +# # print(f"{directory} is not a git repository!") +# return None def agent_eligibible_for_optional_categories( diff --git a/benchmark/pyproject.toml b/benchmark/pyproject.toml index 88740f4b..7fd2efcf 100644 --- a/benchmark/pyproject.toml +++ b/benchmark/pyproject.toml @@ -70,8 +70,6 @@ filterwarnings = [ "ignore::matplotlib.MatplotlibDeprecationWarning" ] -[tool.poetry.scripts] -agbenchmark = "agbenchmark.start_benchmark:cli" [tool.black] line-length = 88 diff --git a/benchmark/run.sh b/benchmark/run.sh old mode 100644 new mode 100755