From c73e90c4e6ceef0d5b6e69ba8af2a30bb14dddc9 Mon Sep 17 00:00:00 2001
From: SwiftyOS <craigswift13@gmail.com>
Date: Mon, 11 Sep 2023 17:23:38 +0200
Subject: [PATCH] Fixing benchmarks

---
 .../{agbenchmark => benchmark}/README.md      |   0
 benchmark/benchmark/__init__.py               |   5 +
 benchmark/benchmark/__main__.py               | 274 ++++++++++++++++++
 .../agent_api_interface.py                    |   4 +-
 .../agent_interface.py                        |   7 +-
 benchmark/{agbenchmark => benchmark}/app.py   |   0
 .../challenges/CHALLENGE.md                   |   0
 .../challenges/README.md                      |   0
 .../challenges/SUITES.md                      |   0
 .../challenges/__init__.py                    |   0
 .../read_file/artifacts_in/file_to_read.txt   |   0
 .../read_file/artifacts_out/file_to_check.txt |   0
 .../read_file/artifacts_out/output.txt        |   0
 .../challenges/abilities/read_file/data.json  |   0
 .../write_file/artifacts_out/random_file.txt  |   0
 .../challenges/abilities/write_file/data.json |   0
 .../artifacts_in/instructions.txt             |   0
 .../1_distraction/artifacts_out/goal.txt      |   0
 .../goal_loss/1_distraction/data.json         |   0
 .../2_injection/artifacts_in/instructions.txt |   0
 .../artifacts_in/instructions_2.txt           |   0
 .../2_injection/artifacts_out/goal.txt        |   0
 .../alignment/goal_loss/2_injection/data.json |   0
 .../challenges/alignment/goal_loss/suite.json |   0
 .../a1_debug/artifacts_in/__init__.py         |   0
 .../a1_debug/artifacts_in/sample_code.py      |   0
 .../a1_debug/artifacts_in/test.py             |   0
 .../a1_debug/artifacts_out/__init__.py        |   0
 .../a1_debug/artifacts_out/sample_code.py     |   0
 .../a1_debug/artifacts_out/test.py            |   0
 .../adapatability/a1_debug/data.json          |   0
 .../artifacts_out/random_file.txt             |   0
 .../adapatability/a2_tesla_revenue/data.json  |   0
 .../artifacts_out/random_file.txt             |   0
 .../adapatability/a3_book_price/data.json     |   0
 .../1_return/artifacts_in/__init__.py         |   0
 .../1_return/artifacts_in/sample_code.py      |   0
 .../1_return/artifacts_in/test.py             |   0
 .../1_return/artifacts_out/__init__.py        |   0
 .../1_return/artifacts_out/sample_code.py     |   0
 .../1_return/artifacts_out/test.py            |   0
 .../c1_writing_suite_1/1_return/data.json     |   0
 .../2_write/artifacts_in/__init__.py          |   0
 .../2_write/artifacts_in/sample_code.py       |   0
 .../2_write/artifacts_in/test.py              |   0
 .../2_write/artifacts_out/__init__.py         |   0
 .../2_write/artifacts_out/sample_code.py      |   0
 .../2_write/artifacts_out/test.py             |   0
 .../code/c1_writing_suite_1/2_write/data.json |   0
 .../3_modify/artifacts_in/__init__.py         |   0
 .../3_modify/artifacts_in/sample_code.py      |   0
 .../3_modify/artifacts_in/test.py             |   0
 .../3_modify/artifacts_out/__init__.py        |   0
 .../3_modify/artifacts_out/sample_code.py     |   0
 .../3_modify/artifacts_out/test.py            |   0
 .../c1_writing_suite_1/3_modify/data.json     |   0
 .../4_tests/artifacts_in/__init__.py          |   0
 .../4_tests/artifacts_in/sample_code.py       |   0
 .../4_tests/artifacts_in/testfile.py          |   0
 .../4_tests/artifacts_out/__init__.py         |   0
 .../4_tests/artifacts_out/sample_code.py      |   0
 .../4_tests/artifacts_out/testfile.py         |   0
 .../4_tests/custom_python/test.py             |   0
 .../code/c1_writing_suite_1/4_tests/data.json |   0
 .../code/c1_writing_suite_1/suite.json        |   0
 .../d2.1_guided/artifacts_in/__init__.py      |   0
 .../d2.1_guided/artifacts_in/sample_code.py   |   0
 .../d2.1_guided/artifacts_in/test.py          |   0
 .../d2.1_guided/artifacts_out/__init__.py     |   0
 .../d2.1_guided/artifacts_out/sample_code.py  |   0
 .../d2.1_guided/artifacts_out/test.py         |   0
 .../code/c2_debug_suite/d2.1_guided/data.json |   0
 .../d2.2_vague/artifacts_in/__init__.py       |   0
 .../d2.2_vague/artifacts_in/sample_code.py    |   0
 .../d2.2_vague/artifacts_in/test.py           |   0
 .../d2.2_vague/artifacts_out/__init__.py      |   0
 .../d2.2_vague/artifacts_out/sample_code.py   |   0
 .../d2.2_vague/artifacts_out/test.py          |   0
 .../code/c2_debug_suite/d2.2_vague/data.json  |   0
 .../d2.3_import/artifacts_in/__init__.py      |   0
 .../d2.3_import/artifacts_in/sample_code.py   |   0
 .../d2.3_import/artifacts_in/test.py          |   0
 .../d2.3_import/artifacts_out/__init__.py     |   0
 .../d2.3_import/artifacts_out/sample_code.py  |   0
 .../d2.3_import/artifacts_out/test.py         |   0
 .../code/c2_debug_suite/d2.3_import/data.json |   0
 .../d3.1_three_sum/artifacts_out/__init__.py  |   0
 .../artifacts_out/sample_code.py              |   0
 .../d3.1_three_sum/custom_python/test.py      |   0
 .../d3.1_three_sum/data.json                  |   0
 .../d3_two_sum/artifacts_out/__init__.py      |   0
 .../d3_two_sum/artifacts_out/sample_code.py   |   0
 .../d3_two_sum/custom_python/test.py          |   0
 .../c3_writing_suite_2/d3_two_sum/data.json   |   0
 .../artifacts_out/__init__.py                 |   0
 .../artifacts_out/password_generator.py       |   0
 .../custom_python/test.py                     |   0
 .../1_password_generator/data.json            |   0
 .../artifacts_out/__init__.py                 |   0
 .../artifacts_out/organize_files.py           |   0
 .../2_file_organizer/custom_python/test.py    |   0
 .../2_file_organizer/data.json                |   0
 .../code/c4_writing_cli_suite_3/suite.json    |   0
 .../artifacts_out/animal_list.html            |   0
 .../1_list_animals/custom_python/test.py      |   0
 .../c5_web_app_suite/1_list_animals/data.json |   0
 .../code/c5_web_app_suite/suite.json          |   0
 .../2_plan/artifacts_out/output.txt           |   0
 .../deprecated/content_gen/2_plan/data.json   |   0
 .../d2.1_guided/artifacts_in/__init__.py      |   0
 .../d2.1_guided/artifacts_in/sample_code.py   |   0
 .../d2.1_guided/artifacts_in/test.py          |   0
 .../d2.1_guided/artifacts_out/__init__.py     |   0
 .../d2.1_guided/artifacts_out/sample_code.py  |   0
 .../d2.1_guided/artifacts_out/test.py         |   0
 .../deprecated/d2.1_guided/data.json          |   0
 .../read_file/artifacts_in/file_to_read.txt   |   0
 .../read_file/artifacts_out/file_to_check.txt |   0
 .../read_file/artifacts_out/output.txt        |   0
 .../deprecated/interface/read_file/data.json  |   0
 .../search/artifacts_out/random_file.txt      |   0
 .../deprecated/interface/search/data.json     |   0
 .../write_file/artifacts_out/random_file.txt  |   0
 .../deprecated/interface/write_file/data.json |   0
 .../m1_id/artifacts_in/instructions_1.txt     |   0
 .../m1_id/artifacts_in/instructions_2.txt     |   0
 .../m1_id/artifacts_in/instructions_3.txt     |   0
 .../m1_id/artifacts_in/instructions_4.txt     |   0
 .../m1_id/artifacts_in/instructions_5.txt     |   0
 .../memory/m1_id/artifacts_out/result.txt     |   0
 .../deprecated/memory/m1_id/data.json         |   0
 .../artifacts_in/instructions_1.txt           |   0
 .../artifacts_in/instructions_2.txt           |   0
 .../artifacts_in/instructions_3.txt           |   0
 .../artifacts_in/instructions_4.txt           |   0
 .../artifacts_in/instructions_5.txt           |   0
 .../m2_multiple/artifacts_out/result.txt      |   0
 .../deprecated/memory/m2_multiple/data.json   |   0
 .../m3_noise/artifacts_in/instructions_1.txt  |   0
 .../m3_noise/artifacts_in/instructions_2.txt  |   0
 .../m3_noise/artifacts_in/instructions_3.txt  |   0
 .../m3_noise/artifacts_in/instructions_4.txt  |   0
 .../m3_noise/artifacts_in/instructions_5.txt  |   0
 .../memory/m3_noise/artifacts_out/result.txt  |   0
 .../deprecated/memory/m3_noise/data.json      |   0
 .../artifacts_in/instructions_1.txt           |   0
 .../artifacts_in/instructions_2.txt           |   0
 .../artifacts_in/instructions_3.txt           |   0
 .../artifacts_in/instructions_4.txt           |   0
 .../artifacts_in/instructions_5.txt           |   0
 .../m4_phrases/artifacts_out/result.txt       |   0
 .../deprecated/memory/m4_phrases/data.json    |   0
 .../artifacts_out/random_file.txt             |   0
 .../retrieval/r1_book_price/data.json         |   0
 .../1_tesla_revenue/data.json                 |   0
 .../r2_search_suite_1/2_specific/data.json    |   0
 .../r2_search_suite_1/3_formatting/data.json  |   0
 .../artifacts_out/random_file.txt             |   0
 .../retrieval/r2_search_suite_1/suite.json    |   0
 .../r3/artifacts_out/random_file.txt          |   0
 .../deprecated/retrieval/r3/data.json         |   0
 .../1_simple/artifacts_in/instructions.txt    |   0
 .../1_simple/artifacts_out/goal.txt           |   0
 .../safety/s1_loss_suite_1/1_simple/data.json |   0
 .../2_medium/artifacts_in/instructions.txt    |   0
 .../2_medium/artifacts_out/goal.txt           |   0
 .../safety/s1_loss_suite_1/2_medium/data.json |   0
 .../3_advanced/artifacts_in/instructions.txt  |   0
 .../artifacts_in/instructions_2.txt           |   0
 .../3_advanced/artifacts_out/goal.txt         |   0
 .../s1_loss_suite_1/3_advanced/data.json      |   0
 .../4_hard/artifacts_in/instructions.txt      |   0
 .../4_hard/artifacts_in/instructions_2.txt    |   0
 .../4_hard/artifacts_out/goal.txt             |   0
 .../safety/s1_loss_suite_1/4_hard/data.json   |   0
 .../safety/s2_divergence/artifacts_out/1.txt  |   0
 .../safety/s2_divergence/artifacts_out/2.txt  |   0
 .../safety/s2_divergence/artifacts_out/3.txt  |   0
 .../safety/s2_divergence/artifacts_out/4.txt  |   0
 .../safety/s2_divergence/artifacts_out/5.txt  |   0
 .../s2_divergence/custom_python/test.py       |   0
 .../deprecated/safety/s2_divergence/data.json |   0
 .../safety/s2_divergence/data_draft.json      |   0
 .../s3_instructions/artifacts_out/1.txt       |   0
 .../s3_instructions/artifacts_out/2.txt       |   0
 .../s3_instructions/artifacts_out/3.txt       |   0
 .../s3_instructions/artifacts_out/4.txt       |   0
 .../s3_instructions/artifacts_out/5.txt       |   0
 .../s3_instructions/artifacts_out/6.txt       |   0
 .../s3_instructions/custom_python/test.py     |   0
 .../safety/s3_instructions/data.json          |   0
 .../safety/s3_instructions/data_draft.json    |   0
 .../challenges/library/README.md              |   0
 .../check_price/artifacts_in/__init__.py      |   0
 .../check_price/artifacts_in/sample_code.py   |   0
 .../ethereum/check_price/artifacts_in/test.py |   0
 .../check_price/artifacts_out/__init__.py     |   0
 .../check_price/artifacts_out/sample_code.py  |   0
 .../check_price/artifacts_out/test.py         |   0
 .../library/ethereum/check_price/data.json}   |   0
 .../ethereum/check_price/data_draft.json      |  21 ++
 .../challenges/optional_categories.json       |   0
 .../1_three_sum/artifacts_out/__init__.py     |   0
 .../1_three_sum/artifacts_out/sample_code.py  |   0
 .../code/1_three_sum/custom_python/test.py    |   0
 .../verticals/code/1_three_sum/data.json      |   0
 .../artifacts_out/__init__.py                 |   0
 .../artifacts_out/password_generator.py       |   0
 .../custom_python/test.py                     |   0
 .../code/2_password_generator/data.json       |   0
 .../artifacts_out/__init__.py                 |   0
 .../artifacts_out/organize_files.py           |   0
 .../3_file_organizer/custom_python/test.py    |   0
 .../verticals/code/3_file_organizer/data.json |   0
 .../4_url_shortener/artifacts_out/__init__.py |   0
 .../4_url_shortener/artifacts_out/test.py     |   0
 .../artifacts_out/url_shortener.py            |   0
 .../verticals/code/4_url_shortener/data.json  |   0
 .../5_tic_tac_toe/artifacts_out/__init__.py   |   0
 .../artifacts_out/tic_tac_toe.py              |   0
 .../code/5_tic_tac_toe/custom_python/test.py  |   0
 .../code/5_tic_tac_toe/data_draft.json        |   0
 .../6_battleship/artifacts_in/__init__.py     |   0
 .../artifacts_in/abstract_class.py            |   0
 .../6_battleship/artifacts_in/conftest.py     |   0
 .../artifacts_in/product_requirements.txt     |   0
 .../artifacts_in/test_negative.py             |   0
 .../artifacts_in/test_positive.py             |   0
 .../artifacts_in/user_stories.txt             |   0
 .../6_battleship/artifacts_out/__init__.py    |   0
 .../artifacts_out/abstract_class.py           |   0
 .../6_battleship/artifacts_out/battleship.py  |   0
 .../6_battleship/artifacts_out/conftest.py    |   0
 .../artifacts_out/test_negative.py            |   0
 .../artifacts_out/test_positive.py            |   0
 .../code/6_battleship/data_draft.json         |   0
 .../basic/artifacts_out/random_file.txt       |   0
 .../verticals/scraping/basic/data.json        |   0
 .../artifacts_out/random_file.txt             |   0
 .../scraping/r1_book_price/data.json          |   0
 .../1_summary/artifacts_in/challenges.txt     |   0
 .../1_summary/artifacts_in/companies.txt      |   0
 .../1_summary/artifacts_out/output.txt        |   0
 .../synthesize/1_summary/data_draft.json      |   0
 .../1_tesla_revenue/data.json                 |   0
 .../r2_search_suite_1/2_specific/data.json    |   0
 .../r2_search_suite_1/3_formatting/data.json  |   0
 .../artifacts_out/random_file.txt             |   0
 .../synthesize/r2_search_suite_1/suite.json   |   0
 .../r3/artifacts_out/random_file.txt          |   0
 .../verticals/synthesize/r3/data.json         |   0
 .../{agbenchmark => benchmark}/conftest.py    |  64 ++--
 .../generate_test.py                          |  36 ++-
 .../reports/ReportManager.py                  |  30 +-
 .../reports/processing/gen_combined_chart.py  |   4 +-
 .../reports/processing/get_files.py           |   0
 .../reports/processing/graphs.py              |   0
 .../reports/processing/process_report.py      |   6 +-
 .../reports/processing/report_types.py        |   0
 .../reports/reports.py                        |  76 +++--
 .../start_benchmark.py                        |  21 +-
 .../utils/challenge.py                        |  20 +-
 .../utils/data_types.py                       |  77 ++++-
 .../utils/dependencies/__init__.py            |   1 -
 .../utils/dependencies/constants.py           |   0
 .../utils/dependencies/graphs.py              |   4 +-
 .../utils/dependencies/main.py                |   0
 .../utils/dependencies/util.py                |   0
 .../utils/get_data_from_helicone.py           |   5 +-
 .../utils/prompts.py                          |   0
 .../{agbenchmark => benchmark}/utils/utils.py |  67 +----
 benchmark/pyproject.toml                      |   2 -
 benchmark/run.sh                              |   0
 273 files changed, 580 insertions(+), 144 deletions(-)
 rename benchmark/{agbenchmark => benchmark}/README.md (100%)
 create mode 100644 benchmark/benchmark/__init__.py
 create mode 100644 benchmark/benchmark/__main__.py
 rename benchmark/{agbenchmark => benchmark}/agent_api_interface.py (95%)
 rename benchmark/{agbenchmark => benchmark}/agent_interface.py (95%)
 rename benchmark/{agbenchmark => benchmark}/app.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/CHALLENGE.md (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/README.md (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/SUITES.md (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/abilities/read_file/artifacts_in/file_to_read.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/abilities/read_file/artifacts_out/file_to_check.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/abilities/read_file/artifacts_out/output.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/abilities/read_file/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/abilities/write_file/artifacts_out/random_file.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/abilities/write_file/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/1_distraction/artifacts_in/instructions.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/1_distraction/artifacts_out/goal.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/1_distraction/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions_2.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/2_injection/artifacts_out/goal.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/2_injection/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/alignment/goal_loss/suite.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/artifacts_in/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/artifacts_in/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/artifacts_in/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/artifacts_out/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a1_debug/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a2_tesla_revenue/artifacts_out/random_file.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a2_tesla_revenue/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a3_book_price/artifacts_out/random_file.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/adapatability/a3_book_price/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/1_return/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/2_write/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/3_modify/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/custom_python/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/4_tests/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c1_writing_suite_1/suite.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.1_guided/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.2_vague/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c2_debug_suite/d2.3_import/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/custom_python/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/custom_python/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/password_generator.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/custom_python/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/organize_files.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/custom_python/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c4_writing_cli_suite_3/suite.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c5_web_app_suite/1_list_animals/artifacts_out/animal_list.html (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c5_web_app_suite/1_list_animals/custom_python/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c5_web_app_suite/1_list_animals/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/code/c5_web_app_suite/suite.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/content_gen/2_plan/artifacts_out/output.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/content_gen/2_plan/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/artifacts_in/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/artifacts_out/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/d2.1_guided/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/read_file/artifacts_out/output.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/read_file/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/search/artifacts_out/random_file.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/search/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/interface/write_file/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/artifacts_in/instructions_1.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/artifacts_in/instructions_2.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/artifacts_in/instructions_3.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/artifacts_in/instructions_4.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/artifacts_in/instructions_5.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/artifacts_out/result.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m1_id/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_1.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_2.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_3.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_4.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_5.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/artifacts_out/result.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m2_multiple/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_1.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_2.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_3.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_4.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_5.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/artifacts_out/result.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m3_noise/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_1.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_2.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_3.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_4.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_5.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/artifacts_out/result.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/memory/m4_phrases/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r1_book_price/artifacts_out/random_file.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r1_book_price/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r2_search_suite_1/1_tesla_revenue/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r2_search_suite_1/2_specific/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r2_search_suite_1/3_formatting/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r2_search_suite_1/artifacts_out/random_file.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r2_search_suite_1/suite.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r3/artifacts_out/random_file.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/retrieval/r3/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_in/instructions.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_out/goal.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/1_simple/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_in/instructions.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_out/goal.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/2_medium/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions_2.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_out/goal.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions_2.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_out/goal.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s1_loss_suite_1/4_hard/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/artifacts_out/1.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/artifacts_out/2.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/artifacts_out/3.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/artifacts_out/4.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/artifacts_out/5.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/custom_python/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s2_divergence/data_draft.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/artifacts_out/1.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/artifacts_out/2.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/artifacts_out/3.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/artifacts_out/4.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/artifacts_out/5.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/artifacts_out/6.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/custom_python/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/deprecated/safety/s3_instructions/data_draft.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/library/README.md (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/library/ethereum/check_price/artifacts_in/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/library/ethereum/check_price/artifacts_in/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/library/ethereum/check_price/artifacts_in/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/library/ethereum/check_price/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/library/ethereum/check_price/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/library/ethereum/check_price/artifacts_out/test.py (100%)
 rename benchmark/{agbenchmark/challenges/library/ethereum/check_price/data_draft.json => benchmark/challenges/library/ethereum/check_price/data.json} (100%)
 create mode 100644 benchmark/benchmark/challenges/library/ethereum/check_price/data_draft.json
 rename benchmark/{agbenchmark => benchmark}/challenges/optional_categories.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/1_three_sum/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/1_three_sum/artifacts_out/sample_code.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/1_three_sum/custom_python/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/1_three_sum/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/2_password_generator/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/2_password_generator/custom_python/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/2_password_generator/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/3_file_organizer/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/3_file_organizer/artifacts_out/organize_files.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/3_file_organizer/custom_python/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/3_file_organizer/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/4_url_shortener/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/4_url_shortener/artifacts_out/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/4_url_shortener/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/5_tic_tac_toe/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/5_tic_tac_toe/data_draft.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/abstract_class.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/conftest.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/product_requirements.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/test_positive.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_in/user_stories.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_out/__init__.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_out/abstract_class.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_out/battleship.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_out/conftest.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/artifacts_out/test_positive.py (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/code/6_battleship/data_draft.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/scraping/basic/artifacts_out/random_file.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/scraping/basic/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/scraping/r1_book_price/artifacts_out/random_file.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/scraping/r1_book_price/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/1_summary/artifacts_in/challenges.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/1_summary/artifacts_in/companies.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/1_summary/artifacts_out/output.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/1_summary/data_draft.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r2_search_suite_1/1_tesla_revenue/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r2_search_suite_1/2_specific/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r2_search_suite_1/3_formatting/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r2_search_suite_1/artifacts_out/random_file.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r2_search_suite_1/suite.json (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r3/artifacts_out/random_file.txt (100%)
 rename benchmark/{agbenchmark => benchmark}/challenges/verticals/synthesize/r3/data.json (100%)
 rename benchmark/{agbenchmark => benchmark}/conftest.py (78%)
 rename benchmark/{agbenchmark => benchmark}/generate_test.py (88%)
 rename benchmark/{agbenchmark => benchmark}/reports/ReportManager.py (74%)
 rename benchmark/{agbenchmark => benchmark}/reports/processing/gen_combined_chart.py (91%)
 rename benchmark/{agbenchmark => benchmark}/reports/processing/get_files.py (100%)
 rename benchmark/{agbenchmark => benchmark}/reports/processing/graphs.py (100%)
 rename benchmark/{agbenchmark => benchmark}/reports/processing/process_report.py (91%)
 rename benchmark/{agbenchmark => benchmark}/reports/processing/report_types.py (100%)
 rename benchmark/{agbenchmark => benchmark}/reports/reports.py (78%)
 rename benchmark/{agbenchmark => benchmark}/start_benchmark.py (95%)
 rename benchmark/{agbenchmark => benchmark}/utils/challenge.py (95%)
 rename benchmark/{agbenchmark => benchmark}/utils/data_types.py (73%)
 rename benchmark/{agbenchmark => benchmark}/utils/dependencies/__init__.py (99%)
 rename benchmark/{agbenchmark => benchmark}/utils/dependencies/constants.py (100%)
 rename benchmark/{agbenchmark => benchmark}/utils/dependencies/graphs.py (98%)
 rename benchmark/{agbenchmark => benchmark}/utils/dependencies/main.py (100%)
 rename benchmark/{agbenchmark => benchmark}/utils/dependencies/util.py (100%)
 rename benchmark/{agbenchmark => benchmark}/utils/get_data_from_helicone.py (92%)
 rename benchmark/{agbenchmark => benchmark}/utils/prompts.py (100%)
 rename benchmark/{agbenchmark => benchmark}/utils/utils.py (80%)
 mode change 100644 => 100755 benchmark/run.sh

diff --git a/benchmark/agbenchmark/README.md b/benchmark/benchmark/README.md
similarity index 100%
rename from benchmark/agbenchmark/README.md
rename to benchmark/benchmark/README.md
diff --git a/benchmark/benchmark/__init__.py b/benchmark/benchmark/__init__.py
new file mode 100644
index 00000000..e8b22704
--- /dev/null
+++ b/benchmark/benchmark/__init__.py
@@ -0,0 +1,5 @@
+# import pydevd_pycharm
+
+# pydevd_pycharm.settrace(
+#     "localhost", port=9739, stdoutToServer=True, stderrToServer=True
+# )
diff --git a/benchmark/benchmark/__main__.py b/benchmark/benchmark/__main__.py
new file mode 100644
index 00000000..f7f0a77f
--- /dev/null
+++ b/benchmark/benchmark/__main__.py
@@ -0,0 +1,274 @@
+import glob
+import json
+import os
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Optional
+import toml
+
+import click
+import pytest
+from helicone.lock import HeliconeLockManager
+
+from benchmark.utils.data_types import AgentBenchmarkConfig
+
+BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
+# Record the start time as a Helicone custom property, but only when HELICONE_API_KEY is set.
+if os.environ.get("HELICONE_API_KEY"):
+    HeliconeLockManager.write_custom_property(
+        "benchmark_start_time", BENCHMARK_START_TIME
+    )
+# Read at import time from challenges/optional_categories.json, located next to this module.
+with open(
+    Path(__file__).resolve().parent / "challenges" / "optional_categories.json"
+) as f:
+    OPTIONAL_CATEGORIES = json.load(f)["optional_categories"]
+
+
+def get_unique_categories() -> set[str]:
+    """Return the unique challenge categories declared under ./challenges.
+
+    Recursively finds every data.json below this file's challenges directory and merges each
+    file's "category" list; files that cannot be read or parsed are reported and skipped."""
+    categories: set[str] = set()
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    glob_path = os.path.join(this_dir, "./challenges/**/data.json")
+    for data_file in glob.glob(glob_path, recursive=True):
+        # open() sits inside the try so that a file which cannot be opened is
+        # reported and skipped instead of aborting the whole scan.
+        try:
+            with open(data_file, "r") as f:
+                data = json.load(f)
+            categories.update(data.get("category", []))
+        except json.JSONDecodeError:
+            print(f"Error: {data_file} is not a valid JSON file.")
+            continue
+        except IOError:
+            print(f"IOError: file could not be read: {data_file}")
+            continue
+
+    return categories
+
+
+def run_benchmark(
+    agent_benchmark_config_path: str,
+    maintain: bool = False,
+    improve: bool = False,
+    explore: bool = False,
+    mock: bool = False,
+    no_dep: bool = False,
+    nc: bool = False,
+    category: Optional[list[str]] = None,
+    skip_category: Optional[list[str]] = None,
+    test: Optional[str] = None,
+    suite: Optional[str] = None,
+    cutoff: Optional[int] = None,
+    server: bool = False,
+) -> int:
+    """Start the benchmark tests. If a category flag is provided, run the categories with that mark.
+
+    Returns 1 on invalid config JSON or conflicting options, otherwise the exit code from pytest.main()."""
+    try:
+        with open(agent_benchmark_config_path, "r") as f:
+            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
+            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+    except json.JSONDecodeError:
+        print("Error: benchmark_config.json is not a valid JSON file.")
+        return 1
+
+    if sum(map(bool, (maintain, improve, explore))) > 1:  # more than one mode flag was given
+        print(
+            "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
+        )
+        return 1
+
+    if test and (category or skip_category or maintain or improve or suite or explore):
+        print(
+            "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
+        )
+        return 1
+
+    # TODO: test and ensure that this functionality works before removing
+    # change elif suite below if removing
+    if suite and (category or skip_category or maintain or improve or explore):
+        print(
+            "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite."
+        )
+        return 1
+
+    assert not(agent_benchmark_config.api_mode and not agent_benchmark_config.host), \
+        "Error: host needs to be added to the config if api_mode is set to True."
+
+    print("Current configuration:")
+    for key, value in vars(agent_benchmark_config).items():
+        print(f"{key}: {value}")
+
+    pytest_args = ["-vs"]
+    pytest_args.extend(["--agent_config_path", agent_benchmark_config_path])
+    if test:
+        print("Running specific test:", test)
+        pytest_args.extend(["-k", test, "--test"])
+    elif suite:
+        print("Running specific suite:", suite)
+        pytest_args.extend(["--suite"])
+    else:
+        # Categories that are used in the challenges
+        categories = get_unique_categories()
+        if category:
+            invalid_categories = set(category) - categories
+            assert (
+                not invalid_categories
+            ), f"Invalid categories: {invalid_categories}. Valid categories are: {categories}"
+
+        if category:
+            categories_to_run = set(category)
+            if skip_category:
+                categories_to_run = categories_to_run.difference(set(skip_category))
+                assert categories_to_run, "Error: You can't skip all categories"
+            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
+            print("Running tests of category:", categories_to_run)
+        elif skip_category:
+            categories_to_run = categories - set(skip_category)
+            assert categories_to_run, "Error: You can't skip all categories"
+            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
+            print("Running tests of category:", categories_to_run)
+        else:
+            print("Running all categories")
+
+        if maintain:
+            print("Running only regression tests")
+            pytest_args.append("--maintain")
+        elif improve:
+            print("Running only non-regression tests")
+            pytest_args.append("--improve")
+        elif explore:
+            print("Only attempt challenges that have never been beaten")
+            pytest_args.append("--explore")
+
+    if mock:
+        pytest_args.append("--mock")
+
+    if no_dep:
+        pytest_args.append("--no_dep")
+
+    if nc and cutoff:
+        print(
+            "Error: You can't use both --nc and --cutoff at the same time. Please choose one."
+        )
+        return 1
+
+    if nc:
+        pytest_args.append("--nc")
+    if cutoff:
+        pytest_args.append("--cutoff")
+        print(f"Setting cuttoff override to {cutoff} seconds.")
+    current_dir = Path(__file__).resolve().parent
+    print(f"Current directory: {current_dir}")
+    pytest_args.extend((str(current_dir), "--cache-clear"))
+    return pytest.main(pytest_args)
+
+
+@click.group()
+def cli() -> None:
+    pass  # intentionally empty: the group only hosts the sub-commands registered with @cli.command()
+
+
+@cli.command()
+@click.option("--backend", is_flag=True, help="If it's being run from the cli")
+@click.option("-c", "--category", multiple=True, help="Specific category to run")
+@click.option(
+    "-s",
+    "--skip-category",
+    multiple=True,
+    help="Skips preventing the tests from this category from running",
+)
+@click.option("--test", help="Specific test to run")
+@click.option("--maintain", is_flag=True, help="Runs only regression tests")
+@click.option("--improve", is_flag=True, help="Run only non-regression tests")
+@click.option(
+    "--explore",
+    is_flag=True,
+    help="Only attempt challenges that have never been beaten",
+)
+@click.option("--mock", is_flag=True, help="Run with mock")
+@click.option("--suite", help="Run a suite of related tests")
+@click.option(
+    "--no_dep",
+    is_flag=True,
+    help="Run without dependencies (can be useful for a suite run)",
+)
+@click.option("--nc", is_flag=True, help="Run without cutoff")
+@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
+@click.option("--agent-config", type=click.Path(exists=True), help="Path to the agent benchmark_config.json file,", required=True)
+def start(
+    maintain: bool,
+    improve: bool,
+    explore: bool,
+    mock: bool,
+    no_dep: bool,
+    nc: bool,
+    agent_config: str,
+    category: Optional[list[str]] = None,
+    skip_category: Optional[list[str]] = None,
+    test: Optional[str] = None,
+    suite: Optional[str] = None,
+    cutoff: Optional[int] = None,
+    backend: Optional[bool] = False,
+) -> Any:
+    # With --backend, stdout is captured in backend/backend_stdout.txt and the process is not exited.
+    original_stdout = sys.stdout  # Save the original standard output
+
+    assert "benchmark_config.json" in agent_config, "benchmark_config.json must be provided"
+
+    if backend:
+        with open("backend/backend_stdout.txt", "w") as f:
+            sys.stdout = f
+            try:
+                exit_code = run_benchmark(
+                    agent_benchmark_config_path=agent_config,
+                    maintain=maintain,
+                    improve=improve,
+                    explore=explore,
+                    mock=mock,
+                    no_dep=no_dep,
+                    nc=nc,
+                    category=category,
+                    skip_category=skip_category,
+                    test=test,
+                    suite=suite,
+                    cutoff=cutoff,
+                )
+            finally:
+                # Restore stdout even when the run raises; otherwise it stays bound to the closed file.
+                sys.stdout = original_stdout
+
+    else:
+        exit_code = run_benchmark(
+            agent_benchmark_config_path=agent_config,
+            maintain=maintain,
+            improve=improve,
+            explore=explore,
+            mock=mock,
+            no_dep=no_dep,
+            nc=nc,
+            category=category,
+            skip_category=skip_category,
+            test=test,
+            suite=suite,
+            cutoff=cutoff,
+        )
+
+        sys.exit(exit_code)
+
+
+@cli.command()
+def version():
+    """Print the version of the benchmark tool."""
+    pyproject_path = Path(__file__).resolve().parent / ".." / "pyproject.toml"  # one level above this package
+    poetry_metadata = toml.load(pyproject_path)["tool"]["poetry"]
+    print(f"Benchmark Tool Version {poetry_metadata['version']}")
+
+
+if __name__ == "__main__":
+    cli()  # entry point when executed as a script: invoke the click group defined above
diff --git a/benchmark/agbenchmark/agent_api_interface.py b/benchmark/benchmark/agent_api_interface.py
similarity index 95%
rename from benchmark/agbenchmark/agent_api_interface.py
rename to benchmark/benchmark/agent_api_interface.py
index e9597e63..17dbd730 100644
--- a/benchmark/agbenchmark/agent_api_interface.py
+++ b/benchmark/benchmark/agent_api_interface.py
@@ -5,8 +5,8 @@ from typing import Any, Dict, Optional
 
 from agent_protocol_client import AgentApi, ApiClient, Configuration, TaskRequestBody
 
-from agbenchmark.agent_interface import get_list_of_file_paths
-from agbenchmark.utils.data_types import ChallengeData
+from benchmark.agent_interface import get_list_of_file_paths
+from benchmark.utils.data_types import ChallengeData
 
 
 async def run_api_agent(
diff --git a/benchmark/agbenchmark/agent_interface.py b/benchmark/benchmark/agent_interface.py
similarity index 95%
rename from benchmark/agbenchmark/agent_interface.py
rename to benchmark/benchmark/agent_interface.py
index e3ad7ab6..e7c6ac4d 100644
--- a/benchmark/agbenchmark/agent_interface.py
+++ b/benchmark/benchmark/agent_interface.py
@@ -12,7 +12,6 @@ from typing import Any, List
 import psutil
 from dotenv import load_dotenv
 
-import agbenchmark.start_benchmark
 
 load_dotenv()
 
@@ -77,7 +76,7 @@ def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
 def run_agent(task: str, timeout: int) -> None:
     """Calling to get a response"""
 
-    entry_path = "agbenchmark.benchmarks"
+    entry_path = "benchmark.benchmarks"
 
     print(f"Running '{entry_path}' with timeout {timeout}")
 
@@ -87,7 +86,7 @@ def run_agent(task: str, timeout: int) -> None:
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
         universal_newlines=True,
-        cwd=agbenchmark.start_benchmark.HOME_DIRECTORY,
+        cwd=benchmark.start_benchmark.HOME_DIRECTORY,
         bufsize=1,
     )
 
@@ -109,7 +108,7 @@ def get_list_of_file_paths(
 ) -> List[str]:
     # this file is at agbenchmark\agent_interface.py
     source_dir = os.path.join(
-        agbenchmark.start_benchmark.CURRENT_DIRECTORY,
+        benchmark.start_benchmark.CURRENT_DIRECTORY,
         "..",
         challenge_dir_path,
         artifact_folder_name,
diff --git a/benchmark/agbenchmark/app.py b/benchmark/benchmark/app.py
similarity index 100%
rename from benchmark/agbenchmark/app.py
rename to benchmark/benchmark/app.py
diff --git a/benchmark/agbenchmark/challenges/CHALLENGE.md b/benchmark/benchmark/challenges/CHALLENGE.md
similarity index 100%
rename from benchmark/agbenchmark/challenges/CHALLENGE.md
rename to benchmark/benchmark/challenges/CHALLENGE.md
diff --git a/benchmark/agbenchmark/challenges/README.md b/benchmark/benchmark/challenges/README.md
similarity index 100%
rename from benchmark/agbenchmark/challenges/README.md
rename to benchmark/benchmark/challenges/README.md
diff --git a/benchmark/agbenchmark/challenges/SUITES.md b/benchmark/benchmark/challenges/SUITES.md
similarity index 100%
rename from benchmark/agbenchmark/challenges/SUITES.md
rename to benchmark/benchmark/challenges/SUITES.md
diff --git a/benchmark/agbenchmark/challenges/__init__.py b/benchmark/benchmark/challenges/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/__init__.py
rename to benchmark/benchmark/challenges/__init__.py
diff --git a/benchmark/agbenchmark/challenges/abilities/read_file/artifacts_in/file_to_read.txt b/benchmark/benchmark/challenges/abilities/read_file/artifacts_in/file_to_read.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/abilities/read_file/artifacts_in/file_to_read.txt
rename to benchmark/benchmark/challenges/abilities/read_file/artifacts_in/file_to_read.txt
diff --git a/benchmark/agbenchmark/challenges/abilities/read_file/artifacts_out/file_to_check.txt b/benchmark/benchmark/challenges/abilities/read_file/artifacts_out/file_to_check.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/abilities/read_file/artifacts_out/file_to_check.txt
rename to benchmark/benchmark/challenges/abilities/read_file/artifacts_out/file_to_check.txt
diff --git a/benchmark/agbenchmark/challenges/abilities/read_file/artifacts_out/output.txt b/benchmark/benchmark/challenges/abilities/read_file/artifacts_out/output.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/abilities/read_file/artifacts_out/output.txt
rename to benchmark/benchmark/challenges/abilities/read_file/artifacts_out/output.txt
diff --git a/benchmark/agbenchmark/challenges/abilities/read_file/data.json b/benchmark/benchmark/challenges/abilities/read_file/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/abilities/read_file/data.json
rename to benchmark/benchmark/challenges/abilities/read_file/data.json
diff --git a/benchmark/agbenchmark/challenges/abilities/write_file/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/abilities/write_file/artifacts_out/random_file.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/abilities/write_file/artifacts_out/random_file.txt
rename to benchmark/benchmark/challenges/abilities/write_file/artifacts_out/random_file.txt
diff --git a/benchmark/agbenchmark/challenges/abilities/write_file/data.json b/benchmark/benchmark/challenges/abilities/write_file/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/abilities/write_file/data.json
rename to benchmark/benchmark/challenges/abilities/write_file/data.json
diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/artifacts_in/instructions.txt b/benchmark/benchmark/challenges/alignment/goal_loss/1_distraction/artifacts_in/instructions.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/artifacts_in/instructions.txt
rename to benchmark/benchmark/challenges/alignment/goal_loss/1_distraction/artifacts_in/instructions.txt
diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/artifacts_out/goal.txt b/benchmark/benchmark/challenges/alignment/goal_loss/1_distraction/artifacts_out/goal.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/artifacts_out/goal.txt
rename to benchmark/benchmark/challenges/alignment/goal_loss/1_distraction/artifacts_out/goal.txt
diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/data.json b/benchmark/benchmark/challenges/alignment/goal_loss/1_distraction/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/data.json
rename to benchmark/benchmark/challenges/alignment/goal_loss/1_distraction/data.json
diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions.txt b/benchmark/benchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions.txt
rename to benchmark/benchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions.txt
diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions_2.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions_2.txt
rename to benchmark/benchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions_2.txt
diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_out/goal.txt b/benchmark/benchmark/challenges/alignment/goal_loss/2_injection/artifacts_out/goal.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_out/goal.txt
rename to benchmark/benchmark/challenges/alignment/goal_loss/2_injection/artifacts_out/goal.txt
diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/data.json b/benchmark/benchmark/challenges/alignment/goal_loss/2_injection/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/data.json
rename to benchmark/benchmark/challenges/alignment/goal_loss/2_injection/data.json
diff --git a/benchmark/agbenchmark/challenges/alignment/goal_loss/suite.json b/benchmark/benchmark/challenges/alignment/goal_loss/suite.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/alignment/goal_loss/suite.json
rename to benchmark/benchmark/challenges/alignment/goal_loss/suite.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/__init__.py
rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/test.py
rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/test.py
rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json b/benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json
rename to benchmark/benchmark/challenges/deprecated/adapatability/a1_debug/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/adapatability/a2_tesla_revenue/artifacts_out/random_file.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/artifacts_out/random_file.txt
rename to benchmark/benchmark/challenges/deprecated/adapatability/a2_tesla_revenue/artifacts_out/random_file.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json b/benchmark/benchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json
rename to benchmark/benchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/adapatability/a3_book_price/artifacts_out/random_file.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/artifacts_out/random_file.txt
rename to benchmark/benchmark/challenges/deprecated/adapatability/a3_book_price/artifacts_out/random_file.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json b/benchmark/benchmark/challenges/deprecated/adapatability/a3_book_price/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json
rename to benchmark/benchmark/challenges/deprecated/adapatability/a3_book_price/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/data.json b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/data.json
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/data.json b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/data.json
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/data.json b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/data.json
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/custom_python/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/custom_python/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/custom_python/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/data.json b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/data.json
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/suite.json b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/suite.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/suite.json
rename to benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/suite.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_out/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/data.json b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/data.json
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_in/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/artifacts_out/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/data.json b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/data.json
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.2_vague/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_in/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/artifacts_out/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/data.json b/benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/data.json
rename to benchmark/benchmark/challenges/deprecated/code/c2_debug_suite/d2.3_import/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/custom_python/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/custom_python/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/custom_python/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/data.json b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/data.json
rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3.1_three_sum/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/custom_python/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/custom_python/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/custom_python/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/data.json b/benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/data.json
rename to benchmark/benchmark/challenges/deprecated/code/c3_writing_suite_2/d3_two_sum/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/password_generator.py b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/password_generator.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/password_generator.py
rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/artifacts_out/password_generator.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/custom_python/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/custom_python/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/custom_python/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/data.json b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/data.json
rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/1_password_generator/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/organize_files.py b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/organize_files.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/organize_files.py
rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/artifacts_out/organize_files.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/custom_python/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/custom_python/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/custom_python/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/data.json b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/data.json
rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/2_file_organizer/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/suite.json b/benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/suite.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c4_writing_cli_suite_3/suite.json
rename to benchmark/benchmark/challenges/deprecated/code/c4_writing_cli_suite_3/suite.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/artifacts_out/animal_list.html b/benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/artifacts_out/animal_list.html
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/artifacts_out/animal_list.html
rename to benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/artifacts_out/animal_list.html
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/custom_python/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/custom_python/test.py
rename to benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/custom_python/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/data.json b/benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/data.json
rename to benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/1_list_animals/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/suite.json b/benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/suite.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/code/c5_web_app_suite/suite.json
rename to benchmark/benchmark/challenges/deprecated/code/c5_web_app_suite/suite.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/artifacts_out/output.txt b/benchmark/benchmark/challenges/deprecated/content_gen/2_plan/artifacts_out/output.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/artifacts_out/output.txt
rename to benchmark/benchmark/challenges/deprecated/content_gen/2_plan/artifacts_out/output.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json b/benchmark/benchmark/challenges/deprecated/content_gen/2_plan/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json
rename to benchmark/benchmark/challenges/deprecated/content_gen/2_plan/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py b/benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py
rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/test.py b/benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_in/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/test.py
rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_in/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py b/benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/test.py b/benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_out/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/test.py
rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/artifacts_out/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json b/benchmark/benchmark/challenges/deprecated/d2.1_guided/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json
rename to benchmark/benchmark/challenges/deprecated/d2.1_guided/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt b/benchmark/benchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt
rename to benchmark/benchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt b/benchmark/benchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt
rename to benchmark/benchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt b/benchmark/benchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt
rename to benchmark/benchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/data.json b/benchmark/benchmark/challenges/deprecated/interface/read_file/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/interface/read_file/data.json
rename to benchmark/benchmark/challenges/deprecated/interface/read_file/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt
rename to benchmark/benchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/search/data.json b/benchmark/benchmark/challenges/deprecated/interface/search/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/interface/search/data.json
rename to benchmark/benchmark/challenges/deprecated/interface/search/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt
rename to benchmark/benchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/write_file/data.json b/benchmark/benchmark/challenges/deprecated/interface/write_file/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/interface/write_file/data.json
rename to benchmark/benchmark/challenges/deprecated/interface/write_file/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_1.txt b/benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_1.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_1.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_1.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_2.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_2.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_2.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_3.txt b/benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_3.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_3.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_3.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_4.txt b/benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_4.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_4.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_4.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_5.txt b/benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_5.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_5.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_in/instructions_5.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_out/result.txt b/benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_out/result.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/artifacts_out/result.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/artifacts_out/result.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/data.json b/benchmark/benchmark/challenges/deprecated/memory/m1_id/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m1_id/data.json
rename to benchmark/benchmark/challenges/deprecated/memory/m1_id/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_1.txt b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_1.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_1.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_1.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_2.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_2.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_2.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_3.txt b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_3.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_3.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_3.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_4.txt b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_4.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_4.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_4.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_5.txt b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_5.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_5.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_in/instructions_5.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_out/result.txt b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_out/result.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/artifacts_out/result.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/artifacts_out/result.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/data.json b/benchmark/benchmark/challenges/deprecated/memory/m2_multiple/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/data.json
rename to benchmark/benchmark/challenges/deprecated/memory/m2_multiple/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_1.txt b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_1.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_1.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_1.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_2.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_2.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_2.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_3.txt b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_3.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_3.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_3.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_4.txt b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_4.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_4.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_4.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_5.txt b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_5.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_5.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_in/instructions_5.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_out/result.txt b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_out/result.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/artifacts_out/result.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/artifacts_out/result.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/data.json b/benchmark/benchmark/challenges/deprecated/memory/m3_noise/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/data.json
rename to benchmark/benchmark/challenges/deprecated/memory/m3_noise/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_1.txt b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_1.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_1.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_1.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_2.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_2.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_2.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_3.txt b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_3.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_3.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_3.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_4.txt b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_4.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_4.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_4.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_5.txt b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_5.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_5.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_in/instructions_5.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_out/result.txt b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_out/result.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/artifacts_out/result.txt
rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/artifacts_out/result.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/data.json b/benchmark/benchmark/challenges/deprecated/memory/m4_phrases/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/data.json
rename to benchmark/benchmark/challenges/deprecated/memory/m4_phrases/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/retrieval/r1_book_price/artifacts_out/random_file.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/artifacts_out/random_file.txt
rename to benchmark/benchmark/challenges/deprecated/retrieval/r1_book_price/artifacts_out/random_file.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/data.json b/benchmark/benchmark/challenges/deprecated/retrieval/r1_book_price/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/data.json
rename to benchmark/benchmark/challenges/deprecated/retrieval/r1_book_price/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/1_tesla_revenue/data.json b/benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/1_tesla_revenue/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/1_tesla_revenue/data.json
rename to benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/1_tesla_revenue/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/2_specific/data.json b/benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/2_specific/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/2_specific/data.json
rename to benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/2_specific/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/3_formatting/data.json b/benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/3_formatting/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/3_formatting/data.json
rename to benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/3_formatting/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/artifacts_out/random_file.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/artifacts_out/random_file.txt
rename to benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/artifacts_out/random_file.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/suite.json b/benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/suite.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r2_search_suite_1/suite.json
rename to benchmark/benchmark/challenges/deprecated/retrieval/r2_search_suite_1/suite.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r3/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/deprecated/retrieval/r3/artifacts_out/random_file.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r3/artifacts_out/random_file.txt
rename to benchmark/benchmark/challenges/deprecated/retrieval/r3/artifacts_out/random_file.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r3/data.json b/benchmark/benchmark/challenges/deprecated/retrieval/r3/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/retrieval/r3/data.json
rename to benchmark/benchmark/challenges/deprecated/retrieval/r3/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_in/instructions.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_in/instructions.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_in/instructions.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_in/instructions.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_out/goal.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_out/goal.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_out/goal.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/artifacts_out/goal.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/data.json b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/data.json
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/1_simple/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_in/instructions.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_in/instructions.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_in/instructions.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_in/instructions.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_out/goal.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_out/goal.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_out/goal.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/artifacts_out/goal.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/data.json b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/data.json
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/2_medium/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions_2.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions_2.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_in/instructions_2.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_out/goal.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_out/goal.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_out/goal.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/artifacts_out/goal.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/data.json b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/data.json
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/3_advanced/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions_2.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions_2.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions_2.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_in/instructions_2.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_out/goal.txt b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_out/goal.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_out/goal.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/artifacts_out/goal.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/data.json b/benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/data.json
rename to benchmark/benchmark/challenges/deprecated/safety/s1_loss_suite_1/4_hard/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/1.txt b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/1.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/1.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/1.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/2.txt b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/2.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/2.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/2.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/3.txt b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/3.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/3.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/3.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/4.txt b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/4.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/4.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/4.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/5.txt b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/5.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/5.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/artifacts_out/5.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/custom_python/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/custom_python/test.py
rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/custom_python/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data.json b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data.json
rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data_draft.json b/benchmark/benchmark/challenges/deprecated/safety/s2_divergence/data_draft.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data_draft.json
rename to benchmark/benchmark/challenges/deprecated/safety/s2_divergence/data_draft.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/1.txt b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/1.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/1.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/1.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/2.txt b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/2.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/2.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/2.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/3.txt b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/3.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/3.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/3.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/4.txt b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/4.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/4.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/4.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/5.txt b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/5.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/5.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/5.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/6.txt b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/6.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/6.txt
rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/artifacts_out/6.txt
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/custom_python/test.py b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/custom_python/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/custom_python/test.py
rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/custom_python/test.py
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data.json b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data.json
rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/data.json
diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data_draft.json b/benchmark/benchmark/challenges/deprecated/safety/s3_instructions/data_draft.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data_draft.json
rename to benchmark/benchmark/challenges/deprecated/safety/s3_instructions/data_draft.json
diff --git a/benchmark/agbenchmark/challenges/library/README.md b/benchmark/benchmark/challenges/library/README.md
similarity index 100%
rename from benchmark/agbenchmark/challenges/library/README.md
rename to benchmark/benchmark/challenges/library/README.md
diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_in/__init__.py b/benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_in/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_in/__init__.py
rename to benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_in/__init__.py
diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_in/sample_code.py b/benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_in/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_in/sample_code.py
rename to benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_in/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_in/test.py b/benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_in/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_in/test.py
rename to benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_in/test.py
diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_out/__init__.py b/benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_out/test.py b/benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_out/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/artifacts_out/test.py
rename to benchmark/benchmark/challenges/library/ethereum/check_price/artifacts_out/test.py
diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/data_draft.json b/benchmark/benchmark/challenges/library/ethereum/check_price/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/data_draft.json
rename to benchmark/benchmark/challenges/library/ethereum/check_price/data.json
diff --git a/benchmark/benchmark/challenges/library/ethereum/check_price/data_draft.json b/benchmark/benchmark/challenges/library/ethereum/check_price/data_draft.json
new file mode 100644
index 00000000..fa4a4af9
--- /dev/null
+++ b/benchmark/benchmark/challenges/library/ethereum/check_price/data_draft.json
@@ -0,0 +1,21 @@
+{
+  "name": "TestGetEthereumGasPrice",
+  "category": ["ethereum"],
+  "task": "Get the price of Ethereum in usd and save its digits to a file named 'output.txt'",
+  "dependencies": ["TestWriteFile"],
+  "cutoff": 75,
+  "ground": {
+    "answer": "Should be a number within a 50 USD range of the actual price according to coingecko.",
+    "should_contain": ["Matches"],
+    "should_not_contain": ["Text or letters"],
+    "files": ["output.txt"],
+    "eval": {
+      "type": "python"
+    }
+  },
+  "info": {
+    "difficulty": "basic",
+    "description": "Getting the price of Ethereum in USD and saving its digits to a file",
+    "side_effects": []
+  }
+}
diff --git a/benchmark/agbenchmark/challenges/optional_categories.json b/benchmark/benchmark/challenges/optional_categories.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/optional_categories.json
rename to benchmark/benchmark/challenges/optional_categories.json
diff --git a/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/artifacts_out/__init__.py b/benchmark/benchmark/challenges/verticals/code/1_three_sum/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/1_three_sum/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/verticals/code/1_three_sum/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/artifacts_out/sample_code.py b/benchmark/benchmark/challenges/verticals/code/1_three_sum/artifacts_out/sample_code.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/1_three_sum/artifacts_out/sample_code.py
rename to benchmark/benchmark/challenges/verticals/code/1_three_sum/artifacts_out/sample_code.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/custom_python/test.py b/benchmark/benchmark/challenges/verticals/code/1_three_sum/custom_python/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/1_three_sum/custom_python/test.py
rename to benchmark/benchmark/challenges/verticals/code/1_three_sum/custom_python/test.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/data.json b/benchmark/benchmark/challenges/verticals/code/1_three_sum/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/1_three_sum/data.json
rename to benchmark/benchmark/challenges/verticals/code/1_three_sum/data.json
diff --git a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/artifacts_out/__init__.py b/benchmark/benchmark/challenges/verticals/code/2_password_generator/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/2_password_generator/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/verticals/code/2_password_generator/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py b/benchmark/benchmark/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py
rename to benchmark/benchmark/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/custom_python/test.py b/benchmark/benchmark/challenges/verticals/code/2_password_generator/custom_python/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/2_password_generator/custom_python/test.py
rename to benchmark/benchmark/challenges/verticals/code/2_password_generator/custom_python/test.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json b/benchmark/benchmark/challenges/verticals/code/2_password_generator/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json
rename to benchmark/benchmark/challenges/verticals/code/2_password_generator/data.json
diff --git a/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/artifacts_out/__init__.py b/benchmark/benchmark/challenges/verticals/code/3_file_organizer/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/verticals/code/3_file_organizer/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/artifacts_out/organize_files.py b/benchmark/benchmark/challenges/verticals/code/3_file_organizer/artifacts_out/organize_files.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/artifacts_out/organize_files.py
rename to benchmark/benchmark/challenges/verticals/code/3_file_organizer/artifacts_out/organize_files.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/custom_python/test.py b/benchmark/benchmark/challenges/verticals/code/3_file_organizer/custom_python/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/custom_python/test.py
rename to benchmark/benchmark/challenges/verticals/code/3_file_organizer/custom_python/test.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/data.json b/benchmark/benchmark/challenges/verticals/code/3_file_organizer/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/data.json
rename to benchmark/benchmark/challenges/verticals/code/3_file_organizer/data.json
diff --git a/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/artifacts_out/__init__.py b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py
rename to benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py
rename to benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/data.json b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/data.json
rename to benchmark/benchmark/challenges/verticals/code/4_url_shortener/data.json
diff --git a/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/__init__.py b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py
rename to benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py
rename to benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/data_draft.json b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/data_draft.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/data_draft.json
rename to benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/data_draft.json
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/__init__.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/__init__.py
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/__init__.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/abstract_class.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/abstract_class.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/abstract_class.py
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/abstract_class.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/product_requirements.txt b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/product_requirements.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/product_requirements.txt
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/product_requirements.txt
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/test_positive.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_positive.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/test_positive.py
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_positive.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/user_stories.txt b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/user_stories.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_in/user_stories.txt
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/user_stories.txt
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/__init__.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/__init__.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/__init__.py
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/__init__.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/abstract_class.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/abstract_class.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/abstract_class.py
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/abstract_class.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/test_positive.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_positive.py
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/artifacts_out/test_positive.py
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_positive.py
diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/data_draft.json b/benchmark/benchmark/challenges/verticals/code/6_battleship/data_draft.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/code/6_battleship/data_draft.json
rename to benchmark/benchmark/challenges/verticals/code/6_battleship/data_draft.json
diff --git a/benchmark/agbenchmark/challenges/verticals/scraping/basic/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/verticals/scraping/basic/artifacts_out/random_file.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/scraping/basic/artifacts_out/random_file.txt
rename to benchmark/benchmark/challenges/verticals/scraping/basic/artifacts_out/random_file.txt
diff --git a/benchmark/agbenchmark/challenges/verticals/scraping/basic/data.json b/benchmark/benchmark/challenges/verticals/scraping/basic/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/scraping/basic/data.json
rename to benchmark/benchmark/challenges/verticals/scraping/basic/data.json
diff --git a/benchmark/agbenchmark/challenges/verticals/scraping/r1_book_price/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/verticals/scraping/r1_book_price/artifacts_out/random_file.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/scraping/r1_book_price/artifacts_out/random_file.txt
rename to benchmark/benchmark/challenges/verticals/scraping/r1_book_price/artifacts_out/random_file.txt
diff --git a/benchmark/agbenchmark/challenges/verticals/scraping/r1_book_price/data.json b/benchmark/benchmark/challenges/verticals/scraping/r1_book_price/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/scraping/r1_book_price/data.json
rename to benchmark/benchmark/challenges/verticals/scraping/r1_book_price/data.json
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/artifacts_in/challenges.txt b/benchmark/benchmark/challenges/verticals/synthesize/1_summary/artifacts_in/challenges.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/artifacts_in/challenges.txt
rename to benchmark/benchmark/challenges/verticals/synthesize/1_summary/artifacts_in/challenges.txt
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/artifacts_in/companies.txt b/benchmark/benchmark/challenges/verticals/synthesize/1_summary/artifacts_in/companies.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/artifacts_in/companies.txt
rename to benchmark/benchmark/challenges/verticals/synthesize/1_summary/artifacts_in/companies.txt
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/artifacts_out/output.txt b/benchmark/benchmark/challenges/verticals/synthesize/1_summary/artifacts_out/output.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/artifacts_out/output.txt
rename to benchmark/benchmark/challenges/verticals/synthesize/1_summary/artifacts_out/output.txt
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/data_draft.json b/benchmark/benchmark/challenges/verticals/synthesize/1_summary/data_draft.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/synthesize/1_summary/data_draft.json
rename to benchmark/benchmark/challenges/verticals/synthesize/1_summary/data_draft.json
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/1_tesla_revenue/data.json b/benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/1_tesla_revenue/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/1_tesla_revenue/data.json
rename to benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/1_tesla_revenue/data.json
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/2_specific/data.json b/benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/2_specific/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/2_specific/data.json
rename to benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/2_specific/data.json
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/3_formatting/data.json b/benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/3_formatting/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/3_formatting/data.json
rename to benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/3_formatting/data.json
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/artifacts_out/random_file.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/artifacts_out/random_file.txt
rename to benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/artifacts_out/random_file.txt
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/suite.json b/benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/suite.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/synthesize/r2_search_suite_1/suite.json
rename to benchmark/benchmark/challenges/verticals/synthesize/r2_search_suite_1/suite.json
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r3/artifacts_out/random_file.txt b/benchmark/benchmark/challenges/verticals/synthesize/r3/artifacts_out/random_file.txt
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/synthesize/r3/artifacts_out/random_file.txt
rename to benchmark/benchmark/challenges/verticals/synthesize/r3/artifacts_out/random_file.txt
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/r3/data.json b/benchmark/benchmark/challenges/verticals/synthesize/r3/data.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/verticals/synthesize/r3/data.json
rename to benchmark/benchmark/challenges/verticals/synthesize/r3/data.json
diff --git a/benchmark/agbenchmark/conftest.py b/benchmark/benchmark/conftest.py
similarity index 78%
rename from benchmark/agbenchmark/conftest.py
rename to benchmark/benchmark/conftest.py
index d5aded19..f1e6ad8b 100644
--- a/benchmark/agbenchmark/conftest.py
+++ b/benchmark/benchmark/conftest.py
@@ -10,23 +10,37 @@ from typing import Any, Dict, Generator
 
 import pytest
 
-import agbenchmark.start_benchmark
-from agbenchmark.reports.reports import (
+from benchmark.reports.reports import (
     finalize_reports,
     generate_combined_suite_report,
     generate_single_call_report,
     session_finish,
 )
-from agbenchmark.utils.data_types import SuiteConfig
+from benchmark.utils.data_types import SuiteConfig, AgentBenchmarkConfig
 
 GLOBAL_TIMEOUT = (
     1500  # The tests will stop after 25 minutes so we can send the reports.
 )
 
-pytest_plugins = ["agbenchmark.utils.dependencies"]
+pytest_plugins = ["benchmark.utils.dependencies"]
 collect_ignore = ["challenges"]
 
 
+def load_config_from_request(request: Any) -> AgentBenchmarkConfig:
+    """Load the AgentBenchmarkConfig from the file given by --agent_config_path."""
+    config_path = request.config.getoption("--agent_config_path")
+    try:
+        with open(config_path, "r") as config_file:
+            loaded_config = AgentBenchmarkConfig(**json.load(config_file))
+    except json.JSONDecodeError:
+        # Announce the malformed file on stdout, then let the error propagate.
+        print("Error: benchmark_config.json is not a valid JSON file.")
+        raise
+    # Record on the config object which file it was loaded from.
+    loaded_config.agent_benchmark_config_path = config_path
+    return loaded_config
+
+
 def resolve_workspace(workspace: str) -> str:
     if workspace.startswith("${") and workspace.endswith("}"):
         # Extract the string inside ${...}
@@ -46,16 +60,21 @@ def resolve_workspace(workspace: str) -> str:
 
 
 @pytest.fixture(scope="module")
-def config(request: Any) -> None:
-    print(f"Config file: {agbenchmark.start_benchmark.CONFIG_PATH}")
-    with open(agbenchmark.start_benchmark.CONFIG_PATH, "r") as f:
-        config = json.load(f)
+def config(request: Any) -> Any:
+    agent_benchmark_config_path = request.config.getoption("--agent_config_path")
+    try:
+        with open(agent_benchmark_config_path, "r") as f:
+            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
+            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+    except json.JSONDecodeError:
+        print("Error: benchmark_config.json is not a valid JSON file.")
+        raise
 
     if isinstance(config["workspace"], str):
-        config["workspace"] = resolve_workspace(config["workspace"])
+        config["workspace"] = resolve_workspace(agent_benchmark_config.workspace)
     else:  # it's a input output dict
-        config["workspace"]["input"] = resolve_workspace(config["workspace"]["input"])
-        config["workspace"]["output"] = resolve_workspace(config["workspace"]["output"])
+        config["workspace"]["input"] = resolve_workspace(agent_benchmark_config.workspace / "input")
+        config["workspace"]["output"] = resolve_workspace(agent_benchmark_config.workspace / "output")
 
     return config
 
@@ -89,6 +108,7 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
 
 
 def pytest_addoption(parser: Any) -> None:
+    parser.addoption("--agent_config_path", action="store_true", default=False)
     parser.addoption("--mock", action="store_true", default=False)
     parser.addoption("--api_mode", action="store_true", default=False)
     parser.addoption("--host", action="store_true", default=None)
@@ -106,7 +126,9 @@ def pytest_addoption(parser: Any) -> None:
 @pytest.fixture(autouse=True)
 def check_regression(request: Any) -> None:
     test_name = request.node.parent.name
-    data = agbenchmark.start_benchmark.get_regression_data()
+    agent_benchmark_config = load_config_from_request(request)
+
+    data = json.loads(agent_benchmark_config.get_regression_reports_path())
 
     # Get the true location of the test
     challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
@@ -215,7 +237,15 @@ def scores(request: Any) -> None:
 
 # this is adding the dependency marker and category markers automatically from the json
 def pytest_collection_modifyitems(items: Any, config: Any) -> None:
-    data = agbenchmark.start_benchmark.get_regression_data()
+    try:
+        with open(config.getoption('--agent_config_path'), "r") as f:
+            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
+            agent_benchmark_config.agent_benchmark_config_path = config.getoption('--agent_config_path')
+    except json.JSONDecodeError:
+        print("Error: benchmark_config.json is not a valid JSON file.")
+        raise
+
+    data = json.loads(agent_benchmark_config.get_regression_reports_path())
 
     for item in items:
         # Assuming item.cls is your test class
@@ -252,17 +282,15 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None:
 
 @pytest.fixture(scope="session", autouse=True)
 def run_agent(request: Any) -> Any:
-    with open(agbenchmark.start_benchmark.CONFIG_PATH, "r") as f:
-        config = json.load(f)
-
+    agent_benchmark_config_path = request.config.getoption("--agent_config_path")
     if "--api_mode" not in sys.argv:
-        command = [sys.executable, "-m", "agbenchmark.benchmarks"]
+        command = [sys.executable, "-m", "benchmark.benchmarks"]
         process = subprocess.Popen(
             command,
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             universal_newlines=True,
-            cwd=agbenchmark.start_benchmark.HOME_DIRECTORY,
+            cwd=agent_benchmark_config_path.entry_path.parent.parent,
         )
         time.sleep(3)
         yield
diff --git a/benchmark/agbenchmark/generate_test.py b/benchmark/benchmark/generate_test.py
similarity index 88%
rename from benchmark/agbenchmark/generate_test.py
rename to benchmark/benchmark/generate_test.py
index b4d6b201..1180119b 100644
--- a/benchmark/agbenchmark/generate_test.py
+++ b/benchmark/benchmark/generate_test.py
@@ -10,10 +10,9 @@ from typing import Any, Callable, Dict, Optional
 
 import pytest
 
-import agbenchmark.start_benchmark
-from agbenchmark.utils.challenge import Challenge
-from agbenchmark.utils.data_types import ChallengeData, SuiteConfig
-from agbenchmark.utils.utils import get_test_path
+from benchmark.utils.challenge import Challenge
+from benchmark.utils.data_types import ChallengeData, SuiteConfig, AgentBenchmarkConfig
+from benchmark.utils.utils import get_test_path
 
 DATA_CATEGORY = {}
 
@@ -72,7 +71,7 @@ def create_single_test(
 
     # Define test class dynamically
     challenge_class = types.new_class(data["name"], (Challenge,))
-
+    print(challenge_location)
     clean_challenge_location = get_test_path(challenge_location)
     setattr(challenge_class, "CHALLENGE_LOCATION", clean_challenge_location)
 
@@ -132,6 +131,8 @@ def create_single_test(
         await self.setup_challenge(config, timeout)
 
         scores = self.get_scores(config)
+        request.node.answers = scores["answers"]  # store answers in request.node
+        del scores["answers"]  # remove answers from scores
         request.node.scores = scores  # store scores in request.node
         assert 1 in scores["values"]
 
@@ -221,14 +222,34 @@ def create_challenge(
 def generate_tests() -> None:  # sourcery skip: invert-any-all
     print("Generating tests...")
 
+    challenges_path = os.path.join(os.path.dirname(__file__), 'challenges')
+
     json_files = deque(
         glob.glob(
-            f"{agbenchmark.start_benchmark.CHALLENGES_PATH}/**/data.json",
+            f"{challenges_path}/**/data.json",
             recursive=True,
         )
     )
-    regression_tests = agbenchmark.start_benchmark.get_regression_data()
 
+    agent_config_path = None
+    if "--agent-config" in sys.argv:
+        agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1]
+    else:
+        print(sys.argv)
+    try:
+        with open(agent_benchmark_config_path, "r") as f:
+            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
+            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+    except json.JSONDecodeError:
+        print("Error: benchmark_config.json is not a valid JSON file.")
+        raise
+
+    regression_reports_path = agent_benchmark_config.get_regression_reports_path()
+    if regression_reports_path and os.path.exists(regression_reports_path):
+        with open(regression_reports_path, 'r') as f:
+            regression_tests = json.load(f)
+    else:
+        regression_tests = {}
     # for suites to know if the file has already been used to generate the tests
     # Dynamic class creation
 
@@ -287,7 +308,6 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
             # ):
             #     # a part of the suite but not the one specified
             #     continue
-
         json_files = create_challenge(data, json_file, suite_config, json_files)
 
         if suite_config and not (test_flag or maintain_flag or improve_flag):
diff --git a/benchmark/agbenchmark/reports/ReportManager.py b/benchmark/benchmark/reports/ReportManager.py
similarity index 74%
rename from benchmark/agbenchmark/reports/ReportManager.py
rename to benchmark/benchmark/reports/ReportManager.py
index 51feca1c..991dd7cf 100644
--- a/benchmark/agbenchmark/reports/ReportManager.py
+++ b/benchmark/benchmark/reports/ReportManager.py
@@ -6,11 +6,12 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict
 
-from agbenchmark.reports.processing.graphs import save_single_radar_chart
-from agbenchmark.reports.processing.process_report import get_agent_category
-from agbenchmark.reports.processing.report_types import Report
-from agbenchmark.utils.utils import get_highest_success_difficulty
-
+from benchmark.reports.processing.graphs import save_single_radar_chart
+from benchmark.reports.processing.process_report import get_agent_category
+from benchmark.reports.processing.report_types import Report
+from benchmark.utils.utils import get_highest_success_difficulty
+from benchmark.utils.data_types import AgentBenchmarkConfig
+from benchmark.__main__ import BENCHMARK_START_TIME
 
 class ReportManager:
     """Abstracts interaction with the regression tests file"""
@@ -21,6 +22,11 @@ class ReportManager:
         self.load()
 
     def load(self) -> None:
+        if not os.path.exists(self.filename):
+            os.makedirs(os.path.dirname(self.filename), exist_ok=True)
+            with open(self.filename, 'w') as f:
+                pass
+        
         try:
             with open(self.filename, "r") as f:
                 file_content = (
@@ -55,26 +61,25 @@ class ReportManager:
         self.tests = {}
         self.save()
 
-    def end_info_report(self, config: Dict[str, Any]) -> None:
-        import agbenchmark.start_benchmark
+    def end_info_report(self, config: AgentBenchmarkConfig) -> None:
 
         command = " ".join(sys.argv)
 
         self.tests = {
             "command": command.split(os.sep)[-1],
-            "benchmark_git_commit_sha": agbenchmark.start_benchmark.BENCHMARK_GIT_COMMIT_SHA,
-            "agent_git_commit_sha": agbenchmark.start_benchmark.AGENT_GIT_COMMIT_SHA,
+            "benchmark_git_commit_sha": '---',
+            "agent_git_commit_sha": '---',
             "completion_time": datetime.now(timezone.utc).strftime(
                 "%Y-%m-%dT%H:%M:%S+00:00"
             ),
-            "benchmark_start_time": agbenchmark.start_benchmark.BENCHMARK_START_TIME,
+            "benchmark_start_time": BENCHMARK_START_TIME,
             "metrics": {
                 "run_time": str(round(time.time() - self.start_time, 2)) + " seconds",
                 "highest_difficulty": get_highest_success_difficulty(self.tests),
                 "total_cost": self.get_total_costs(),
             },
             "tests": self.tests,
-            "config": config,
+            "config": {k: v for k, v in json.loads(config.json()).items() if v is not None},
         }
 
         converted_data = Report.parse_obj(self.tests)
@@ -83,7 +88,8 @@ class ReportManager:
 
         save_single_radar_chart(
             agent_categories,
-            Path(agbenchmark.start_benchmark.REPORTS_PATH) / "radar_chart.png",
+            
+            config.get_reports_path() / "radar_chart.png",
         )
 
         self.save()
diff --git a/benchmark/agbenchmark/reports/processing/gen_combined_chart.py b/benchmark/benchmark/reports/processing/gen_combined_chart.py
similarity index 91%
rename from benchmark/agbenchmark/reports/processing/gen_combined_chart.py
rename to benchmark/benchmark/reports/processing/gen_combined_chart.py
index f7140de6..47d4c05e 100644
--- a/benchmark/agbenchmark/reports/processing/gen_combined_chart.py
+++ b/benchmark/benchmark/reports/processing/gen_combined_chart.py
@@ -2,11 +2,11 @@ import json
 import os
 from pathlib import Path
 
-from agbenchmark.reports.processing.graphs import (
+from benchmark.reports.processing.graphs import (
     save_combined_bar_chart,
     save_combined_radar_chart,
 )
-from agbenchmark.reports.processing.process_report import (
+from benchmark.reports.processing.process_report import (
     all_agent_categories,
     get_reports_data,
 )
diff --git a/benchmark/agbenchmark/reports/processing/get_files.py b/benchmark/benchmark/reports/processing/get_files.py
similarity index 100%
rename from benchmark/agbenchmark/reports/processing/get_files.py
rename to benchmark/benchmark/reports/processing/get_files.py
diff --git a/benchmark/agbenchmark/reports/processing/graphs.py b/benchmark/benchmark/reports/processing/graphs.py
similarity index 100%
rename from benchmark/agbenchmark/reports/processing/graphs.py
rename to benchmark/benchmark/reports/processing/graphs.py
diff --git a/benchmark/agbenchmark/reports/processing/process_report.py b/benchmark/benchmark/reports/processing/process_report.py
similarity index 91%
rename from benchmark/agbenchmark/reports/processing/process_report.py
rename to benchmark/benchmark/reports/processing/process_report.py
index 25f9303a..a94f76fe 100644
--- a/benchmark/agbenchmark/reports/processing/process_report.py
+++ b/benchmark/benchmark/reports/processing/process_report.py
@@ -3,11 +3,11 @@ import os
 from pathlib import Path
 from typing import Any
 
-from agbenchmark.reports.processing.get_files import (
+from benchmark.reports.processing.get_files import (
     get_latest_report_from_agent_directories,
 )
-from agbenchmark.reports.processing.report_types import Report, SuiteTest, Test
-from agbenchmark.utils.data_types import STRING_DIFFICULTY_MAP
+from benchmark.reports.processing.report_types import Report, SuiteTest, Test
+from benchmark.utils.data_types import STRING_DIFFICULTY_MAP
 
 
 def get_reports_data(report_path: str) -> dict[str, Any]:
diff --git a/benchmark/agbenchmark/reports/processing/report_types.py b/benchmark/benchmark/reports/processing/report_types.py
similarity index 100%
rename from benchmark/agbenchmark/reports/processing/report_types.py
rename to benchmark/benchmark/reports/processing/report_types.py
diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/benchmark/reports/reports.py
similarity index 78%
rename from benchmark/agbenchmark/reports/reports.py
rename to benchmark/benchmark/reports/reports.py
index 1e5ba1e6..1cb81fd3 100644
--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/benchmark/reports/reports.py
@@ -4,15 +4,48 @@ import sys
 from pathlib import Path
 from typing import Any, Dict
 
-import agbenchmark.start_benchmark
-from agbenchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig
-from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone
-from agbenchmark.utils.utils import (
+from benchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig, AgentBenchmarkConfig
+from benchmark.utils.get_data_from_helicone import get_data_from_helicone
+from benchmark.utils.utils import (
     calculate_success_percentage,
     get_highest_success_difficulty,
     get_test_path,
     replace_backslash,
 )
+from benchmark.reports.ReportManager import ReportManager
+
+
+
+def get_agent_benchmark_config() -> AgentBenchmarkConfig:
+    """Load the AgentBenchmarkConfig from the path following --agent-config in sys.argv."""
+    if "--agent-config" not in sys.argv:  # fail clearly instead of UnboundLocalError below
+        raise ValueError(f"--agent-config is required; got argv: {sys.argv}")
+    agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1]
+    try:
+        with open(agent_benchmark_config_path, "r") as f:
+            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
+    except json.JSONDecodeError:
+        print("Error: benchmark_config.json is not a valid JSON file.")
+        raise
+    agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+    return agent_benchmark_config
+
+def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
+    """Build the (regression, info, internal-info) report managers from the agent config."""
+    benchmark_config = get_agent_benchmark_config()
+
+    # Tests that consistently pass are considered regression tests.
+    regression_manager = ReportManager(benchmark_config.get_regression_reports_path())
+
+    # User-facing reporting information, kept in report.json under the reports path.
+    info_manager = ReportManager(str(benchmark_config.get_reports_path() / "report.json"))
+
+    # Internal store at the success-rate path, used to track pass/fail results.
+    internal_info_manager = ReportManager(benchmark_config.get_success_rate_path())
+    return regression_manager, info_manager, internal_info_manager
+
+
+(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()
 
 
 def generate_combined_suite_report(
@@ -26,6 +59,7 @@ def generate_combined_suite_report(
 
     data_paths = suite_config.get_data_paths(root_path / Path(challenge_location))
     scores = getattr(item, "scores", {})
+
     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv
 
     tests = {}
@@ -65,7 +99,7 @@ def generate_combined_suite_report(
             # add dependency fail here
 
             if not mock:  # don't remove if it's a mock test
-                agbenchmark.start_benchmark.REGRESSION_MANAGER.remove_test(test_name)
+                REGRESSION_MANAGER.remove_test(test_name)
 
         prev_test_results: list[bool] = get_previous_test_results(
             test_name, test_info_details
@@ -76,7 +110,7 @@ def generate_combined_suite_report(
         )
 
         tests[test_name] = test_info_details
-
+        
     info_details: Any = {
         "data_path": challenge_location,
         "task": challenge_data["task"],
@@ -98,14 +132,14 @@ def get_previous_test_results(
     agent_tests: dict[str, list[bool]] = {}
     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv
 
-    prev_test_results = agbenchmark.start_benchmark.INTERNAL_INFO_MANAGER.tests.get(
+    prev_test_results = INTERNAL_INFO_MANAGER.tests.get(
         test_name, []
     )
 
     if not mock:
         # only add if it's an actual test
         prev_test_results.append(info_details["metrics"]["success"])
-        agbenchmark.start_benchmark.INTERNAL_INFO_MANAGER.add_test(
+        INTERNAL_INFO_MANAGER.add_test(
             test_name, prev_test_results
         )
 
@@ -126,7 +160,7 @@ def update_regression_tests(
     if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]:
         # if the last 3 tests were successful, add to the regression tests
         info_details["is_regression"] = True
-        agbenchmark.start_benchmark.REGRESSION_MANAGER.add_test(test_name, test_details)
+        REGRESSION_MANAGER.add_test(test_name, test_details)
 
 
 def generate_single_call_report(
@@ -144,6 +178,7 @@ def generate_single_call_report(
     challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
     test_name = item.nodeid.split("::")[1]
     item.test_name = test_name
+    answers = call.node.answers
 
     test_details = {
         "difficulty": difficulty,
@@ -162,7 +197,10 @@ def generate_single_call_report(
             "success": False,
             "attempted": True,
         },
+        "answers": answers,
     }
+    if 'metadata' in challenge_data:
+        info_details['metadata'] = challenge_data['metadata']
 
     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv
 
@@ -170,7 +208,7 @@ def generate_single_call_report(
         info_details["metrics"]["success"] = True
     else:
         if not mock:  # don't remove if it's a mock test
-            agbenchmark.start_benchmark.REGRESSION_MANAGER.remove_test(test_name)
+            REGRESSION_MANAGER.remove_test(test_name)
         info_details["metrics"]["fail_reason"] = str(call.excinfo.value)
         if call.excinfo.typename == "Skipped":
             info_details["metrics"]["attempted"] = False
@@ -221,7 +259,7 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:
                             nested_test_info, nested_test_name
                         )
 
-        agbenchmark.start_benchmark.INFO_MANAGER.add_test(test_name, info_details)
+        INFO_MANAGER.add_test(test_name, info_details)
 
 
 def update_challenges_already_beaten(
@@ -260,11 +298,11 @@ def generate_separate_suite_reports(suite_reports: dict) -> None:
         }
 
         for name in suite_file_datum:
-            test_data = agbenchmark.start_benchmark.INFO_MANAGER.tests[
+            test_data = INFO_MANAGER.tests[
                 name
             ]  # get the individual test reports
             data[name] = test_data  # this is for calculating highest difficulty
-            agbenchmark.start_benchmark.INFO_MANAGER.remove_test(name)
+            INFO_MANAGER.remove_test(name)
 
             successes.append(test_data["metrics"]["success"])
             run_time += float(test_data["metrics"]["run_time"].split(" ")[0])
@@ -282,7 +320,7 @@ def generate_separate_suite_reports(suite_reports: dict) -> None:
             Path(next(iter(data.values()))["data_path"]).resolve().parent.parent
         )
         info_details["data_path"] = get_test_path(suite_path)
-        agbenchmark.start_benchmark.INFO_MANAGER.add_test(prefix, info_details)
+        INFO_MANAGER.add_test(prefix, info_details)
 
 
 def session_finish(suite_reports: dict) -> None:
@@ -290,9 +328,9 @@ def session_finish(suite_reports: dict) -> None:
     if not flags:
         generate_separate_suite_reports(suite_reports)
 
-    with open(agbenchmark.start_benchmark.CONFIG_PATH, "r") as f:
-        config = json.load(f)
+    agent_benchmark_config = get_agent_benchmark_config()
 
-    agbenchmark.start_benchmark.INTERNAL_INFO_MANAGER.save()
-    agbenchmark.start_benchmark.INFO_MANAGER.end_info_report(config)
-    agbenchmark.start_benchmark.REGRESSION_MANAGER.save()
+
+    INTERNAL_INFO_MANAGER.save()
+    INFO_MANAGER.end_info_report(agent_benchmark_config)
+    REGRESSION_MANAGER.save()
diff --git a/benchmark/agbenchmark/start_benchmark.py b/benchmark/benchmark/start_benchmark.py
similarity index 95%
rename from benchmark/agbenchmark/start_benchmark.py
rename to benchmark/benchmark/start_benchmark.py
index a94ae234..77044b5c 100644
--- a/benchmark/agbenchmark/start_benchmark.py
+++ b/benchmark/benchmark/start_benchmark.py
@@ -9,12 +9,14 @@ from typing import Any, Optional
 import click
 import pytest
 from helicone.lock import HeliconeLockManager
+import sys
+sys.path.append(str(Path(__file__).resolve().parent.parent))  # dir holding this package
 
 from agbenchmark.reports.ReportManager import ReportManager
 from agbenchmark.utils.utils import (
     AGENT_NAME,
     calculate_dynamic_paths,
-    get_git_commit_sha,
+    # get_git_commit_sha,
 )
 
 CURRENT_DIRECTORY = Path(__file__).resolve().parent
@@ -32,8 +34,8 @@ if os.environ.get("HELICONE_API_KEY"):
     SUCCESS_RATE_PATH,
     CHALLENGES_PATH,
 ) = calculate_dynamic_paths()
-BENCHMARK_GIT_COMMIT_SHA = get_git_commit_sha(HOME_DIRECTORY / ".." / "..")
-AGENT_GIT_COMMIT_SHA = get_git_commit_sha(HOME_DIRECTORY)
+BENCHMARK_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY / ".." / "..")
+AGENT_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY)
 # open a file in the challenges/optional_categories
 with open(
     Path(__file__).resolve().parent / "challenges" / "optional_categories.json"
@@ -332,6 +334,14 @@ def get_regression_data() -> Any:
 
     return data
 
+@cli.command()
+def version():
+    """Print the version of the benchmark tool."""
+    import toml
+    pyproject = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")
+    print(f"Benchmark Tool Version {pyproject['tool']['poetry']['version']}")
+
+
 
 # def run_from_backend(
 #     maintain: bool = False,
@@ -420,5 +430,10 @@ def get_regression_data() -> Any:
 #     return latest_report
 
 
+# Previous entry point, left disabled:
 # if __name__ == "__main__":
 # start()
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/benchmark/agbenchmark/utils/challenge.py b/benchmark/benchmark/utils/challenge.py
similarity index 95%
rename from benchmark/agbenchmark/utils/challenge.py
rename to benchmark/benchmark/utils/challenge.py
index 72849f51..e1d0c4fe 100644
--- a/benchmark/agbenchmark/utils/challenge.py
+++ b/benchmark/benchmark/utils/challenge.py
@@ -10,16 +10,15 @@ from typing import Any, Dict, List
 import openai
 import pytest
 
-import agbenchmark.start_benchmark
-from agbenchmark.agent_api_interface import run_api_agent
-from agbenchmark.utils.data_types import ChallengeData, Ground
-from agbenchmark.utils.prompts import (
+from benchmark.agent_api_interface import run_api_agent
+from benchmark.utils.data_types import ChallengeData, Ground
+from benchmark.utils.prompts import (
     END_PROMPT,
     FEW_SHOT_EXAMPLES,
     PROMPT_MAP,
     SCORING_MAP,
 )
-from agbenchmark.utils.utils import agent_eligibible_for_optional_categories
+from benchmark.utils.utils import agent_eligibible_for_optional_categories
 
 
 class Challenge(ABC):
@@ -48,7 +47,7 @@ class Challenge(ABC):
         return self.data.dependencies
 
     async def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
-        from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent
+        from benchmark.agent_interface import copy_artifacts_into_workspace, run_agent
 
         artifact_paths = [
             self.ARTIFACTS_LOCATION,
@@ -210,15 +209,16 @@ class Challenge(ABC):
         scores = []
         scores_dict: Any = {}
         percentage = None
-
+        answers = {}
         try:
             if self.data.task == "" and "--mock" in sys.argv:
                 scores = [1.0]
+                answers = {"mock": "This is a mock answer"}
             elif isinstance(self.data.ground, Ground):
                 files_contents = self.get_artifacts_out(
                     config["workspace"], self.data.ground
                 )
-
+                answers = {"answer": files_contents}
                 for file_content in files_contents:
                     score = self.scoring(config, file_content, self.data.ground)
                     print("\033[1;32mYour score is:\033[0m", score)
@@ -240,6 +240,7 @@ class Challenge(ABC):
                 for ground_key in self.data.ground:
                     ground = self.data.ground[ground_key]
                     files_contents = self.get_artifacts_out(config["workspace"], ground)
+                    answers[ground_key] = files_contents
 
                     for file_content in files_contents:
                         score = self.scoring(config, file_content, ground)
@@ -289,6 +290,7 @@ class Challenge(ABC):
             "values": scores,
             "scores_obj": scores_dict,
             "percentage": percentage,
+            "answers": answers,
         }
 
         self.scores[self.__class__.__name__] = scores_data
@@ -306,7 +308,7 @@ class Challenge(ABC):
         challenge_category = self.data.category
         categories = [
             category
-            for category in agbenchmark.start_benchmark.OPTIONAL_CATEGORIES
+            for category in benchmark.start_benchmark.OPTIONAL_CATEGORIES
             if category in challenge_category
         ]
         if not agent_eligibible_for_optional_categories(
diff --git a/benchmark/agbenchmark/utils/data_types.py b/benchmark/benchmark/utils/data_types.py
similarity index 73%
rename from benchmark/agbenchmark/utils/data_types.py
rename to benchmark/benchmark/utils/data_types.py
index d40682a1..e5d9e987 100644
--- a/benchmark/agbenchmark/utils/data_types.py
+++ b/benchmark/benchmark/utils/data_types.py
@@ -3,10 +3,9 @@ import json
 from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Optional
-
+import sys
 from pydantic import BaseModel, root_validator, validator
-
-
+from datetime import datetime, timezone
 class DifficultyLevel(Enum):
     interface = "interface"
     basic = "basic"
@@ -30,6 +29,77 @@ DIFFICULTY_MAP = {
 
 STRING_DIFFICULTY_MAP = {e.value: DIFFICULTY_MAP[e] for e in DifficultyLevel}
 
+def calculate_info_test_path(base_path: Path) -> Path:
+    """
+    Calculates the path to the directory where the test report will be saved.
+    """
+    # Ensure the reports path exists
+    base_path.mkdir(parents=True, exist_ok=True)
+
+    # Get current UTC date-time stamp
+    date_stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
+
+    # Default run name
+    run_name = "full_run"
+
+    # Map command-line arguments to their respective labels
+    arg_labels = {
+        "--test": None,
+        "--suite": None,
+        "--category": None,
+        "--maintain": "maintain",
+        "--improve": "improve",
+        "--explore": "explore",
+    }
+
+    # Identify the relevant command-line argument
+    for arg, label in arg_labels.items():
+        if arg in sys.argv:
+            test_arg = sys.argv[sys.argv.index(arg) + 1] if label is None else None
+            run_name = arg.strip("--")
+            if test_arg:
+                run_name = f"{run_name}_{test_arg}"
+            break
+
+    # Create the full new directory path with ISO standard UTC date-time stamp
+    report_path = base_path / f"{date_stamp}_{run_name}"
+
+    # Ensure the new directory is created
+    report_path.mkdir(exist_ok=True)
+
+    return report_path
+
+class AgentBenchmarkConfig(BaseModel):
+    """
+    This class represents the configuration for the Agent Benchmark.
+    It includes the following attributes:
+    - entry_path: The path to the file that, when run, starts the agent configured for benchmarking, given relative to the location of the config_file.
+    - workspace: The path to the workspace where the benchmark will be run.
+    - reports_folder: The path to the folder where the benchmark reports will be stored.
+    - api_mode: A boolean indicating whether the benchmark is run in API mode.
+    - host: The host where the benchmark is run.
+    """
+    agent_benchmark_config_path: Path | None = None
+    entry_path: Path
+    workspace: Path
+    reports_folder: Path | None = None
+    api_mode: bool = False
+    host: str | None
+
+    def get_reports_location(self) -> Path:
+        if not self.reports_folder:
+            self.reports_folder = (self.agent_benchmark_config_path / self.entry_path.parent / ".." / "reports").resolve()
+        return self.reports_folder
+    
+    def get_reports_path(self) -> Path:
+        return calculate_info_test_path(self.get_reports_location())
+    
+    def get_regression_reports_path(self) -> Path:
+
+        return self.get_reports_location() / "regression_tests.json"
+    
+    def get_success_rate_path(self) -> Path:
+        return self.get_reports_location() / "success_rate.json"
 
 class Info(BaseModel):
     difficulty: DifficultyLevel
@@ -100,6 +170,7 @@ class ChallengeData(BaseModel):
     cutoff: int
     ground: Ground | Dict[str, Ground]
     info: Info | Dict[str, Info]
+    metadata: Optional[Dict[str, Any]] = None
 
     def serialize(self, path: str) -> None:
         with open(path, "w") as file:
diff --git a/benchmark/agbenchmark/utils/dependencies/__init__.py b/benchmark/benchmark/utils/dependencies/__init__.py
similarity index 99%
rename from benchmark/agbenchmark/utils/dependencies/__init__.py
rename to benchmark/benchmark/utils/dependencies/__init__.py
index bf2dba18..596c4760 100644
--- a/benchmark/agbenchmark/utils/dependencies/__init__.py
+++ b/benchmark/benchmark/utils/dependencies/__init__.py
@@ -67,7 +67,6 @@ def pytest_addoption(parser: Parser) -> None:
         for action in group.options:
             current_options += action._short_opts + action._long_opts
 
-    print(current_options)
 
     group = parser.getgroup("depends")
 
diff --git a/benchmark/agbenchmark/utils/dependencies/constants.py b/benchmark/benchmark/utils/dependencies/constants.py
similarity index 100%
rename from benchmark/agbenchmark/utils/dependencies/constants.py
rename to benchmark/benchmark/utils/dependencies/constants.py
diff --git a/benchmark/agbenchmark/utils/dependencies/graphs.py b/benchmark/benchmark/utils/dependencies/graphs.py
similarity index 98%
rename from benchmark/agbenchmark/utils/dependencies/graphs.py
rename to benchmark/benchmark/utils/dependencies/graphs.py
index cf54f32b..3cb85af2 100644
--- a/benchmark/agbenchmark/utils/dependencies/graphs.py
+++ b/benchmark/benchmark/utils/dependencies/graphs.py
@@ -9,8 +9,8 @@ import networkx as nx
 import numpy as np
 from pyvis.network import Network
 
-from agbenchmark.generate_test import DATA_CATEGORY
-from agbenchmark.utils.utils import find_absolute_benchmark_path
+from benchmark.generate_test import DATA_CATEGORY
+from benchmark.utils.utils import find_absolute_benchmark_path
 
 
 def bezier_curve(
diff --git a/benchmark/agbenchmark/utils/dependencies/main.py b/benchmark/benchmark/utils/dependencies/main.py
similarity index 100%
rename from benchmark/agbenchmark/utils/dependencies/main.py
rename to benchmark/benchmark/utils/dependencies/main.py
diff --git a/benchmark/agbenchmark/utils/dependencies/util.py b/benchmark/benchmark/utils/dependencies/util.py
similarity index 100%
rename from benchmark/agbenchmark/utils/dependencies/util.py
rename to benchmark/benchmark/utils/dependencies/util.py
diff --git a/benchmark/agbenchmark/utils/get_data_from_helicone.py b/benchmark/benchmark/utils/get_data_from_helicone.py
similarity index 92%
rename from benchmark/agbenchmark/utils/get_data_from_helicone.py
rename to benchmark/benchmark/utils/get_data_from_helicone.py
index 4e18b48e..0d04ed1d 100644
--- a/benchmark/agbenchmark/utils/get_data_from_helicone.py
+++ b/benchmark/benchmark/utils/get_data_from_helicone.py
@@ -4,8 +4,7 @@ from typing import Optional
 
 import requests
 
-import agbenchmark.start_benchmark
-from agbenchmark.agent_interface import HELICONE_GRAPHQL_LOGS
+from benchmark.agent_interface import HELICONE_GRAPHQL_LOGS
 
 
 def get_data_from_helicone(challenge: str) -> Optional[float]:
@@ -31,7 +30,7 @@ query ExampleQuery($properties: [PropertyFilter!]){
                 "name": "agent",
             },
             {
-                "value": {"equals": agbenchmark.start_benchmark.BENCHMARK_START_TIME},
+                "value": {"equals": benchmark.start_benchmark.BENCHMARK_START_TIME},
                 "name": "benchmark_start_time",
             },
             {"value": {"equals": challenge}, "name": "challenge"},
diff --git a/benchmark/agbenchmark/utils/prompts.py b/benchmark/benchmark/utils/prompts.py
similarity index 100%
rename from benchmark/agbenchmark/utils/prompts.py
rename to benchmark/benchmark/utils/prompts.py
diff --git a/benchmark/agbenchmark/utils/utils.py b/benchmark/benchmark/utils/utils.py
similarity index 80%
rename from benchmark/agbenchmark/utils/utils.py
rename to benchmark/benchmark/utils/utils.py
index 56996e24..ebfdb030 100644
--- a/benchmark/agbenchmark/utils/utils.py
+++ b/benchmark/benchmark/utils/utils.py
@@ -10,52 +10,13 @@ import git
 from dotenv import load_dotenv
 
 load_dotenv()
-
-from agbenchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel
+from benchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel
 
 AGENT_NAME = os.getenv("AGENT_NAME")
 REPORT_LOCATION = os.getenv("REPORT_LOCATION", None)
 
 
-def calculate_info_test_path(base_path: Path) -> str:
-    """
-    Calculates the path to the directory where the test report will be saved.
-    """
-    # Ensure the reports path exists
-    base_path.mkdir(parents=True, exist_ok=True)
 
-    # Get current UTC date-time stamp
-    date_stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
-
-    # Default run name
-    run_name = "full_run"
-
-    # Map command-line arguments to their respective labels
-    arg_labels = {
-        "--test": None,
-        "--suite": None,
-        "--category": None,
-        "--maintain": "maintain",
-        "--improve": "improve",
-        "--explore": "explore",
-    }
-
-    # Identify the relevant command-line argument
-    for arg, label in arg_labels.items():
-        if arg in sys.argv:
-            test_arg = sys.argv[sys.argv.index(arg) + 1] if label is None else None
-            run_name = arg.strip("--")
-            if test_arg:
-                run_name = f"{run_name}_{test_arg}"
-            break
-
-    # Create the full new directory path with ISO standard UTC date-time stamp
-    report_path = base_path / f"{date_stamp}_{run_name}"
-
-    # Ensure the new directory is created
-    report_path.mkdir(exist_ok=True)
-
-    return str(report_path)
 
 
 def replace_backslash(value: Any) -> Any:
@@ -88,7 +49,7 @@ def get_test_path(json_file: str | Path) -> str:
 
     # Find the index of "agbenchmark" in the path parts
     try:
-        agbenchmark_index = json_file.parts.index("agbenchmark")
+        agbenchmark_index = json_file.parts.index("benchmark")
     except ValueError:
         raise ValueError("Invalid challenge location.")
 
@@ -249,19 +210,19 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str, str, str]:
     )
 
 
-def get_git_commit_sha(directory: Path) -> Optional[str]:
-    try:
-        repo = git.Repo(directory)
-        remote_url = repo.remotes.origin.url
-        if remote_url.endswith(".git"):
-            remote_url = remote_url[:-4]
-        git_commit_sha = f"{remote_url}/tree/{repo.head.commit.hexsha}"
+# def get_git_commit_sha(directory: Path) -> Optional[str]:
+#     try:
+#         repo = git.Repo(directory)
+#         remote_url = repo.remotes.origin.url
+#         if remote_url.endswith(".git"):
+#             remote_url = remote_url[:-4]
+#         git_commit_sha = f"{remote_url}/tree/{repo.head.commit.hexsha}"
 
-        # print(f"GIT_COMMIT_SHA: {git_commit_sha}")
-        return git_commit_sha
-    except Exception:
-        # print(f"{directory} is not a git repository!")
-        return None
+#         # print(f"GIT_COMMIT_SHA: {git_commit_sha}")
+#         return git_commit_sha
+#     except Exception:
+#         # print(f"{directory} is not a git repository!")
+#         return None
 
 
 def agent_eligibible_for_optional_categories(
diff --git a/benchmark/pyproject.toml b/benchmark/pyproject.toml
index 88740f4b..7fd2efcf 100644
--- a/benchmark/pyproject.toml
+++ b/benchmark/pyproject.toml
@@ -70,8 +70,6 @@ filterwarnings = [
     "ignore::matplotlib.MatplotlibDeprecationWarning"
 ]
 
-[tool.poetry.scripts]
-agbenchmark = "agbenchmark.start_benchmark:cli"
 
 [tool.black]
 line-length = 88
diff --git a/benchmark/run.sh b/benchmark/run.sh
old mode 100644
new mode 100755