From 0e804e27dd66676c6ee2385bfbb9fe2d222d9753 Mon Sep 17 00:00:00 2001
From: merwanehamadi
Date: Thu, 28 Sep 2023 19:30:08 -0700
Subject: [PATCH] Add more data challenges (#5390)

Signed-off-by: Merwane Hamadi
---
 autogpts/forge/poetry.lock | 138 ++--
 .../challenges/abilities/read_file/data.json | 2 +-
 .../challenges/abilities/write_file/data.json | 2 +-
 .../alignment/1_distraction/data_draft.json | 1 -
 .../alignment/2_injection/data_draft.json | 1 -
 .../deprecated/1_tesla_revenue/data.json | 2 +-
 .../deprecated/2_specific/data.json | 2 +-
 .../deprecated/3_formatting/data.json | 2 +-
 .../adapatability/a1_debug/data.json | 2 +-
 .../adapatability/a2_tesla_revenue/data.json | 2 +-
 .../adapatability/a3_book_price/data.json | 2 +-
 .../deprecated/code/1_list_animals/data.json | 2 +-
 .../code/1_password_generator/data.json | 2 +-
 .../deprecated/code/1_return/data.json | 2 +-
 .../code/2_file_organizer/data.json | 2 +-
 .../deprecated/code/2_write/data.json | 2 +-
 .../deprecated/code/3_modify/data.json | 2 +-
 .../deprecated/code/4_tests/data.json | 2 +-
 .../deprecated/code/d2.1_guided/data.json | 2 +-
 .../deprecated/code/d2.2_vague/data.json | 2 +-
 .../deprecated/code/d2.3_import/data.json | 2 +-
 .../deprecated/code/d3.1_three_sum/data.json | 2 +-
 .../deprecated/code/d3_two_sum/data.json | 2 +-
 .../deprecated/content_gen/2_plan/data.json | 2 +-
 .../deprecated/d2.1_guided/data.json | 2 +-
 .../read_file/artifacts_in/file_to_read.txt | 1 -
 .../read_file/artifacts_out/file_to_check.txt | 1 -
 .../read_file/artifacts_out/output.txt | 1 -
 .../deprecated/interface/read_file/data.json | 31 -
 .../search/artifacts_out/random_file.txt | 2 -
 .../deprecated/interface/search/data.json | 36 -
 .../write_file/artifacts_out/random_file.txt | 1 -
 .../deprecated/interface/write_file/data.json | 30 -
 .../deprecated/memory/m1_id/data.json | 2 +-
 .../deprecated/memory/m2_multiple/data.json | 2 +-
 .../deprecated/memory/m3_noise/data.json | 2 +-
 .../deprecated/memory/m4_phrases/data.json | 2 +-
 .../retrieval/1_tesla_revenue/data.json | 2 +-
 .../deprecated/retrieval/2_specific/data.json | 2 +-
 .../retrieval/3_formatting/data.json | 2 +-
 .../retrieval/r1_book_price/data.json | 2 +-
 .../deprecated/retrieval/r3/data.json | 2 +-
 .../deprecated/safety/1_simple/data.json | 2 +-
 .../deprecated/safety/2_medium/data.json | 2 +-
 .../deprecated/safety/3_advanced/data.json | 2 +-
 .../deprecated/safety/4_hard/data.json | 2 +-
 .../deprecated/safety/s2_divergence/data.json | 2 +-
 .../safety/s3_instructions/data.json | 2 +-
 .../library/ethereum/check_price/data.json | 2 +-
 .../verticals/code/1_three_sum/data.json | 5 +-
 .../code/2_password_generator/data.json | 2 +-
 .../verticals/code/3_file_organizer/data.json | 5 +-
 .../verticals/code/4_url_shortener/data.json | 2 +-
 .../verticals/code/5_tic_tac_toe/data.json | 5 +-
 .../verticals/code/6_battleship/data.json | 5 +-
 .../verticals/data/1_sort_csv/data.json | 5 +-
 .../verticals/data/2_label_csv/data.json | 2 +-
 .../verticals/data/3_combine_csv/data.json | 5 +-
 .../artifacts_in/file1.csv | 12 +
 .../artifacts_out/output.txt | 1 +
 .../4_answer_question_small_csv/data.json | 32 +
 .../artifacts_in/file1.csv | 305 ++++++++
 .../artifacts_out/output.txt | 1 +
 .../data/5_answer_question_csv/data.json | 31 +
 .../artifacts_in/file1.csv | 305 ++++++++
 .../artifacts_in/file2.csv | 305 ++++++++
 .../artifacts_out/output.txt | 1 +
 .../6_answer_question_combine_csv/data.json | 33 +
 .../1_sort_csv/artifacts_in/input.csv | 5 -
 .../1_sort_csv/artifacts_out/output.csv | 5 -
 .../verticals/generalist/1_sort_csv/data.json | 31 -
 .../verticals/scrape/1_search/data.json | 2 +-
 .../verticals/scrape/2_book_price/data.json | 2 +-
 .../artifacts_out/random_file.txt | 0
 .../data.json | 5 +-
 .../scrape/4_revenue_retrieval_2/data.json | 2 +-
 .../scrape/5_get_information/data.json | 5 +-
 .../synthesize/1_basic_content_gen/data.json | 5 +-
 benchmark/frontend/public/graph.json | 237 ++++--
 benchmark/poetry.lock | 145 ++--
 benchmark/tests/test_benchmark_workflow.py | 4 +-
 frontend/assets/coding_tree_structure.json | 28 +-
 frontend/assets/data_tree_structure.json | 178 ++++-
 frontend/assets/general_tree_structure.json | 722 +++++++++++++++++-
 .../scrape_synthesize_tree_structure.json | 25 +-
 frontend/assets/tree_structure.json | 237 ++++--
 86 files changed, 2523 insertions(+), 496 deletions(-)
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/read_file/data.json
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/search/data.json
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/write_file/data.json
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_in/file1.csv
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_out/output.txt
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/data.json
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_in/file1.csv
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_out/output.txt
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/data.json
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file1.csv
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file2.csv
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_out/output.txt
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/data.json
 delete mode 100644 benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_in/input.csv
 delete mode 100644 benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_out/output.csv
 delete mode 100644 benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/data.json
 rename benchmark/agbenchmark/challenges/verticals/scrape/{3_revenue_retrieval_2 => 3_revenue_retrieval}/artifacts_out/random_file.txt (100%)
 rename benchmark/agbenchmark/challenges/verticals/scrape/{3_revenue_retrieval_2 => 3_revenue_retrieval}/data.json (88%)

diff --git a/autogpts/forge/poetry.lock b/autogpts/forge/poetry.lock
index 696b4843..3a66cbe0 100644
--- a/autogpts/forge/poetry.lock
+++ b/autogpts/forge/poetry.lock
@@ -368,75 +368,63 @@
files = [ [[package]] name = "cffi" -version = "1.15.1" +version = "1.16.0" description = "Foreign Function Interface for Python calling C code." optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, - {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, - {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, - {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, - {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, 
- {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, - {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, - {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, - {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, - {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, - {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, - {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, - {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, - {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, - {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, - {file = 
"cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, - {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, - {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, - {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, - {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, - {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, + {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"}, + {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"}, + {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"}, + {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"}, + {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"}, + {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"}, + {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"}, + {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"}, + {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"}, + {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"}, + {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"}, + {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"}, + {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"}, + {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"}, + {file = 
"cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"}, + {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"}, + {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"}, + {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"}, + {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"}, + {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"}, + {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"}, + {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"}, + {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"}, + {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"}, + {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"}, + {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"}, + {file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"}, + {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"}, ] [package.dependencies] @@ -794,15 +782,19 @@ test-no-images = ["pytest", "pytest-cov", "wurlitzer"] [[package]] name = "cycler" -version = "0.11.0" +version = "0.12.0" description = "Composable style cycles" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"}, - {file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"}, + {file = "cycler-0.12.0-py3-none-any.whl", hash = "sha256:7896994252d006771357777d0251f3e34d266f4fa5f2c572247a80ab01440947"}, + {file = "cycler-0.12.0.tar.gz", hash = "sha256:8cc3a7b4861f91b1095157f9916f748549a617046e67eb7619abed9b34d2c94a"}, ] +[package.extras] +docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] +tests = ["pytest", "pytest-cov", "pytest-xdist"] + [[package]] name = "decorator" version = "5.1.1" @@ -3656,13 +3648,13 @@ anyio = ">=3.0.0" [[package]] name = "wcwidth" -version = "0.2.6" +version = "0.2.7" description = "Measures the displayed width of unicode strings in a terminal" optional = false python-versions = "*" files = [ - {file = "wcwidth-0.2.6-py2.py3-none-any.whl", hash = "sha256:795b138f6875577cd91bba52baf9e445cd5118fd32723b460e30a0af30ea230e"}, - {file = "wcwidth-0.2.6.tar.gz", hash = "sha256:a5220780a404dbe3353789870978e472cfe477761f06ee55077256e509b156d0"}, + {file = "wcwidth-0.2.7-py2.py3-none-any.whl", hash = "sha256:fabf3e32999d9b0dab7d19d845149f326f04fe29bac67709ee071dbd92640a36"}, + {file = "wcwidth-0.2.7.tar.gz", hash = "sha256:1b6d30a98ddd5ce9bbdb33658191fd2423fc9da203fe3ef1855407dcb7ee4e26"}, ] [[package]] diff --git a/benchmark/agbenchmark/challenges/abilities/read_file/data.json b/benchmark/agbenchmark/challenges/abilities/read_file/data.json index 63a2b4a4..74315965 100644 --- a/benchmark/agbenchmark/challenges/abilities/read_file/data.json +++ b/benchmark/agbenchmark/challenges/abilities/read_file/data.json @@ -9,7 +9,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { diff --git a/benchmark/agbenchmark/challenges/abilities/write_file/data.json b/benchmark/agbenchmark/challenges/abilities/write_file/data.json index e27590de..d7600a78 100644 --- a/benchmark/agbenchmark/challenges/abilities/write_file/data.json +++ b/benchmark/agbenchmark/challenges/abilities/write_file/data.json @@ -7,7 +7,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { diff --git a/benchmark/agbenchmark/challenges/alignment/1_distraction/data_draft.json b/benchmark/agbenchmark/challenges/alignment/1_distraction/data_draft.json index 34958140..f5eae494 100644 --- a/benchmark/agbenchmark/challenges/alignment/1_distraction/data_draft.json +++ 
b/benchmark/agbenchmark/challenges/alignment/1_distraction/data_draft.json @@ -7,7 +7,6 @@ "dependencies": [ "TestReadFile" ], - "eval_id": "fc6643da-8965-4f1d-b692-5ac8d56a3de8", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/alignment/2_injection/data_draft.json b/benchmark/agbenchmark/challenges/alignment/2_injection/data_draft.json index 09ee25f3..44ba9a3c 100644 --- a/benchmark/agbenchmark/challenges/alignment/2_injection/data_draft.json +++ b/benchmark/agbenchmark/challenges/alignment/2_injection/data_draft.json @@ -7,7 +7,6 @@ "dependencies": [ "TestRememberGoalSimple" ], - "eval_id": "ae4ef0b8-9fe1-4a24-beda-8cae5eb9b9e5", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/1_tesla_revenue/data.json b/benchmark/agbenchmark/challenges/deprecated/1_tesla_revenue/data.json index c87bc6e9..0f82bdce 100644 --- a/benchmark/agbenchmark/challenges/deprecated/1_tesla_revenue/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/1_tesla_revenue/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestBasicRetrieval" ], - "eval_id": "c2b3d19d-c179-4132-bf7b-b3f9dc629f58", + "eval_id": "2d64d7a5-d664-4b86-9921-0b5e3aa9cf91", "ground": { "answer": "It was $81.462 billion in 2022.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/2_specific/data.json b/benchmark/agbenchmark/challenges/deprecated/2_specific/data.json index 8e3a5228..b650d458 100644 --- a/benchmark/agbenchmark/challenges/deprecated/2_specific/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/2_specific/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval_1.0" ], - "eval_id": "1ab338b1-2213-4c21-8d14-17baf8237416", + "eval_id": "b79898bb-263a-4184-8e4d-0aa52838bfdb", "ground": { "answer": "It was $81.462 billion in 2022.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/3_formatting/data.json b/benchmark/agbenchmark/challenges/deprecated/3_formatting/data.json index 46883901..72c1e15a 100644 --- a/benchmark/agbenchmark/challenges/deprecated/3_formatting/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/3_formatting/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval1.1" ], - "eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d", + "eval_id": "838128f9-79ee-45cf-8a8f-c19b0d576a76", "ground": { "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json b/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json index 9b0b96e5..8328ca92 100644 --- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestDebugSimpleTypoWithGuidance" ], - "eval_id": "5d0b8017-8231-4494-be7c-188f1fac24e7", + "eval_id": "38671c68-89ea-4c51-92a5-1bc35a033c49", "ground": { "answer": "Same as the original DebugWithGuidance test - [0, 1] [2, 5] [0, 3]", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json b/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json index 8966bb93..bc95c48d 100644 --- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval1.0" ], - "eval_id": "09fed110-077a-4b99-8821-ed071977cebe", + "eval_id": "9d4894d8-6f7c-465a-bc91-ca79a21b6ca3", "ground": { "answer": "It was $81.462 billion in 2022.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json b/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json index 302e3eaf..55d5402e 100644 --- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestBasicRetrieval" ], - "eval_id": "7fc79ad5-fd08-451a-8782-5beeba6822f4", + "eval_id": "261ee06f-a7b0-4d5c-bf92-3197763caba6", "ground": { "answer": "\u00a325.89", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/1_list_animals/data.json b/benchmark/agbenchmark/challenges/deprecated/code/1_list_animals/data.json index da929c16..530f20c3 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/1_list_animals/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/1_list_animals/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestWritingCLIFileOrganizer" ], - "eval_id": "270543c8-e101-44e8-8c3f-c2317afd54de", + "eval_id": "94ef736e-c2f1-4fa9-8cbf-a1c0873ee1ee", "ground": { "answer": "A web app where we can list animals and have details about dogs.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/1_password_generator/data.json b/benchmark/agbenchmark/challenges/deprecated/code/1_password_generator/data.json index b6d501b2..01dd0afc 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/1_password_generator/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/1_password_generator/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "f5079f72-ba53-4273-b7e9-fbbd66279ce7", + "eval_id": "15686763-9be7-41e0-902a-80a99fd88089", "ground": { "answer": "password_generator.py is created and satisfies the requirements.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/1_return/data.json b/benchmark/agbenchmark/challenges/deprecated/code/1_return/data.json index 3e53fc7a..17f47ad4 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/1_return/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/1_return/data.json @@ -7,7 +7,7 @@ "dependencies": 
[ "TestReadFile" ], - "eval_id": "50ec32d0-f957-42ee-a01d-160c77aaf4ae", + "eval_id": "bb23fa8c-6df9-410e-8845-bb2d1ebe0c12", "ground": { "answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/2_file_organizer/data.json b/benchmark/agbenchmark/challenges/deprecated/code/2_file_organizer/data.json index c476dbdf..bc3b6253 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/2_file_organizer/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/2_file_organizer/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestPasswordGeneratorEasy" ], - "eval_id": "b1d6a2c5-6382-4e58-99e3-9ceb2cc02f28", + "eval_id": "d6bbefcc-0ee5-4190-b8a1-3721d016f849", "ground": { "answer": "The correct python file is written and organizes the files accordingly", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/2_write/data.json b/benchmark/agbenchmark/challenges/deprecated/code/2_write/data.json index 0265f679..379b19b5 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/2_write/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/2_write/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReturnCodeSimple" ], - "eval_id": "d36bf9f5-ee0f-4d84-b4d8-71f20e8454d5", + "eval_id": "a59a1904-e9d6-443b-adb7-2e1ff972843f", "ground": { "answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/3_modify/data.json b/benchmark/agbenchmark/challenges/deprecated/code/3_modify/data.json index 6f7ab5db..835ac004 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/3_modify/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/3_modify/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReturnCodeWrite" ], - "eval_id": "cffd9e7f-a1d5-478b-a04d-9504f372639a", + "eval_id": "092f3c8a-9723-4262-8e40-93d0cebba98a", "ground": { "answer": "def multiply_int(num, multiplier):\n return num * multiplier\n", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/4_tests/data.json b/benchmark/agbenchmark/challenges/deprecated/code/4_tests/data.json index c9805732..e199b64e 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/4_tests/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/4_tests/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReturnCodeModify" ], - "eval_id": "c1d35de4-3a0c-410f-95d2-745009de9ee8", + "eval_id": "d39b8ed1-5984-40b0-8de6-a1c5eec30bc7", "ground": { "answer": "Just a simple multiple by 2 function. 
Num is 4 so answer is 8", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/d2.1_guided/data.json b/benchmark/agbenchmark/challenges/deprecated/code/d2.1_guided/data.json index b32a9946..74351f7f 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/d2.1_guided/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/d2.1_guided/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReadFile" ], - "eval_id": "9984c549-b44d-40f7-8eff-1901ae35f050", + "eval_id": "a758335b-539b-4d8a-b90e-cf7036952294", "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/d2.2_vague/data.json b/benchmark/agbenchmark/challenges/deprecated/code/d2.2_vague/data.json index 339efec6..e1e34204 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/d2.2_vague/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/d2.2_vague/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestDebugSimpleTypoWithGuidance" ], - "eval_id": "3cebb926-ec58-4ad6-ab2e-a2d4a7f6f30d", + "eval_id": "1d171b68-0374-4b08-ae6a-c7223f89ecc1", "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/d2.3_import/data.json b/benchmark/agbenchmark/challenges/deprecated/code/d2.3_import/data.json index d30add17..10aaef53 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/d2.3_import/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/d2.3_import/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestDebugSimpleTypoWithoutGuidance" ], - "eval_id": "6fbabc52-c9b0-43e7-afbc-4b8d64c1e4da", + "eval_id": "f537c143-ab40-4a95-8cf2-ab90cb829ca5", "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/d3.1_three_sum/data.json b/benchmark/agbenchmark/challenges/deprecated/code/d3.1_three_sum/data.json index 761bc624..cecbfddc 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/d3.1_three_sum/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/d3.1_three_sum/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestFunctionCodeGeneration" ], - "eval_id": "8481ff39-9edf-49eb-ab88-ac5368523b71", + "eval_id": "a38396b8-8f61-49fc-a973-0876a4b6b5e9", "ground": { "answer": "The three_sum function coded properly.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/d3_two_sum/data.json b/benchmark/agbenchmark/challenges/deprecated/code/d3_two_sum/data.json index a6a5778c..e5a818a4 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/d3_two_sum/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/d3_two_sum/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestReturnCodeWrite" ], - "eval_id": "55a60811-a3a1-4a31-b939-77d52dac1749", + "eval_id": "c6703d23-7d2d-4b9b-a729-8014df9a7b4e", "ground": { "answer": "The two_sum function coded properly.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json b/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json index 2ab5a51b..94ecc1e6 100644 --- a/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "bd922943-ffc6-4471-b3f1-5bd8e875c1d8", + "eval_id": "6ff65567-eb1e-4c7d-8b7f-dfc91dc95ed1", "ground": { "answer": "1. Search for flights to Toronto.\n2. Select the cheapest flight.\n3. Book the flight.\n4. 
Pay for the flight.\n5. Confirm the flight.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json index 5f99d69b..0ef6f3dd 100644 --- a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReadFile" ], - "eval_id": "1ce0ccdd-cbe3-4000-a2a4-86d9c147fcfe", + "eval_id": "e3794d05-7617-4b95-9e96-d5f84309f66f", "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt b/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt deleted file mode 100644 index 980a0d5f..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt b/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt deleted file mode 100644 index 980a0d5f..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt b/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt deleted file mode 100644 index 980a0d5f..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/data.json b/benchmark/agbenchmark/challenges/deprecated/interface/read_file/data.json deleted file mode 100644 index 8bc3be8f..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/data.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "category": [ - "interface" - ], - "cutoff": 60, - "dependencies": [ - "TestWriteFile" - ], - "eval_id": "11b6f96f-6712-4a70-98b7-882ee36d7f2a", - "ground": { - "answer": "The content of output.txt should be 'Hello World!'", - "eval": { - "type": "file" - }, - "files": [ - "output.txt" - ], - "should_contain": [ - "Hello World!" - ] - }, - "info": { - "description": "tests the ability for an agent to read a file.", - "difficulty": "interface", - "side_effects": [ - "" - ] - }, - "name": "ReadFile", - "task": "Read the file called file_to_read.txt and write its content to a file called output.txt" -} diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt b/benchmark/agbenchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt deleted file mode 100644 index 03566759..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt +++ /dev/null @@ -1,2 +0,0 @@ -This is a Heading -This is a paragraph. 
\ No newline at end of file diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/search/data.json b/benchmark/agbenchmark/challenges/deprecated/interface/search/data.json deleted file mode 100644 index 5fea07e3..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/search/data.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "category": [ - "interface" - ], - "cutoff": 120, - "dependencies": [ - "TestWriteFile" - ], - "eval_id": "3af8ea29-ae35-45dd-ab14-7a995ba4fc93", - "ground": { - "answer": "This is a Heading\nThis is a paragraph.", - "eval": { - "type": "file" - }, - "files": [ - ".txt" - ], - "should_contain": [ - "Heading", - "paragraph" - ], - "should_not_contain": [ - "The", - "the" - ] - }, - "info": { - "description": "Tests if an llm can search", - "difficulty": "interface", - "side_effects": [ - "" - ] - }, - "name": "Search", - "task": "Open 'https://silennaihin.com/random/plain.html' and paste all of the text on the page in a .txt file" -} diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt b/benchmark/agbenchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt deleted file mode 100644 index 1f275fb9..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt +++ /dev/null @@ -1 +0,0 @@ -Washington diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/write_file/data.json b/benchmark/agbenchmark/challenges/deprecated/interface/write_file/data.json deleted file mode 100644 index b6406aa3..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/write_file/data.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "category": [ - "interface" - ], - "cutoff": 60, - "dependencies": [], - "eval_id": "5242ffc3-a7a9-4a92-a72a-2141f1206f76", - "ground": { - "answer": "The word 'Washington', printed to a .txt file named anything", - "eval": { - "type": "file" - }, - "files": [ - ".txt" - ], - "should_contain": [ - "Washington" - ], - "should_not_contain": [] - }, - "info": { - "description": "Tests the agents ability to write to a file", - "difficulty": "interface", - "side_effects": [ - "" - ] - }, - "name": "WriteFile", - "task": "Write the word 'Washington' to a .txt file" -} diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/data.json b/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/data.json index 8fbe53e0..de88ba64 100644 --- a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestReadFile" ], - "eval_id": "0f0e9d4f-418d-4b43-9f05-20134dc77ed8", + "eval_id": "00227b7f-8952-4577-bfdb-c75db9f1fb19", "ground": { "answer": "2314", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/data.json b/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/data.json index 23707693..77fffcdf 100644 --- a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestBasicMemory" ], - "eval_id": "ce6e588b-3ac6-45bb-892e-cfed7c7724bf", + "eval_id": "9c48c169-0cf0-46d0-9985-a31710bf398b", "ground": { "answer": "3145\n3791\n9317\n9471", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/data.json b/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/data.json index 
d4d4ae7c..850a9c5c 100644 --- a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRememberMultipleIds" ], - "eval_id": "29a88045-67cf-45c1-a509-9a5f53a61ad5", + "eval_id": "480eaff6-e287-4741-9fe0-a4634e0ad491", "ground": { "answer": "3145\n3791\n9317\n9471", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/data.json b/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/data.json index cb2651dd..0f62d5c0 100644 --- a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRememberMultipleIdsWithNoise" ], - "eval_id": "1c5d4196-4171-4236-a944-d2905d2dc3c1", + "eval_id": "0c8cb5e5-7f7a-4475-977b-68ac2673d77a", "ground": { "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/1_tesla_revenue/data.json b/benchmark/agbenchmark/challenges/deprecated/retrieval/1_tesla_revenue/data.json index c8d9708f..2cd9ab20 100644 --- a/benchmark/agbenchmark/challenges/deprecated/retrieval/1_tesla_revenue/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/retrieval/1_tesla_revenue/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestBasicRetrieval" ], - "eval_id": "061eab17-1104-4ccf-ad28-6614cd1d12ea", + "eval_id": "c379905b-b7d7-49ea-89d8-9b0c113db75f", "ground": { "answer": "It was $81.462 billion in 2022.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/2_specific/data.json b/benchmark/agbenchmark/challenges/deprecated/retrieval/2_specific/data.json index c69293f7..17d61f03 100644 --- a/benchmark/agbenchmark/challenges/deprecated/retrieval/2_specific/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/retrieval/2_specific/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval1.0" ], - "eval_id": "85be4356-043c-46a3-91db-23f4bc9d32b0", + "eval_id": "d9a4b0ff-628c-42d2-99ad-ab6053f1c5bb", "ground": { "answer": "It was $81.462 billion in 2022.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/3_formatting/data.json b/benchmark/agbenchmark/challenges/deprecated/retrieval/3_formatting/data.json index 442a3616..e3720860 100644 --- a/benchmark/agbenchmark/challenges/deprecated/retrieval/3_formatting/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/retrieval/3_formatting/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval1.1" ], - "eval_id": "02c4f6ec-3246-4677-afb9-c9c836fd78a6", + "eval_id": "e92b86ba-63f3-4322-8f98-4970190d1e69", "ground": { "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/data.json b/benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/data.json index 77190c86..33507af8 100644 --- a/benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestSearch" ], - "eval_id": "2066032d-2228-41d6-b05d-420fff5b1ce0", + "eval_id": "8322a2c3-19e9-46ee-9ae0-6de0ae95becc", "ground": { "answer": "\u00a325.89", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r3/data.json b/benchmark/agbenchmark/challenges/deprecated/retrieval/r3/data.json index 725a75b4..6e46e85f 100644 --- a/benchmark/agbenchmark/challenges/deprecated/retrieval/r3/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/retrieval/r3/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval1.2" ], - "eval_id": "202ee93e-bae1-4889-9e9f-88fb0ed9fe00", + "eval_id": "77df2ad6-ae8f-42f0-9a94-fc92c9f88fdd", "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/1_simple/data.json b/benchmark/agbenchmark/challenges/deprecated/safety/1_simple/data.json index b2a6ad6c..a46cd5a5 100644 --- a/benchmark/agbenchmark/challenges/deprecated/safety/1_simple/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/safety/1_simple/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReadFile" ], - "eval_id": "c61408a4-387d-4731-9ece-2f77b968f0b8", + "eval_id": "d5a39a76-c804-4478-a022-9a808db6152a", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/2_medium/data.json b/benchmark/agbenchmark/challenges/deprecated/safety/2_medium/data.json index be9a925c..7672c539 100644 --- a/benchmark/agbenchmark/challenges/deprecated/safety/2_medium/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/safety/2_medium/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestRememberGoal_Simple" ], - "eval_id": "fc2e0963-2f22-49c1-b566-ab7022212687", + "eval_id": "aae6a6eb-fa86-498a-9178-b7be733c6ffc", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/3_advanced/data.json b/benchmark/agbenchmark/challenges/deprecated/safety/3_advanced/data.json index c3b01124..8ee39dc6 100644 --- a/benchmark/agbenchmark/challenges/deprecated/safety/3_advanced/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/safety/3_advanced/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestRememberGoal_Medium" ], - "eval_id": "de0f553d-194f-4853-9646-eb035133fd61", + "eval_id": "29241c0f-594f-4843-b0e2-8230cb8784fd", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/4_hard/data.json b/benchmark/agbenchmark/challenges/deprecated/safety/4_hard/data.json index cd629b8c..fb36a104 100644 --- a/benchmark/agbenchmark/challenges/deprecated/safety/4_hard/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/safety/4_hard/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestRememberGoal_Advanced" ], - "eval_id": 
"6e79c281-cda3-4604-b60d-3629bbc5faba", + "eval_id": "290272fa-36e1-4c75-b58f-eb76f4a938b7", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data.json b/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data.json index 726d05cf..0f4e31c9 100644 --- a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReadFile" ], - "eval_id": "7bfe83c5-ba16-4fba-9edb-d0e8837c0abc", + "eval_id": "0bc68658-389f-4427-94af-9b925df7afe4", "ground": { "answer": "All of the elements should be contained in the output files", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data.json b/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data.json index 98ffe5e4..6d8f2240 100644 --- a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestWrite5FilesWithArray" ], - "eval_id": "54c4fd15-e8e5-4783-94b6-2e25ef8ef388", + "eval_id": "0609fd6e-a753-4f50-89a0-0dc81ec58994", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/data.json b/benchmark/agbenchmark/challenges/library/ethereum/check_price/data.json index 2cc26141..b3890052 100644 --- a/benchmark/agbenchmark/challenges/library/ethereum/check_price/data.json +++ b/benchmark/agbenchmark/challenges/library/ethereum/check_price/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "c7372901-5fc0-4431-8c87-302fe3583ab7", + "eval_id": "d14d6a59-a355-424c-a24b-a8aca580e32c", "ground": { "answer": "Should be a number within a 50 USD range of the actual price according to coingecko.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/data.json b/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/data.json index 8994f892..f89c90c6 100644 --- a/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/data.json +++ b/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/data.json @@ -1,12 +1,13 @@ { "category": [ - "coding" + "coding", + "general" ], "cutoff": 60, "dependencies": [ "TestWriteFile" ], - "eval_id": "29a10990-2584-4602-8b9d-c217f6edbc4f", + "eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4", "ground": { "answer": "The three_sum function coded properly.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json b/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json index b5a471af..a08200e5 100644 --- a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json +++ b/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestThreeSum" ], - "eval_id": "0823b577-64f2-477b-856d-16726fe464b0", + "eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f", "ground": { "answer": "password_generator.py is created and satisfies the requirements.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/data.json b/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/data.json index ba56a9f9..c732990e 100644 --- 
a/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/data.json +++ b/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/data.json @@ -1,12 +1,13 @@ { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestPasswordGenerator" ], - "eval_id": "6ace62be-6c18-431a-947f-72fb20984b58", + "eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e", "ground": { "answer": "The correct python file is written and organizes the files accordingly", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/data.json b/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/data.json index 08e9aa90..e3953140 100644 --- a/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/data.json +++ b/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestFileOrganizer" ], - "eval_id": "41ca1035-ceca-4e0c-91ab-66ed0b350273", + "eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c", "ground": { "answer": "The correct python file for a basic url shortener CLI", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/data.json b/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/data.json index e0c27ecb..63f19ce6 100644 --- a/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/data.json +++ b/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/data.json @@ -1,12 +1,13 @@ { "category": [ - "coding" + "coding", + "general" ], "cutoff": 150, "dependencies": [ "TestUrlShortener" ], - "eval_id": "54c3d7e9-71d6-476b-b045-cf0aaf118f95", + "eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0", "ground": { "answer": "The correct python file for a TicTacToe game is written", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/data.json b/benchmark/agbenchmark/challenges/verticals/code/6_battleship/data.json index a61e4a05..023a7b8f 100644 --- a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/data.json +++ b/benchmark/agbenchmark/challenges/verticals/code/6_battleship/data.json @@ -1,13 +1,14 @@ { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestTicTacToe", "TestReadFile" ], - "eval_id": "5a32418d-1c3a-4af1-8dc4-8d4c29bed21a", + "eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1", "ground": { "answer": "The implementation of battleship that passes all the tests.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/data/1_sort_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/1_sort_csv/data.json index 8515af89..00370108 100644 --- a/benchmark/agbenchmark/challenges/verticals/data/1_sort_csv/data.json +++ b/benchmark/agbenchmark/challenges/verticals/data/1_sort_csv/data.json @@ -1,12 +1,13 @@ { "category": [ - "data" + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestReadFile" ], - "eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15", + "eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95", "ground": { "answer": "The csv sorted by date", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json index d190b5c5..7a952159 100644 --- a/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json +++ b/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestSortCsv" ], - "eval_id": "6c58e229-aa22-4c4f-a053-4a78931ad41e", + "eval_id": 
"6e2bf1f0-6842-4704-8ed1-b17c2065bbac", "ground": { "answer": "The csv labelled", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/data/3_combine_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/3_combine_csv/data.json index 68578206..3964785f 100644 --- a/benchmark/agbenchmark/challenges/verticals/data/3_combine_csv/data.json +++ b/benchmark/agbenchmark/challenges/verticals/data/3_combine_csv/data.json @@ -1,12 +1,13 @@ { "category": [ - "data" + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestLabelCsv" ], - "eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375", + "eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b", "ground": { "answer": "The csv data is combined", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_in/file1.csv b/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_in/file1.csv new file mode 100644 index 00000000..55de8371 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_in/file1.csv @@ -0,0 +1,12 @@ +Date Description Amount Category +2023-01-01 Grocery Store 52.3 Groceries +2023-01-02 Pharmacy 12.5 Healthcare +2023-01-03 Gas Station 29.1 Transportation +2023-01-04 Water 19 Utilities +2023-01-05 Grocery Store 60.25 Groceries +2023-01-06 Coffee Shop 4.5 Dining +2023-01-07 Cinema Tickets 20 Entertainment +2023-01-08 Book Store 30.4 Shopping +2023-01-09 Restaurant Dinner 55.8 Dining +2023-01-10 Electric Bill 65.35 Utilities +2023-01-11 Grocery Store 45.1 Groceries diff --git a/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_out/output.txt b/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_out/output.txt new file mode 100644 index 00000000..871727de --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_out/output.txt @@ -0,0 +1 @@ +84 diff --git a/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/data.json new file mode 100644 index 00000000..695fc6d2 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/data.json @@ -0,0 +1,32 @@ +{ + "category": [ + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "84" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a small csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "AnswerQuestionSmallCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
+} diff --git a/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_in/file1.csv b/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_in/file1.csv new file mode 100644 index 00000000..1915dfaa --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_in/file1.csv @@ -0,0 +1,305 @@ +Date Description Amount Category +2023-01-01 Grocery Store 52.3 Groceries +2023-01-02 Pharmacy 12.5 Healthcare +2023-01-03 Gas Station 29.1 Transportation +2023-01-04 Cinema Tickets 19 Entertainment +2023-01-05 Grocery Store 60.25 Groceries +2023-01-06 Coffee Shop 4.5 Dining +2023-01-07 Cinema Tickets 20 Entertainment +2023-01-08 Book Store 30.4 Shopping +2023-01-09 Restaurant Dinner 55.8 Dining +2023-01-10 Electric Bill 65.35 Utilities +2023-01-11 Grocery Store 45.1 Groceries +2023-01-12 Clothing Store 100.2 Shopping +2023-01-13 Pharmacy 20.3 Healthcare +2023-01-14 Coffee Shop 4.5 Dining +2023-01-15 Restaurant Dinner 50 Dining +2023-01-16 Gas Station 32.1 Transportation +2023-01-17 Online Shopping 80 Shopping +2023-01-18 Water Bill 20.35 Utilities +2023-01-19 Grocery Store 55.6 Groceries +2023-01-20 Gas Station 28 Transportation +2023-01-21 Pharmacy 15.4 Healthcare +2023-01-22 Phone Bill 40 Utilities +2023-01-23 Cinema Tickets 20 Entertainment +2023-01-24 Coffee Shop 5.5 Dining +2023-01-25 Book Purchase 14 Shopping +2023-01-26 Restaurant Lunch 30 Dining +2023-01-27 Public Transport 20 Transportation +2023-01-28 Grocery Store 58.25 Groceries +2023-01-29 Online Shopping 70 Shopping +2023-01-30 Grocery Store 62.1 Groceries +2023-01-31 Medical Prescription 10.4 Healthcare +2023-02-01 Gas Station 33 Transportation +2023-02-02 Coffee Shop 6 Dining +2023-02-03 Cinema Tickets 22 Entertainment +2023-02-04 Book Store 28.4 Shopping +2023-02-05 Internet Bill 50 Utilities +2023-02-06 Grocery Store 60.1 Groceries +2023-02-07 Clothing Store 120 Shopping +2023-02-08 Grocery Store 58.25 Groceries +2023-02-09 Coffee Shop 4.5 Dining +2023-02-10 Electric Bill 70 Utilities +2023-02-11 Grocery Store 50.1 Groceries +2023-02-12 Public Transport 18 Transportation +2023-02-13 Pharmacy 24 Healthcare +2023-02-14 Restaurant Dinner 60 Dining +2023-02-15 Medical Prescription 11.4 Healthcare +2023-02-16 Gas Station 30 Transportation +2023-02-17 Online Shopping 85 Shopping +2023-02-18 Water Bill 18 Utilities +2023-02-19 Grocery Store 53.6 Groceries +2023-02-20 Public Transport 22 Transportation +2023-02-21 Pharmacy 10 Healthcare +2023-02-22 Phone Bill 42 Utilities +2023-02-23 Cinema Tickets 24 Entertainment +2023-02-24 Coffee Shop 6 Dining +2023-02-25 Book Purchase 16 Shopping +2023-02-26 Restaurant Lunch 28 Dining +2023-02-27 Gas Station 34 Transportation +2023-02-28 Grocery Store 56 Groceries +2023-03-01 Online Shopping 90 Groceries +2023-03-02 Dentist Appointment 130 Healthcare +2023-03-03 Grocery Store 63.45 Groceries +2023-03-04 Cinema Tickets 21 Entertainment +2023-03-05 Coffee Shop 5.8 Dining +2023-03-06 Electric Bill 67.5 Utilities +2023-03-07 Gas Station 31.2 Transportation +2023-03-08 Restaurant Dinner 58 Dining +2023-03-09 Pharmacy 18.3 Healthcare +2023-03-10 Grocery Store 64.7 Groceries +2023-03-11 Book Store 25.4 Shopping +2023-03-12 Online Shopping 78 Shopping +2023-03-13 Coffee Shop 6.5 Dining +2023-03-14 Museum Tickets 15 Entertainment +2023-03-15 Internet Bill 52 Utilities +2023-03-16 Public Transport 19.5 Transportation +2023-03-17 Clothing Store 105.6 Shopping +2023-03-18 Phone Bill 41 Utilities +2023-03-19 Coffee Shop 5 Dining 
+2023-03-20 Grocery Store 59.2 Groceries +2023-03-21 Gas Station 29.8 Transportation +2023-03-22 Restaurant Lunch 32 Dining +2023-03-23 Pharmacy 16.5 Healthcare +2023-03-24 Concert Tickets 50 Entertainment +2023-03-25 Coffee Shop 5.5 Dining +2023-03-26 Grocery Store 61.8 Groceries +2023-03-27 Online Shopping 82 Shopping +2023-03-28 Water Bill 19.35 Utilities +2023-03-29 Public Transport 21 Transportation +2023-03-30 Book Purchase 17 Shopping +2023-03-31 Grocery Store 60 Groceries +2023-04-01 Cinema Tickets 23 Entertainment +2023-04-02 Pharmacy 17.4 Healthcare +2023-04-03 Gas Station 33.5 Transportation +2023-04-04 Restaurant Dinner 56.7 Dining +2023-04-05 Grocery Store 65.3 Groceries +2023-04-06 Coffee Shop 5.9 Dining +2023-04-07 Online Shopping 87 Shopping +2023-04-08 Electric Bill 69 Utilities +2023-04-09 Clothing Store 112.5 Shopping +2023-04-10 Grocery Store 57.4 Groceries +2023-04-11 Book Store 26.3 Shopping +2023-04-12 Gas Station 30.9 Transportation +2023-04-13 Coffee Shop 6.8 Dining +2023-04-14 Zoo Tickets 24 Entertainment +2023-04-15 Internet Bill 53 Utilities +2023-04-16 Public Transport 20.5 Transportation +2023-04-17 Restaurant Lunch 34 Dining +2023-04-18 Phone Bill 43 Utilities +2023-04-19 Coffee Shop 5.2 Dining +2023-04-20 Grocery Store 58.9 Groceries +2023-04-21 Pharmacy 14.7 Healthcare +2023-04-22 Cinema Tickets 25 Entertainment +2023-04-23 Online Shopping 90 Shopping +2023-04-24 Gas Station 31.4 Transportation +2023-04-25 Water Bill 21 Utilities +2023-04-26 Grocery Store 62.5 Groceries +2023-04-27 Coffee Shop 5.7 Dining +2023-04-28 Book Purchase 18.5 Shopping +2023-04-29 Public Transport 22 Transportation +2023-04-30 Grocery Store 63 Groceries +2023-05-01 Theater Tickets 45 Entertainment +2023-05-02 Dentist Appointment 135 Healthcare +2023-05-03 Gas Station 32.2 Transportation +2023-05-04 Restaurant Dinner 59 Dining +2023-05-05 Grocery Store 66.1 Groceries +2023-05-06 Coffee Shop 6 Dining +2023-05-07 Online Shopping 89 Shopping +2023-05-08 Electric Bill 70.5 Utilities +2023-05-09 Clothing Store 110 Shopping +2023-05-10 Grocery Store 59.7 Groceries +2023-05-11 Coffee Shop 6.1 Dining +2023-05-12 Book Store 29.2 Shopping +2023-05-13 Gas Station 29.9 Transportation +2023-05-14 Museum Tickets 16 Entertainment +2023-05-15 Internet Bill 52.5 Utilities +2023-05-16 Public Transport 21.3 Transportation +2023-05-17 Restaurant Lunch 35.4 Dining +2023-05-18 Phone Bill 43.5 Utilities +2023-05-19 Grocery Store 64.8 Groceries +2023-05-20 Pharmacy 15.2 Healthcare +2023-05-21 Cinema Tickets 26 Entertainment +2023-05-22 Coffee Shop 6.3 Dining +2023-05-23 Gas Station 30.8 Transportation +2023-05-24 Online Shopping 92.5 Shopping +2023-05-25 Water Bill 20.5 Utilities +2023-05-26 Grocery Store 61.9 Groceries +2023-05-27 Public Transport 23 Transportation +2023-05-28 Book Purchase 19 Shopping +2023-05-29 Coffee Shop 5.9 Dining +2023-05-30 Restaurant Dinner 57.8 Dining +2023-05-31 Grocery Store 66.7 Groceries +2023-06-01 Theater Tickets 47 Entertainment +2023-06-02 Dentist Appointment 140 Healthcare +2023-06-03 Gas Station 31.6 Transportation +2023-06-04 Coffee Shop 6.4 Dining +2023-06-05 Online Shopping 94 Shopping +2023-06-06 Electric Bill 72 Utilities +2023-06-07 Restaurant Lunch 36 Dining +2023-06-08 Grocery Store 65.3 Groceries +2023-06-09 Pharmacy 17 Healthcare +2023-06-10 Cinema Tickets 27.5 Entertainment +2023-06-11 Public Transport 21.5 Transportation +2023-06-12 Book Store 30 Shopping +2023-06-13 Gas Station 28.7 Transportation +2023-06-14 Coffee Shop 6.6 Dining +2023-06-15 Internet Bill 
53.5 Utilities +2023-06-16 Zoo Tickets 28 Entertainment +2023-06-17 Grocery Store 67.4 Groceries +2023-06-18 Phone Bill 44 Utilities +2023-06-19 Restaurant Dinner 60 Dining +2023-06-20 Coffee Shop 6.7 Dining +2023-06-21 Public Transport 22.5 Transportation +2023-06-22 Online Shopping 96 Shopping +2023-06-23 Gas Station 32.4 Transportation +2023-06-24 Cinema Tickets 29 Entertainment +2023-06-25 Book Purchase 20 Shopping +2023-06-26 Grocery Store 68.3 Groceries +2023-06-27 Water Bill 22 Utilities +2023-06-28 Pharmacy 18.5 Healthcare +2023-06-29 Restaurant Lunch 37 Dining +2023-06-30 Coffee Shop 7 Dining +2023-07-01 Grocery Store 69.5 Groceries +2023-07-02 Theater Tickets 49 Entertainment +2023-07-03 Gas Station 33.2 Transportation +2023-07-04 Park Picnic 40 Dining +2023-07-05 Electric Bill 73.5 Utilities +2023-07-06 Clothing Store 120 Shopping +2023-07-07 Online Shopping 98 Shopping +2023-07-08 Grocery Store 70.6 Groceries +2023-07-09 Coffee Shop 7.1 Dining +2023-07-10 Internet Bill 54 Utilities +2023-07-11 Public Transport 23.5 Transportation +2023-07-12 Museum Tickets 18 Entertainment +2023-07-13 Book Store 31 Shopping +2023-07-14 Gas Station 29.9 Transportation +2023-07-15 Coffee Shop 7.2 Dining +2023-07-16 Restaurant Dinner 62 Dining +2023-07-17 Grocery Store 71.8 Groceries +2023-07-18 Phone Bill 45 Utilities +2023-07-19 Zoo Tickets 30 Entertainment +2023-07-20 Coffee Shop 7.3 Dining +2023-07-21 Public Transport 24 Transportation +2023-07-22 Online Shopping 99.5 Shopping +2023-07-23 Gas Station 34 Transportation +2023-07-24 Cinema Tickets 31 Entertainment +2023-07-25 Book Purchase 21.5 Shopping +2023-07-26 Grocery Store 72.9 Groceries +2023-07-27 Water Bill 23.5 Utilities +2023-07-28 Pharmacy 19.5 Healthcare +2023-07-29 Restaurant Lunch 38.5 Dining +2023-07-30 Coffee Shop 7.4 Dining +2023-07-31 Grocery Store 73.7 Groceries +2023-08-01 Theater Tickets 50 Entertainment +2023-08-02 Gas Station 34.5 Transportation +2023-08-03 Restaurant Dinner 63.5 Dining +2023-08-04 Online Shopping 101 Shopping +2023-08-05 Electric Bill 75 Utilities +2023-08-06 Grocery Store 74.6 Groceries +2023-08-07 Coffee Shop 7.5 Dining +2023-08-08 Phone Bill 46 Utilities +2023-08-09 Public Transport 24.5 Transportation +2023-08-10 Cinema Tickets 32.5 Entertainment +2023-08-11 Book Store 32 Shopping +2023-08-12 Gas Station 35 Transportation +2023-08-13 Coffee Shop 7.6 Dining +2023-08-14 Park Picnic 42 Dining +2023-08-15 Internet Bill 55 Utilities +2023-08-16 Grocery Store 76.3 Groceries +2023-08-17 Clothing Store 125 Shopping +2023-08-18 Pharmacy 20.5 Healthcare +2023-08-19 Restaurant Lunch 40 Dining +2023-08-20 Coffee Shop 7.7 Dining +2023-08-21 Museum Tickets 19 Entertainment +2023-08-22 Public Transport 25 Transportation +2023-08-23 Online Shopping 103 Shopping +2023-08-24 Grocery Store 77.8 Groceries +2023-08-25 Water Bill 24.5 Utilities +2023-08-26 Zoo Tickets 32 Entertainment +2023-08-27 Coffee Shop 7.8 Dining +2023-08-28 Gas Station 35.5 Transportation +2023-08-29 Book Purchase 23 Shopping +2023-08-30 Grocery Store 78.9 Groceries +2023-08-31 Cinema Tickets 34 Entertainment +2023-09-01 Theater Tickets 52 Entertainment +2023-09-02 Gas Station 36 Transportation +2023-09-03 Restaurant Dinner 65 Dining +2023-09-04 Online Shopping 105 Shopping +2023-09-05 Electric Bill 76.5 Utilities +2023-09-06 Grocery Store 79.6 Groceries +2023-09-07 Coffee Shop 8 Dining +2023-09-08 Phone Bill 47 Utilities +2023-09-09 Public Transport 26 Transportation +2023-09-10 Cinema Tickets 35.5 Entertainment +2023-09-11 Book Store 33 Shopping 
+2023-09-12 Gas Station 36.5 Transportation +2023-09-13 Coffee Shop 8.2 Dining +2023-09-14 Park Picnic 44 Dining +2023-09-15 Internet Bill 56 Utilities +2023-09-16 Grocery Store 80.4 Groceries +2023-09-17 Clothing Store 130 Shopping +2023-09-18 Pharmacy 21.5 Healthcare +2023-09-19 Restaurant Lunch 41.5 Dining +2023-09-20 Coffee Shop 8.4 Dining +2023-09-21 Museum Tickets 20 Entertainment +2023-09-22 Public Transport 26.5 Transportation +2023-09-23 Online Shopping 107 Shopping +2023-09-24 Grocery Store 81.3 Groceries +2023-09-25 Water Bill 25.5 Utilities +2023-09-26 Zoo Tickets 33.5 Entertainment +2023-09-27 Coffee Shop 8.6 Dining +2023-09-28 Gas Station 37.5 Transportation +2023-09-29 Book Purchase 24.5 Shopping +2023-09-30 Grocery Store 82.7 Groceries +2023-10-01 Cinema Tickets 36 Entertainment +2023-10-02 Theater Tickets 54 Entertainment +2023-10-03 Gas Station 38 Transportation +2023-10-04 Restaurant Dinner 66.5 Dining +2023-10-05 Online Shopping 109 Shopping +2023-10-06 Electric Bill 78 Utilities +2023-10-07 Grocery Store 83.9 Groceries +2023-10-08 Coffee Shop 8.8 Dining +2023-10-09 Phone Bill 48 Utilities +2023-10-10 Public Transport 27.5 Transportation +2023-10-11 Cinema Tickets 37.5 Entertainment +2023-10-12 Book Store 34.5 Shopping +2023-10-13 Gas Station 39.5 Transportation +2023-10-14 Coffee Shop 9 Dining +2023-10-15 Park Picnic 46 Dining +2023-10-16 Internet Bill 57.5 Utilities +2023-10-17 Grocery Store 85.2 Groceries +2023-10-18 Clothing Store 135 Shopping +2023-10-19 Pharmacy 22.5 Healthcare +2023-10-20 Restaurant Lunch 43 Dining +2023-10-21 Coffee Shop 9.2 Dining +2023-10-22 Museum Tickets 21.5 Entertainment +2023-10-23 Public Transport 28 Transportation +2023-10-24 Online Shopping 111 Shopping +2023-10-25 Grocery Store 86.5 Groceries +2023-10-26 Water Bill 26.5 Utilities +2023-10-27 Zoo Tickets 35 Entertainment +2023-10-28 Coffee Shop 9.4 Dining +2023-10-29 Gas Station 40.5 Transportation +2023-10-30 Book Purchase 26 Shopping +2023-10-31 Grocery Store 88 Groceries diff --git a/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_out/output.txt b/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_out/output.txt new file mode 100644 index 00000000..e4869438 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_out/output.txt @@ -0,0 +1 @@ +1861.55 diff --git a/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/data.json new file mode 100644 index 00000000..24b7179d --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/data.json @@ -0,0 +1,31 @@ +{ + "category": [ + "data" + ], + "cutoff": 90, + "dependencies": [ + "TestAnswerQuestionSmallCsv" + ], + "eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "AnswerQuestionCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
+} diff --git a/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file1.csv b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file1.csv new file mode 100644 index 00000000..7c6eddd6 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file1.csv @@ -0,0 +1,305 @@ +Category ID +Dining 6 +Dining 9 +Dining 14 +Dining 15 +Dining 24 +Dining 26 +Dining 33 +Dining 40 +Dining 45 +Dining 55 +Dining 57 +Dining 64 +Dining 67 +Dining 72 +Dining 78 +Dining 81 +Dining 84 +Dining 94 +Dining 96 +Dining 103 +Dining 107 +Dining 109 +Dining 117 +Dining 124 +Dining 126 +Dining 131 +Dining 137 +Dining 142 +Dining 149 +Dining 150 +Dining 155 +Dining 158 +Dining 165 +Dining 170 +Dining 171 +Dining 180 +Dining 181 +Dining 185 +Dining 190 +Dining 196 +Dining 197 +Dining 201 +Dining 210 +Dining 211 +Dining 215 +Dining 219 +Dining 225 +Dining 226 +Dining 231 +Dining 232 +Dining 239 +Dining 246 +Dining 250 +Dining 256 +Dining 257 +Dining 262 +Dining 263 +Dining 270 +Dining 277 +Dining 281 +Dining 287 +Dining 288 +Dining 293 +Dining 294 +Dining 301 +Entertainment 4 +Entertainment 7 +Entertainment 23 +Entertainment 34 +Entertainment 54 +Entertainment 63 +Entertainment 73 +Entertainment 83 +Entertainment 91 +Entertainment 104 +Entertainment 112 +Entertainment 121 +Entertainment 134 +Entertainment 141 +Entertainment 152 +Entertainment 161 +Entertainment 167 +Entertainment 175 +Entertainment 183 +Entertainment 193 +Entertainment 200 +Entertainment 205 +Entertainment 213 +Entertainment 222 +Entertainment 233 +Entertainment 238 +Entertainment 243 +Entertainment 244 +Entertainment 253 +Entertainment 264 +Entertainment 269 +Entertainment 274 +Entertainment 275 +Entertainment 284 +Entertainment 295 +Entertainment 300 +Groceries 1 +Groceries 5 +Groceries 11 +Groceries 19 +Groceries 28 +Groceries 30 +Groceries 37 +Groceries 39 +Groceries 42 +Groceries 50 +Groceries 59 +Groceries 60 +Groceries 62 +Groceries 69 +Groceries 79 +Groceries 85 +Groceries 90 +Groceries 95 +Groceries 100 +Groceries 110 +Groceries 116 +Groceries 120 +Groceries 125 +Groceries 130 +Groceries 139 +Groceries 146 +Groceries 151 +Groceries 159 +Groceries 168 +Groceries 177 +Groceries 182 +Groceries 189 +Groceries 198 +Groceries 207 +Groceries 212 +Groceries 218 +Groceries 228 +Groceries 236 +Groceries 242 +Groceries 249 +Groceries 259 +Groceries 267 +Groceries 273 +Groceries 280 +Groceries 290 +Groceries 298 +Groceries 304 +Healthcare 2 +Healthcare 13 +Healthcare 21 +Healthcare 31 +Healthcare 44 +Healthcare 46 +Healthcare 52 +Healthcare 61 +Healthcare 68 +Healthcare 82 +Healthcare 92 +Healthcare 111 +Healthcare 122 +Healthcare 140 +Healthcare 153 +Healthcare 160 +Healthcare 179 +Healthcare 209 +Healthcare 230 +Healthcare 261 +Healthcare 292 +Shopping 8 +Shopping 12 +Shopping 17 +Shopping 25 +Shopping 29 +Shopping 35 +Shopping 38 +Shopping 48 +Shopping 56 +Shopping 70 +Shopping 71 +Shopping 76 +Shopping 86 +Shopping 89 +Shopping 97 +Shopping 99 +Shopping 101 +Shopping 113 +Shopping 118 +Shopping 127 +Shopping 129 +Shopping 132 +Shopping 144 +Shopping 148 +Shopping 156 +Shopping 163 +Shopping 173 +Shopping 176 +Shopping 187 +Shopping 188 +Shopping 194 +Shopping 203 +Shopping 206 +Shopping 216 +Shopping 223 +Shopping 229 +Shopping 235 +Shopping 241 +Shopping 247 +Shopping 254 +Shopping 260 +Shopping 266 +Shopping 272 +Shopping 278 +Shopping 285 +Shopping 291 +Shopping 297 +Shopping 303 +Transportation 3 +Transportation 16 
+Transportation 20 +Transportation 27 +Transportation 32 +Transportation 43 +Transportation 47 +Transportation 51 +Transportation 58 +Transportation 66 +Transportation 75 +Transportation 80 +Transportation 88 +Transportation 93 +Transportation 102 +Transportation 106 +Transportation 114 +Transportation 119 +Transportation 123 +Transportation 133 +Transportation 136 +Transportation 143 +Transportation 147 +Transportation 154 +Transportation 162 +Transportation 164 +Transportation 172 +Transportation 174 +Transportation 184 +Transportation 192 +Transportation 195 +Transportation 202 +Transportation 204 +Transportation 214 +Transportation 221 +Transportation 224 +Transportation 234 +Transportation 240 +Transportation 245 +Transportation 252 +Transportation 255 +Transportation 265 +Transportation 271 +Transportation 276 +Transportation 283 +Transportation 286 +Transportation 296 +Transportation 302 +Utilities 10 +Utilities 18 +Utilities 22 +Utilities 36 +Utilities 41 +Utilities 49 +Utilities 53 +Utilities 65 +Utilities 74 +Utilities 77 +Utilities 87 +Utilities 98 +Utilities 105 +Utilities 108 +Utilities 115 +Utilities 128 +Utilities 135 +Utilities 138 +Utilities 145 +Utilities 157 +Utilities 166 +Utilities 169 +Utilities 178 +Utilities 186 +Utilities 191 +Utilities 199 +Utilities 208 +Utilities 217 +Utilities 220 +Utilities 227 +Utilities 237 +Utilities 248 +Utilities 251 +Utilities 258 +Utilities 268 +Utilities 279 +Utilities 282 +Utilities 289 +Utilities 299 diff --git a/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file2.csv b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file2.csv new file mode 100644 index 00000000..e95eba53 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file2.csv @@ -0,0 +1,305 @@ +Date Description Amount ID +2023-01-01 Grocery Store 52.3 1 +2023-01-02 Pharmacy 12.5 2 +2023-01-03 Gas Station 29.1 3 +2023-01-04 Cinema Tickets 19 4 +2023-01-05 Grocery Store 60.25 5 +2023-01-06 Coffee Shop 4.5 6 +2023-01-07 Cinema Tickets 20 7 +2023-01-08 Book Store 30.4 8 +2023-01-09 Restaurant Dinner 55.8 9 +2023-01-10 Electric Bill 65.35 10 +2023-01-11 Grocery Store 45.1 11 +2023-01-12 Clothing Store 100.2 12 +2023-01-13 Pharmacy 20.3 13 +2023-01-14 Coffee Shop 4.5 14 +2023-01-15 Restaurant Dinner 50 15 +2023-01-16 Gas Station 32.1 16 +2023-01-17 Online Shopping 80 17 +2023-01-18 Water Bill 20.35 18 +2023-01-19 Grocery Store 55.6 19 +2023-01-20 Gas Station 28 20 +2023-01-21 Pharmacy 15.4 21 +2023-01-22 Phone Bill 40 22 +2023-01-23 Cinema Tickets 20 23 +2023-01-24 Coffee Shop 5.5 24 +2023-01-25 Book Purchase 14 25 +2023-01-26 Restaurant Lunch 30 26 +2023-01-27 Public Transport 20 27 +2023-01-28 Grocery Store 58.25 28 +2023-01-29 Online Shopping 70 29 +2023-01-30 Grocery Store 62.1 30 +2023-01-31 Medical Prescription 10.4 31 +2023-02-01 Gas Station 33 32 +2023-02-02 Coffee Shop 6 33 +2023-02-03 Cinema Tickets 22 34 +2023-02-04 Book Store 28.4 35 +2023-02-05 Internet Bill 50 36 +2023-02-06 Grocery Store 60.1 37 +2023-02-07 Clothing Store 120 38 +2023-02-08 Grocery Store 58.25 39 +2023-02-09 Coffee Shop 4.5 40 +2023-02-10 Electric Bill 70 41 +2023-02-11 Grocery Store 50.1 42 +2023-02-12 Public Transport 18 43 +2023-02-13 Pharmacy 24 44 +2023-02-14 Restaurant Dinner 60 45 +2023-02-15 Medical Prescription 11.4 46 +2023-02-16 Gas Station 30 47 +2023-02-17 Online Shopping 85 48 +2023-02-18 Water Bill 18 49 +2023-02-19 Grocery Store 53.6 50 
+2023-02-20 Public Transport 22 51 +2023-02-21 Pharmacy 10 52 +2023-02-22 Phone Bill 42 53 +2023-02-23 Cinema Tickets 24 54 +2023-02-24 Coffee Shop 6 55 +2023-02-25 Book Purchase 16 56 +2023-02-26 Restaurant Lunch 28 57 +2023-02-27 Gas Station 34 58 +2023-02-28 Grocery Store 56 59 +2023-03-01 Online Shopping 90 60 +2023-03-02 Dentist Appointment 130 61 +2023-03-03 Grocery Store 63.45 62 +2023-03-04 Cinema Tickets 21 63 +2023-03-05 Coffee Shop 5.8 64 +2023-03-06 Electric Bill 67.5 65 +2023-03-07 Gas Station 31.2 66 +2023-03-08 Restaurant Dinner 58 67 +2023-03-09 Pharmacy 18.3 68 +2023-03-10 Grocery Store 64.7 69 +2023-03-11 Book Store 25.4 70 +2023-03-12 Online Shopping 78 71 +2023-03-13 Coffee Shop 6.5 72 +2023-03-14 Museum Tickets 15 73 +2023-03-15 Internet Bill 52 74 +2023-03-16 Public Transport 19.5 75 +2023-03-17 Clothing Store 105.6 76 +2023-03-18 Phone Bill 41 77 +2023-03-19 Coffee Shop 5 78 +2023-03-20 Grocery Store 59.2 79 +2023-03-21 Gas Station 29.8 80 +2023-03-22 Restaurant Lunch 32 81 +2023-03-23 Pharmacy 16.5 82 +2023-03-24 Concert Tickets 50 83 +2023-03-25 Coffee Shop 5.5 84 +2023-03-26 Grocery Store 61.8 85 +2023-03-27 Online Shopping 82 86 +2023-03-28 Water Bill 19.35 87 +2023-03-29 Public Transport 21 88 +2023-03-30 Book Purchase 17 89 +2023-03-31 Grocery Store 60 90 +2023-04-01 Cinema Tickets 23 91 +2023-04-02 Pharmacy 17.4 92 +2023-04-03 Gas Station 33.5 93 +2023-04-04 Restaurant Dinner 56.7 94 +2023-04-05 Grocery Store 65.3 95 +2023-04-06 Coffee Shop 5.9 96 +2023-04-07 Online Shopping 87 97 +2023-04-08 Electric Bill 69 98 +2023-04-09 Clothing Store 112.5 99 +2023-04-10 Grocery Store 57.4 100 +2023-04-11 Book Store 26.3 101 +2023-04-12 Gas Station 30.9 102 +2023-04-13 Coffee Shop 6.8 103 +2023-04-14 Zoo Tickets 24 104 +2023-04-15 Internet Bill 53 105 +2023-04-16 Public Transport 20.5 106 +2023-04-17 Restaurant Lunch 34 107 +2023-04-18 Phone Bill 43 108 +2023-04-19 Coffee Shop 5.2 109 +2023-04-20 Grocery Store 58.9 110 +2023-04-21 Pharmacy 14.7 111 +2023-04-22 Cinema Tickets 25 112 +2023-04-23 Online Shopping 90 113 +2023-04-24 Gas Station 31.4 114 +2023-04-25 Water Bill 21 115 +2023-04-26 Grocery Store 62.5 116 +2023-04-27 Coffee Shop 5.7 117 +2023-04-28 Book Purchase 18.5 118 +2023-04-29 Public Transport 22 119 +2023-04-30 Grocery Store 63 120 +2023-05-01 Theater Tickets 45 121 +2023-05-02 Dentist Appointment 135 122 +2023-05-03 Gas Station 32.2 123 +2023-05-04 Restaurant Dinner 59 124 +2023-05-05 Grocery Store 66.1 125 +2023-05-06 Coffee Shop 6 126 +2023-05-07 Online Shopping 89 127 +2023-05-08 Electric Bill 70.5 128 +2023-05-09 Clothing Store 110 129 +2023-05-10 Grocery Store 59.7 130 +2023-05-11 Coffee Shop 6.1 131 +2023-05-12 Book Store 29.2 132 +2023-05-13 Gas Station 29.9 133 +2023-05-14 Museum Tickets 16 134 +2023-05-15 Internet Bill 52.5 135 +2023-05-16 Public Transport 21.3 136 +2023-05-17 Restaurant Lunch 35.4 137 +2023-05-18 Phone Bill 43.5 138 +2023-05-19 Grocery Store 64.8 139 +2023-05-20 Pharmacy 15.2 140 +2023-05-21 Cinema Tickets 26 141 +2023-05-22 Coffee Shop 6.3 142 +2023-05-23 Gas Station 30.8 143 +2023-05-24 Online Shopping 92.5 144 +2023-05-25 Water Bill 20.5 145 +2023-05-26 Grocery Store 61.9 146 +2023-05-27 Public Transport 23 147 +2023-05-28 Book Purchase 19 148 +2023-05-29 Coffee Shop 5.9 149 +2023-05-30 Restaurant Dinner 57.8 150 +2023-05-31 Grocery Store 66.7 151 +2023-06-01 Theater Tickets 47 152 +2023-06-02 Dentist Appointment 140 153 +2023-06-03 Gas Station 31.6 154 +2023-06-04 Coffee Shop 6.4 155 +2023-06-05 Online Shopping 94 156 
+2023-06-06 Electric Bill 72 157 +2023-06-07 Restaurant Lunch 36 158 +2023-06-08 Grocery Store 65.3 159 +2023-06-09 Pharmacy 17 160 +2023-06-10 Cinema Tickets 27.5 161 +2023-06-11 Public Transport 21.5 162 +2023-06-12 Book Store 30 163 +2023-06-13 Gas Station 28.7 164 +2023-06-14 Coffee Shop 6.6 165 +2023-06-15 Internet Bill 53.5 166 +2023-06-16 Zoo Tickets 28 167 +2023-06-17 Grocery Store 67.4 168 +2023-06-18 Phone Bill 44 169 +2023-06-19 Restaurant Dinner 60 170 +2023-06-20 Coffee Shop 6.7 171 +2023-06-21 Public Transport 22.5 172 +2023-06-22 Online Shopping 96 173 +2023-06-23 Gas Station 32.4 174 +2023-06-24 Cinema Tickets 29 175 +2023-06-25 Book Purchase 20 176 +2023-06-26 Grocery Store 68.3 177 +2023-06-27 Water Bill 22 178 +2023-06-28 Pharmacy 18.5 179 +2023-06-29 Restaurant Lunch 37 180 +2023-06-30 Coffee Shop 7 181 +2023-07-01 Grocery Store 69.5 182 +2023-07-02 Theater Tickets 49 183 +2023-07-03 Gas Station 33.2 184 +2023-07-04 Park Picnic 40 185 +2023-07-05 Electric Bill 73.5 186 +2023-07-06 Clothing Store 120 187 +2023-07-07 Online Shopping 98 188 +2023-07-08 Grocery Store 70.6 189 +2023-07-09 Coffee Shop 7.1 190 +2023-07-10 Internet Bill 54 191 +2023-07-11 Public Transport 23.5 192 +2023-07-12 Museum Tickets 18 193 +2023-07-13 Book Store 31 194 +2023-07-14 Gas Station 29.9 195 +2023-07-15 Coffee Shop 7.2 196 +2023-07-16 Restaurant Dinner 62 197 +2023-07-17 Grocery Store 71.8 198 +2023-07-18 Phone Bill 45 199 +2023-07-19 Zoo Tickets 30 200 +2023-07-20 Coffee Shop 7.3 201 +2023-07-21 Public Transport 24 202 +2023-07-22 Online Shopping 99.5 203 +2023-07-23 Gas Station 34 204 +2023-07-24 Cinema Tickets 31 205 +2023-07-25 Book Purchase 21.5 206 +2023-07-26 Grocery Store 72.9 207 +2023-07-27 Water Bill 23.5 208 +2023-07-28 Pharmacy 19.5 209 +2023-07-29 Restaurant Lunch 38.5 210 +2023-07-30 Coffee Shop 7.4 211 +2023-07-31 Grocery Store 73.7 212 +2023-08-01 Theater Tickets 50 213 +2023-08-02 Gas Station 34.5 214 +2023-08-03 Restaurant Dinner 63.5 215 +2023-08-04 Online Shopping 101 216 +2023-08-05 Electric Bill 75 217 +2023-08-06 Grocery Store 74.6 218 +2023-08-07 Coffee Shop 7.5 219 +2023-08-08 Phone Bill 46 220 +2023-08-09 Public Transport 24.5 221 +2023-08-10 Cinema Tickets 32.5 222 +2023-08-11 Book Store 32 223 +2023-08-12 Gas Station 35 224 +2023-08-13 Coffee Shop 7.6 225 +2023-08-14 Park Picnic 42 226 +2023-08-15 Internet Bill 55 227 +2023-08-16 Grocery Store 76.3 228 +2023-08-17 Clothing Store 125 229 +2023-08-18 Pharmacy 20.5 230 +2023-08-19 Restaurant Lunch 40 231 +2023-08-20 Coffee Shop 7.7 232 +2023-08-21 Museum Tickets 19 233 +2023-08-22 Public Transport 25 234 +2023-08-23 Online Shopping 103 235 +2023-08-24 Grocery Store 77.8 236 +2023-08-25 Water Bill 24.5 237 +2023-08-26 Zoo Tickets 32 238 +2023-08-27 Coffee Shop 7.8 239 +2023-08-28 Gas Station 35.5 240 +2023-08-29 Book Purchase 23 241 +2023-08-30 Grocery Store 78.9 242 +2023-08-31 Cinema Tickets 34 243 +2023-09-01 Theater Tickets 52 244 +2023-09-02 Gas Station 36 245 +2023-09-03 Restaurant Dinner 65 246 +2023-09-04 Online Shopping 105 247 +2023-09-05 Electric Bill 76.5 248 +2023-09-06 Grocery Store 79.6 249 +2023-09-07 Coffee Shop 8 250 +2023-09-08 Phone Bill 47 251 +2023-09-09 Public Transport 26 252 +2023-09-10 Cinema Tickets 35.5 253 +2023-09-11 Book Store 33 254 +2023-09-12 Gas Station 36.5 255 +2023-09-13 Coffee Shop 8.2 256 +2023-09-14 Park Picnic 44 257 +2023-09-15 Internet Bill 56 258 +2023-09-16 Grocery Store 80.4 259 +2023-09-17 Clothing Store 130 260 +2023-09-18 Pharmacy 21.5 261 +2023-09-19 Restaurant Lunch 
41.5 262 +2023-09-20 Coffee Shop 8.4 263 +2023-09-21 Museum Tickets 20 264 +2023-09-22 Public Transport 26.5 265 +2023-09-23 Online Shopping 107 266 +2023-09-24 Grocery Store 81.3 267 +2023-09-25 Water Bill 25.5 268 +2023-09-26 Zoo Tickets 33.5 269 +2023-09-27 Coffee Shop 8.6 270 +2023-09-28 Gas Station 37.5 271 +2023-09-29 Book Purchase 24.5 272 +2023-09-30 Grocery Store 82.7 273 +2023-10-01 Cinema Tickets 36 274 +2023-10-02 Theater Tickets 54 275 +2023-10-03 Gas Station 38 276 +2023-10-04 Restaurant Dinner 66.5 277 +2023-10-05 Online Shopping 109 278 +2023-10-06 Electric Bill 78 279 +2023-10-07 Grocery Store 83.9 280 +2023-10-08 Coffee Shop 8.8 281 +2023-10-09 Phone Bill 48 282 +2023-10-10 Public Transport 27.5 283 +2023-10-11 Cinema Tickets 37.5 284 +2023-10-12 Book Store 34.5 285 +2023-10-13 Gas Station 39.5 286 +2023-10-14 Coffee Shop 9 287 +2023-10-15 Park Picnic 46 288 +2023-10-16 Internet Bill 57.5 289 +2023-10-17 Grocery Store 85.2 290 +2023-10-18 Clothing Store 135 291 +2023-10-19 Pharmacy 22.5 292 +2023-10-20 Restaurant Lunch 43 293 +2023-10-21 Coffee Shop 9.2 294 +2023-10-22 Museum Tickets 21.5 295 +2023-10-23 Public Transport 28 296 +2023-10-24 Online Shopping 111 297 +2023-10-25 Grocery Store 86.5 298 +2023-10-26 Water Bill 26.5 299 +2023-10-27 Zoo Tickets 35 300 +2023-10-28 Coffee Shop 9.4 301 +2023-10-29 Gas Station 40.5 302 +2023-10-30 Book Purchase 26 303 +2023-10-31 Grocery Store 88 304 diff --git a/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_out/output.txt b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_out/output.txt new file mode 100644 index 00000000..e4869438 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_out/output.txt @@ -0,0 +1 @@ +1861.55 diff --git a/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/data.json new file mode 100644 index 00000000..9f83f3a2 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/data.json @@ -0,0 +1,33 @@ +{ + "category": [ + "data", + "general" + ], + "cutoff": 120, + "dependencies": [ + "TestAnswerQuestionCsv", + "TestCombineCsv" + ], + "eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "AnswerQuestionCombineCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
+} diff --git a/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_in/input.csv b/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_in/input.csv deleted file mode 100644 index a52510f1..00000000 --- a/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_in/input.csv +++ /dev/null @@ -1,5 +0,0 @@ -id,name,timestamp -3,Alice,2023-09-25 14:10:00 -1,Bob,2023-09-24 12:05:00 -2,Charlie,2023-09-24 12:10:00 -4,David,2023-09-26 16:20:00 diff --git a/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_out/output.csv b/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_out/output.csv deleted file mode 100644 index 6cac7733..00000000 --- a/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_out/output.csv +++ /dev/null @@ -1,5 +0,0 @@ -id,name,timestamp -1,Bob,2023-09-24 12:05:00 -2,Charlie,2023-09-24 12:10:00 -3,Alice,2023-09-25 14:10:00 -4,David,2023-09-26 16:20:00 diff --git a/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/data.json b/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/data.json deleted file mode 100644 index 8515af89..00000000 --- a/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/data.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "category": [ - "data" - ], - "cutoff": 60, - "dependencies": [ - "TestReadFile" - ], - "eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15", - "ground": { - "answer": "The csv sorted by date", - "eval": { - "type": "file" - }, - "files": [ - "output.csv" - ], - "should_contain": [ - "id,name,timestamp\n1,Bob,2023-09-24 12:05:00\n2,Charlie,2023-09-24 12:10:00\n3,Alice,2023-09-25 14:10:00\n4,David,2023-09-26 16:20:00" - ] - }, - "info": { - "description": "Tests if the agent can sort a csv", - "difficulty": "basic", - "side_effects": [ - "" - ] - }, - "name": "SortCsv", - "task": "Sort the input.csv by the 'timestamp' column and write the new csv in the output.csv file. The order of the columns should be preserved." 
-} diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/1_search/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/1_search/data.json index 884a583e..2c52ddb6 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/1_search/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/1_search/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43", + "eval_id": "0bb23182-b434-402b-a73e-9c226469b959", "ground": { "answer": "This is a Heading\nThis is a paragraph.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/2_book_price/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/2_book_price/data.json index 328d52e7..53f14a07 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/2_book_price/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/2_book_price/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestSearch" ], - "eval_id": "525001ed-8b45-4405-9e56-ce4423314294", + "eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae", "ground": { "answer": "\u00a325.89", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval_2/artifacts_out/random_file.txt b/benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval/artifacts_out/random_file.txt similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval_2/artifacts_out/random_file.txt rename to benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval_2/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval/data.json similarity index 88% rename from benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval_2/data.json rename to benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval/data.json index 358ad96b..6e397da5 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval_2/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval/data.json @@ -1,12 +1,13 @@ { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 60, "dependencies": [ "TestBasicRetrieval" ], - "eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d", + "eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f", "ground": { "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json index e2b55f8f..0a9aec55 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval" ], - "eval_id": "a0a27778-aec1-4b37-8fc2-92feedffd3fb", + "eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1", "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json index e02c489d..bec0b9c8 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json @@ -1,12 +1,13 @@ { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 60, "dependencies": [ "TestRevenueRetrieval2" ], - "eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05", + "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", "ground": { "answer": "The twitter handles of the two hosts of Latent Space.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json index 94fd3ba2..4b6c7073 100644 --- a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json +++ b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json @@ -1,12 +1,13 @@ { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 240, "dependencies": [ "TestReadFile" ], - "eval_id": "76e4c56c-8d57-423e-9cc1-1fff5f58dee6", + "eval_id": "895ae28a-4513-44ea-a872-0164771d1597", "ground": { "answer": "A report highlighting elements from the 2 files.", "eval": { diff --git a/benchmark/frontend/public/graph.json b/benchmark/frontend/public/graph.json index 27833803..7d4e432c 100644 --- a/benchmark/frontend/public/graph.json +++ b/benchmark/frontend/public/graph.json @@ -12,6 +12,12 @@ "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", @@ -78,24 +84,42 @@ "id": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]", "to": 
"agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]" }, - { - "arrows": "to", - "from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", - "id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", - "to": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]" - }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", @@ -117,7 +141,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { @@ -155,7 +179,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { @@ -187,13 +211,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 150, "dependencies": [ "TestUrlShortener" ], - "eval_id": "54c3d7e9-71d6-476b-b045-cf0aaf118f95", 
+ "eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0", "ground": { "answer": "The correct python file for a TicTacToe game is written", "eval": { @@ -227,7 +252,7 @@ "dependencies": [ "TestThreeSum" ], - "eval_id": "0823b577-64f2-477b-856d-16726fe464b0", + "eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f", "ground": { "answer": "password_generator.py is created and satisfies the requirements.", "eval": { @@ -255,13 +280,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestPasswordGenerator" ], - "eval_id": "6ace62be-6c18-431a-947f-72fb20984b58", + "eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e", "ground": { "answer": "The correct python file is written and organizes the files accordingly", "eval": { @@ -289,13 +315,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 60, "dependencies": [ "TestWriteFile" ], - "eval_id": "29a10990-2584-4602-8b9d-c217f6edbc4f", + "eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4", "ground": { "answer": "The three_sum function coded properly.", "eval": { @@ -327,14 +354,15 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestTicTacToe", "TestReadFile" ], - "eval_id": "5a32418d-1c3a-4af1-8dc4-8d4c29bed21a", + "eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1", "ground": { "answer": "The implementation of battleship that passes all the tests.", "eval": { @@ -366,7 +394,7 @@ "dependencies": [ "TestFileOrganizer" ], - "eval_id": "41ca1035-ceca-4e0c-91ab-66ed0b350273", + "eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c", "ground": { "answer": "The correct python file for a basic url shortener CLI", "eval": { @@ -401,7 +429,7 @@ "dependencies": [ "TestSearch" ], - "eval_id": "525001ed-8b45-4405-9e56-ce4423314294", + "eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae", "ground": { "answer": "\u00a325.89", "eval": { @@ -431,13 +459,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 60, "dependencies": [ "TestRevenueRetrieval2" ], - "eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05", + "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", "ground": { "answer": "The twitter handles of the two hosts of Latent Space.", "eval": { @@ -476,7 +505,7 @@ "dependencies": [ "TestRevenueRetrieval" ], - "eval_id": "a0a27778-aec1-4b37-8fc2-92feedffd3fb", + "eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1", "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "eval": { @@ -518,6 +547,43 @@ "label": "RevenueRetrieval2", "shape": "dot" }, + { + "color": "grey", + "data": { + "category": [ + "scrape_synthesize", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestBasicRetrieval" + ], + "eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f", + "ground": { + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "eval": { + "type": "file" + }, + "files": [ + ".txt" + ], + "should_contain": [ + "81,462" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can retrieve Tesla's revenue in 2022.", + "difficulty": "intermediate", + "side_effects": [] + }, + "name": "TestRevenueRetrieval", + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + }, + "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", + "label": "RevenueRetrieval", + "shape": "dot" + }, { "color": "grey", "data": { @@ -529,7 +595,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43", + "eval_id": "0bb23182-b434-402b-a73e-9c226469b959", "ground": { "answer": "This is a Heading\nThis is a paragraph.", "eval": { @@ -565,49 +631,90 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "data" ], - "cutoff": 60, + "cutoff": 90, "dependencies": [ - "TestBasicRetrieval" + "TestAnswerQuestionSmallCsv" ], - "eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d", + "eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732", "ground": { - "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "answer": "The correct amount spent on utilities.", "eval": { "type": "file" }, "files": [ - ".txt" + "output.txt" ], "should_contain": [ - "81,462" - ], - "should_not_contain": [] + "1861" + ] }, "info": { - "description": "Tests if the agent can retrieve Tesla's revenue in 2022.", + "description": "Tests if the agent can answer a question from a csv", "difficulty": "intermediate", - "side_effects": [] + "side_effects": [ + "" + ] }, - "name": "TestRevenueRetrieval", - "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + "name": "TestAnswerQuestionCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." }, - "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", - "label": "RevenueRetrieval", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCsv", "shape": "dot" }, { "color": "grey", "data": { "category": [ - "data" + "data", + "general" + ], + "cutoff": 120, + "dependencies": [ + "TestAnswerQuestionCsv", + "TestCombineCsv" + ], + "eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionCombineCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
+ }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCombineCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestReadFile" ], - "eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15", + "eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95", "ground": { "answer": "The csv sorted by date", "eval": { @@ -638,13 +745,52 @@ "color": "grey", "data": { "category": [ - "data" + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "84" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a small csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionSmallCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." + }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "label": "AnswerQuestionSmallCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestLabelCsv" ], - "eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375", + "eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b", "ground": { "answer": "The csv data is combined", "eval": { @@ -681,7 +827,7 @@ "dependencies": [ "TestSortCsv" ], - "eval_id": "6c58e229-aa22-4c4f-a053-4a78931ad41e", + "eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac", "ground": { "answer": "The csv labelled", "eval": { @@ -712,13 +858,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 240, "dependencies": [ "TestReadFile" ], - "eval_id": "76e4c56c-8d57-423e-9cc1-1fff5f58dee6", + "eval_id": "895ae28a-4513-44ea-a872-0164771d1597", "ground": { "answer": "A report highlighting elements from the 2 files.", "eval": { diff --git a/benchmark/poetry.lock b/benchmark/poetry.lock index e60acb3a..dc2cdaca 100644 --- a/benchmark/poetry.lock +++ b/benchmark/poetry.lock @@ -295,75 +295,63 @@ files = [ [[package]] name = "cffi" -version = "1.15.1" +version = "1.16.0" description = "Foreign Function Interface for Python calling C code." 
optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, - {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, - {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, - {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, - {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, - {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, - {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, - {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, - {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, - {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, - {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, - {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, - {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, - {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, - {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, - {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, - {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, - {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, - {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, + {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"}, + {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"}, + {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"}, + {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"}, + {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"}, + {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"}, + {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"}, + {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"}, + {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"}, + {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"}, + {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"}, + {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"}, + {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"}, + {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"}, + {file = 
"cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"}, + {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"}, + {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"}, + {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"}, + {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"}, + {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"}, + {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"}, + {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"}, + {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"}, + {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"}, + {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"}, + {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"}, + 
{file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"}, + {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"}, ] [package.dependencies] @@ -620,15 +608,19 @@ test-no-images = ["pytest", "pytest-cov", "wurlitzer"] [[package]] name = "cycler" -version = "0.11.0" +version = "0.12.0" description = "Composable style cycles" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"}, - {file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"}, + {file = "cycler-0.12.0-py3-none-any.whl", hash = "sha256:7896994252d006771357777d0251f3e34d266f4fa5f2c572247a80ab01440947"}, + {file = "cycler-0.12.0.tar.gz", hash = "sha256:8cc3a7b4861f91b1095157f9916f748549a617046e67eb7619abed9b34d2c94a"}, ] +[package.extras] +docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] +tests = ["pytest", "pytest-cov", "pytest-xdist"] + [[package]] name = "decorator" version = "5.1.1" @@ -890,20 +882,19 @@ test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mypy", "pre-commit" [[package]] name = "google-auth" -version = "2.23.1" +version = "2.23.2" description = "Google Authentication Library" optional = false python-versions = ">=3.7" files = [ - {file = "google-auth-2.23.1.tar.gz", hash = "sha256:d38bdf4fa1e7c5a35e574861bce55784fd08afadb4e48f99f284f1e487ce702d"}, - {file = "google_auth-2.23.1-py2.py3-none-any.whl", hash = "sha256:9800802266366a2a87890fb2d04923fc0c0d4368af0b86db18edd94a62386ea1"}, + {file = "google-auth-2.23.2.tar.gz", hash = "sha256:5a9af4be520ba33651471a0264eead312521566f44631cbb621164bc30c8fd40"}, + {file = "google_auth-2.23.2-py2.py3-none-any.whl", hash = "sha256:c2e253347579d483004f17c3bd0bf92e611ef6c7ba24d41c5c59f2e7aeeaf088"}, ] [package.dependencies] cachetools = ">=2.0.0,<6.0" pyasn1-modules = ">=0.2.1" rsa = ">=3.1.4,<5" -urllib3 = ">=2.0.5" [package.extras] aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "requests (>=2.20.0,<3.0.0.dev0)"] @@ -2765,13 +2756,13 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess [[package]] name = "wcwidth" -version = "0.2.6" +version = "0.2.7" description = "Measures the displayed width of unicode strings in a terminal" optional = false python-versions = "*" files = [ - {file = "wcwidth-0.2.6-py2.py3-none-any.whl", hash = "sha256:795b138f6875577cd91bba52baf9e445cd5118fd32723b460e30a0af30ea230e"}, - {file = "wcwidth-0.2.6.tar.gz", hash = "sha256:a5220780a404dbe3353789870978e472cfe477761f06ee55077256e509b156d0"}, + {file = "wcwidth-0.2.7-py2.py3-none-any.whl", hash = "sha256:fabf3e32999d9b0dab7d19d845149f326f04fe29bac67709ee071dbd92640a36"}, + {file = "wcwidth-0.2.7.tar.gz", hash = "sha256:1b6d30a98ddd5ce9bbdb33658191fd2423fc9da203fe3ef1855407dcb7ee4e26"}, ] [[package]] diff --git a/benchmark/tests/test_benchmark_workflow.py b/benchmark/tests/test_benchmark_workflow.py index 92fbdbbd..700d42a8 100644 --- a/benchmark/tests/test_benchmark_workflow.py +++ b/benchmark/tests/test_benchmark_workflow.py @@ -12,14 +12,14 @@ import time "eval_id, input_text, expected_artifact_length, test_name, should_be_successful", [ ( - "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "f219f3d3-a41b-45a9-a3d0-389832086ee8", "Write the word 'Washington' to a .txt file", 0, "WriteFile", True, ), ( - 
"261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "Read the file called file_to_read.txt and write its content to a file called output.txt", 1, "ReadFile", diff --git a/frontend/assets/coding_tree_structure.json b/frontend/assets/coding_tree_structure.json index 371bae30..54972b46 100644 --- a/frontend/assets/coding_tree_structure.json +++ b/frontend/assets/coding_tree_structure.json @@ -63,7 +63,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { @@ -101,7 +101,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { @@ -133,13 +133,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 150, "dependencies": [ "TestUrlShortener" ], - "eval_id": "54c3d7e9-71d6-476b-b045-cf0aaf118f95", + "eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0", "ground": { "answer": "The correct python file for a TicTacToe game is written", "eval": { @@ -173,7 +174,7 @@ "dependencies": [ "TestFileOrganizer" ], - "eval_id": "41ca1035-ceca-4e0c-91ab-66ed0b350273", + "eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c", "ground": { "answer": "The correct python file for a basic url shortener CLI", "eval": { @@ -201,13 +202,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestPasswordGenerator" ], - "eval_id": "6ace62be-6c18-431a-947f-72fb20984b58", + "eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e", "ground": { "answer": "The correct python file is written and organizes the files accordingly", "eval": { @@ -241,7 +243,7 @@ "dependencies": [ "TestThreeSum" ], - "eval_id": "0823b577-64f2-477b-856d-16726fe464b0", + "eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f", "ground": { "answer": "password_generator.py is created and satisfies the requirements.", "eval": { @@ -269,13 +271,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 60, "dependencies": [ "TestWriteFile" ], - "eval_id": "29a10990-2584-4602-8b9d-c217f6edbc4f", + "eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4", "ground": { "answer": "The three_sum function coded properly.", "eval": { @@ -307,14 +310,15 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestTicTacToe", "TestReadFile" ], - "eval_id": "5a32418d-1c3a-4af1-8dc4-8d4c29bed21a", + "eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1", "ground": { "answer": "The implementation of battleship that passes all the tests.", "eval": { diff --git a/frontend/assets/data_tree_structure.json b/frontend/assets/data_tree_structure.json index bd11dd5b..e48905a8 100644 --- a/frontend/assets/data_tree_structure.json +++ b/frontend/assets/data_tree_structure.json @@ -6,11 +6,29 @@ "id": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "id": 
"agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", - "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", - "to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]" + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" }, { "arrows": "to", @@ -23,6 +41,12 @@ "from": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]" } ], "nodes": [ @@ -39,7 +63,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { @@ -77,7 +101,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { @@ -111,48 +135,126 @@ "category": [ "data" ], - "cutoff": 60, + "cutoff": 90, "dependencies": [ - "TestReadFile" + "TestAnswerQuestionSmallCsv" ], - "eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15", + "eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732", "ground": { - "answer": "The csv sorted by date", + "answer": "The correct amount spent on utilities.", "eval": { "type": "file" }, "files": [ - "output.csv" + "output.txt" ], "should_contain": [ - "id,name,timestamp\n1,Bob,2023-09-24 12:05:00\n2,Charlie,2023-09-24 12:10:00\n3,Alice,2023-09-25 14:10:00\n4,David,2023-09-26 16:20:00" + "1861" ] }, "info": { - "description": 
"Tests if the agent can sort a csv", - "difficulty": "basic", + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", "side_effects": [ "" ] }, - "name": "TestSortCsv", - "task": "Sort the input.csv by the 'timestamp' column and write the new csv in the output.csv file. The order of the columns should be preserved." + "name": "TestAnswerQuestionCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." }, - "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", - "label": "SortCsv", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCsv", "shape": "dot" }, { "color": "grey", "data": { "category": [ - "data" + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "84" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a small csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionSmallCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." + }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "label": "AnswerQuestionSmallCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" + ], + "cutoff": 120, + "dependencies": [ + "TestAnswerQuestionCsv", + "TestCombineCsv" + ], + "eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionCombineCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
+ }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCombineCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestLabelCsv" ], - "eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375", + "eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b", "ground": { "answer": "The csv data is combined", "eval": { @@ -189,7 +291,7 @@ "dependencies": [ "TestSortCsv" ], - "eval_id": "6c58e229-aa22-4c4f-a053-4a78931ad41e", + "eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac", "ground": { "answer": "The csv labelled", "eval": { @@ -215,6 +317,44 @@ "id": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", "label": "LabelCsv", "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95", + "ground": { + "answer": "The csv sorted by date", + "eval": { + "type": "file" + }, + "files": [ + "output.csv" + ], + "should_contain": [ + "id,name,timestamp\n1,Bob,2023-09-24 12:05:00\n2,Charlie,2023-09-24 12:10:00\n3,Alice,2023-09-25 14:10:00\n4,David,2023-09-26 16:20:00" + ] + }, + "info": { + "description": "Tests if the agent can sort a csv", + "difficulty": "basic", + "side_effects": [ + "" + ] + }, + "name": "TestSortCsv", + "task": "Sort the input.csv by the 'timestamp' column and write the new csv in the output.csv file. The order of the columns should be preserved." + }, + "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", + "label": "SortCsv", + "shape": "dot" } ] } diff --git a/frontend/assets/general_tree_structure.json b/frontend/assets/general_tree_structure.json index 8c331555..85cdc2ec 100644 --- a/frontend/assets/general_tree_structure.json +++ b/frontend/assets/general_tree_structure.json @@ -6,6 +6,48 @@ "id": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestUrlShortener::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestUrlShortener::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestTicTacToe::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestTicTacToe::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestFileOrganizer::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestFileOrganizer::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestUrlShortener::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestUrlShortener::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestPasswordGenerator::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestPasswordGenerator::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestFileOrganizer::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestFileOrganizer::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0]", + "id": 
"agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestPasswordGenerator::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestPasswordGenerator::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBattleship::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestBattleship::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestTicTacToe::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestTicTacToe::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBattleship::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestBattleship::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", @@ -17,6 +59,72 @@ "from": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "id": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "id": 
"agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSynthesizeInfo::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestSynthesizeInfo::test_method[challenge_data0]" } ], "nodes": [ @@ -33,7 +141,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { @@ -71,7 +179,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { @@ -99,6 +207,217 @@ "label": "WriteFile", "shape": "dot" }, + { + "color": "grey", + "data": { + "category": [ + "coding", + "general" + ], + "cutoff": 150, + "dependencies": [ + "TestUrlShortener" + ], + "eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0", + "ground": { + "answer": "The correct python file for a TicTacToe game is written", + "eval": { + "type": "python" + }, + "files": [ + "test.py" + ], + "should_contain": [], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can create Tic-Tac-Toe game", + 
"difficulty": "basic", + "side_effects": [] + }, + "name": "TestTicTacToe", + "task": "Build a Tic-Tac-Toe game using a python CLI. Here are the specifications.\n\nThe Grid: The game board is a 3x3 grid, consisting of 3 rows and 3 columns, creating a total of 9 squares.\n\nPlayers: There are two players. One player uses the number \"1\", and the other player uses the number \"2\".\n\nTaking Turns: Players take turns to put their respective numbers (\"1\" or \"2\") in an empty square of the grid. Once a player has placed their number in a square, it cannot be changed or removed.\n\nObjective: The goal is to get three of your numbers in a row, either horizontally, vertically, or diagonally.\n\nEnd of the Game: The game concludes in one of two ways: One player gets three of their numbers in a row (horizontally, vertically, or diagonally) and is declared the winner.\nAll squares on the grid are filled, and no player has three in a row. This situation is a \"draw\" or a \"tie\".\n\nTechnical specifications:\nBuild a file called tic_tac_toe.py. This file will be called through command lines. You will have to prompt users for their move. Player 1 will always start.\nPlayers will input their move in the following format: \"x,y\" where x and y represent the location in the grid (0,0 is top left, 2,2 is bottom right).\n\nYour primary requirement is to halt the game when appropriate and to print only one of these three exact sentences:\n\n\"Player 1 won!\"\n\"Player 2 won!\"\n\"Draw\"\n\nEdge cases: A player can send an incorrect location. Either the location is incorrect or the square is already filled. In this case, this counts as doing nothing, and the player gets prompted for new locations again.\n\n\nYou will be expected to create a python file called tic_tac_toe.py that will run through command lines by using ```python tic_tac_toe.py```.\n\nHere is an example of how your tic_tac_toe.py game will be tested.\n```\nprocess = subprocess.Popen(\n ['python', 'tic_tac_toe.py'],\n stdout=subprocess.PIPE,\n text=True\n)\n\noutput, _ = process.communicate('\\n'.join([\"0,0\", \"1,0\", \"0,1\", \"1,1\", \"0,2\"]))\n\nassert \"Player 1 won!\" in output\n```" + }, + "id": "agbenchmark/generate_test.py::TestTicTacToe::test_method[challenge_data0]", + "label": "TicTacToe", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "coding" + ], + "cutoff": 150, + "dependencies": [ + "TestFileOrganizer" + ], + "eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c", + "ground": { + "answer": "The correct python file for a basic url shortener CLI", + "eval": { + "type": "python" + }, + "files": [ + "test.py" + ], + "should_contain": [], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can create a URL shortener.", + "difficulty": "basic", + "side_effects": [] + }, + "name": "TestUrlShortener", + "task": "Build a basic URL shortener using a python CLI. Here are the specifications.\n\nFunctionality: The program should have two primary functionalities.\n\nShorten a given URL.\nRetrieve the original URL from a shortened URL.\n\nCLI: The command-line interface should accept a URL as its first input. It should be able to determine if the url is a shortened url or not. If the url is not shortened, it will display ONLY the shortened url, otherwise, it will display ONLY the original unshortened URL. Afterwards, it should prompt the user for another URL to process.\n\nTechnical specifications:\nBuild a file called url_shortener.py. 
This file will be called through command lines.\n\nEdge cases:\nFor the sake of simplicity, there will be no edge cases, you can assume the input is always correct and the user immediately passes the shortened version of the url he just shortened.\n\nYou will be expected to create a python file called url_shortener.py that will run through command lines by using python url_shortener.py.\n\nThe url_shortener.py will be tested this way:\n```\nimport unittest\nfrom url_shortener import shorten_url, retrieve_url\n\nclass TestURLShortener(unittest.TestCase):\n def test_url_retrieval(self):\n # Shorten the URL to get its shortened form\n shortened_url = shorten_url('https://www.example.com')\n\n # Retrieve the original URL using the shortened URL directly\n retrieved_url = retrieve_url(shortened_url)\n\n self.assertEqual(retrieved_url, 'https://www.example.com', \"Retrieved URL does not match the original!\")\n\nif __name__ == \"__main__\":\n unittest.main()\n```" + }, + "id": "agbenchmark/generate_test.py::TestUrlShortener::test_method[challenge_data0]", + "label": "UrlShortener", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "coding", + "general" + ], + "cutoff": 90, + "dependencies": [ + "TestPasswordGenerator" + ], + "eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e", + "ground": { + "answer": "The correct python file is written and organizes the files accordingly", + "eval": { + "type": "python" + }, + "files": [ + "test.py" + ], + "should_contain": [], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can create a file organizer.", + "difficulty": "basic", + "side_effects": [] + }, + "name": "TestFileOrganizer", + "task": "Create a file organizer CLI tool in Python that sorts files in a directory based on their file types (e.g., images, documents, audio) and moves them into these corresponding folders: 'images', 'documents', 'audio'. The entry point will be a python file that can be run this way: python organize_files.py --directory_path=YOUR_DIRECTORY_PATH" + }, + "id": "agbenchmark/generate_test.py::TestFileOrganizer::test_method[challenge_data0]", + "label": "FileOrganizer", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "coding" + ], + "cutoff": 90, + "dependencies": [ + "TestThreeSum" + ], + "eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f", + "ground": { + "answer": "password_generator.py is created and satisfies the requirements.", + "eval": { + "type": "python" + }, + "files": [ + "test.py" + ], + "should_contain": [], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can create a random password generator.", + "difficulty": "basic", + "side_effects": [] + }, + "name": "TestPasswordGenerator", + "task": "Create a random password generator. The password should have between 8 and 16 characters and should contain letters, numbers and symbols. The password should be printed to the console. The entry point will be a python file that can be run this way: python password_generator.py [--len x] where x is the length of the password. If no length is specified, the password should be 8 characters long. The password_generator can also be imported as a module and called as password = password_generator.generate_password(len=x). Any invalid input should raise a ValueError." 
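As an illustration of the PasswordGenerator task just described, a minimal sketch of a conforming `password_generator.py` could look like the following. The exact character mix beyond "letters, numbers and symbols" and the validation rules beyond the 8-16 length bound are assumptions, not part of the challenge data.

```python
import argparse
import random
import string


def generate_password(len: int = 8) -> str:
    # The task spells out the keyword name `len`, even though it shadows the builtin.
    if not isinstance(len, int) or not 8 <= len <= 16:
        raise ValueError("len must be an integer between 8 and 16")
    pools = [string.ascii_letters, string.digits, string.punctuation]
    # Guarantee at least one letter, one digit and one symbol, then fill the rest randomly.
    chars = [random.choice(pool) for pool in pools]
    chars += [random.choice("".join(pools)) for _ in range(len - 3)]
    random.shuffle(chars)
    return "".join(chars)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--len", type=int, default=8)
    print(generate_password(parser.parse_args().len))
```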
+ }, + "id": "agbenchmark/generate_test.py::TestPasswordGenerator::test_method[challenge_data0]", + "label": "PasswordGenerator", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "coding", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestWriteFile" + ], + "eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4", + "ground": { + "answer": "The three_sum function coded properly.", + "eval": { + "type": "python" + }, + "files": [ + "test.py" + ], + "should_contain": [ + "[0, 1, 2]", + "[0, 2, 5]", + "[0, 2, 3]" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can create the three_sum function.", + "difficulty": "basic", + "side_effects": [] + }, + "name": "TestThreeSum", + "task": "Create a three_sum function in a file called sample_code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2]." + }, + "id": "agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0]", + "label": "ThreeSum", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "coding", + "general" + ], + "cutoff": 90, + "dependencies": [ + "TestTicTacToe", + "TestReadFile" + ], + "eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1", + "ground": { + "answer": "The implementation of battleship that passes all the tests.", + "eval": { + "type": "pytest" + }, + "files": [], + "should_contain": [], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can create a Battleship.", + "difficulty": "expert", + "side_effects": [] + }, + "name": "TestBattleship", + "task": "Build a battleship game\n\nSpecifications:\n\nOverview: Battleship is a two-player strategy game where each player places their fleet of ships on a grid and tries to sink the opponent's fleet by guessing their locations.\nPlayers take turns calling out a row and column, attempting to name a square containing one of the opponent's ships.\n\nThe Grid: Each player's grid is a 10x10 grid, identified by rows (using numbers 1-10) and columns (using letters A-J).\n\nShips:\n\nCarrier - 5 squares\nBattleship - 4 squares\nCruiser - 3 squares\nSubmarine - 3 squares\nDestroyer - 2 squares\nEach ship occupies contiguous squares on the grid, arranged either horizontally or vertically.\n\nSetup:\n\nAt the start of the game, each player places their fleet on their grid. This setup is hidden from the opponent.\nThe game begins with Player 1, followed by Player 2, and so on.\nTaking Turns:\n\nOn a player's turn, they announce a grid square (e.g., \"D5\").\nThe opponent announces whether that square is a \"hit\" (if there's a part of a ship on that square) or \"miss\" (if the square is empty).\nIf a player hits a square occupied by a ship, they get another turn to guess. This continues until they make a miss, at which point their turn ends.\nIf a player hits all the squares occupied by a ship, the opponent must announce the sinking of that specific ship, e.g., \"You sank my Battleship!\"\n\nObjective: The goal is to sink all of your opponent's ships before they sink yours.\n\nEnd of the Game: The game ends when one player has sunk all of the opponent's ships. 
The winner is the player who sinks all the opposing fleet first.\n\nTechnical details:\nIn your root folder you will find an abstract class that defines the public interface of the Battleship class you will have to build:\n```\nfrom abc import ABC, abstractmethod\nfrom typing import Optional\n\nfrom pydantic import BaseModel, validator\n\n\n# Models for the request and response payloads\nclass ShipPlacement(BaseModel):\n ship_type: str\n start: dict # {\"row\": int, \"column\": str}\n direction: str\n\n @validator(\"start\")\n def validate_start(cls, start):\n row, column = start.get(\"row\"), start.get(\"column\")\n\n if not (1 <= row <= 10):\n raise ValueError(\"Row must be between 1 and 10 inclusive.\")\n\n if column not in list(\"ABCDEFGHIJ\"):\n raise ValueError(\"Column must be one of A, B, C, D, E, F, G, H, I, J.\")\n\n return start\n\n\nclass Turn(BaseModel):\n target: dict # {\"row\": int, \"column\": str}\n\n\nclass TurnResponse(BaseModel):\n result: str\n ship_type: Optional[str] # This would be None if the result is a miss\n\n\nclass GameStatus(BaseModel):\n is_game_over: bool\n winner: Optional[str]\n\n\nfrom typing import List\n\n\nclass Game(BaseModel):\n game_id: str\n players: List[str]\n board: dict # This could represent the state of the game board, you might need to flesh this out further\n ships: List[ShipPlacement] # List of ship placements for this game\n turns: List[Turn] # List of turns that have been taken\n\n\nclass AbstractBattleship(ABC):\n SHIP_LENGTHS = {\n \"carrier\": 5,\n \"battleship\": 4,\n \"cruiser\": 3,\n \"submarine\": 3,\n \"destroyer\": 2,\n }\n\n @abstractmethod\n def create_ship_placement(self, game_id: str, placement: ShipPlacement) -> None:\n \"\"\"\n Place a ship on the grid.\n \"\"\"\n pass\n\n @abstractmethod\n def create_turn(self, game_id: str, turn: Turn) -> TurnResponse:\n \"\"\"\n Players take turns to target a grid cell.\n \"\"\"\n pass\n\n @abstractmethod\n def get_game_status(self, game_id: str) -> GameStatus:\n \"\"\"\n Check if the game is over and get the winner if there's one.\n \"\"\"\n pass\n\n @abstractmethod\n def get_winner(self, game_id: str) -> str:\n \"\"\"\n Get the winner of the game.\n \"\"\"\n pass\n\n @abstractmethod\n def get_game(self) -> Game:\n \"\"\"\n Retrieve the state of the game.\n \"\"\"\n pass\n\n @abstractmethod\n def delete_game(self, game_id: str) -> None:\n \"\"\"\n Delete a game given its ID.\n \"\"\"\n pass\n\n @abstractmethod\n def create_game(self, game_id: str) -> None:\n \"\"\"\n Create a new game.\n \"\"\"\n pass\n\n```\nAt any moment you can run ```pytest``` to execute the tests.\nYou have two types of test: \n- positive tests => test the battleship game being used in ideal conditions\n- negative tests => tests the battleship game behaviour when used incorrectly\n\nSuccess criteria:\n- you will need to write a file called battleship.py that implements the abstract Battleship class.\n- this class will have to pass all the tests.\n- you're not allowed to modify any other file than the battleship.py. You can add other files as long as the main entrypoint is the battleship class." 
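For the much smaller ThreeSum challenge listed earlier in this tree, a sketch of the expected `sample_code.py` might look like the following; the brute-force O(n³) search is an illustrative choice for clarity, not a reference solution, since the benchmark only checks the returned indices.

```python
from typing import List, Optional


def three_sum(nums: List[int], target: int) -> Optional[List[int]]:
    """Return indices of three distinct elements of nums that sum to target."""
    n = len(nums)
    for i in range(n):
        for j in range(i + 1, n):
            for k in range(j + 1, n):
                if nums[i] + nums[j] + nums[k] == target:
                    return [i, j, k]
    return None  # the task guarantees exactly one solution, so this should not be reached


if __name__ == "__main__":
    print(three_sum([2, 7, 11, 15], 20))  # -> [0, 1, 2]
```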
+ }, + "id": "agbenchmark/generate_test.py::TestBattleship::test_method[challenge_data0]", + "label": "Battleship", + "shape": "dot" + }, { "color": "grey", "data": { @@ -110,7 +429,7 @@ "dependencies": [ "TestSearch" ], - "eval_id": "525001ed-8b45-4405-9e56-ce4423314294", + "eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae", "ground": { "answer": "\u00a325.89", "eval": { @@ -147,7 +466,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43", + "eval_id": "0bb23182-b434-402b-a73e-9c226469b959", "ground": { "answer": "This is a Heading\nThis is a paragraph.", "eval": { @@ -178,6 +497,401 @@ "id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", "label": "Search", "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "scrape_synthesize", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestRevenueRetrieval2" + ], + "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", + "ground": { + "answer": "The twitter handles of the two hosts of Latent Space.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "swyx", + "FanaHOVA" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can retrieve twitter handles given a vague description.", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestTestGetInformation", + "task": "Write the twitter handle of the two hosts of Latent Space to a file called output.txt" + }, + "id": "agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]", + "label": "TestGetInformation", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "scrape_synthesize" + ], + "cutoff": 60, + "dependencies": [ + "TestRevenueRetrieval" + ], + "eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1", + "ground": { + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "eval": { + "type": "file" + }, + "files": [ + ".txt" + ], + "should_contain": [ + "15", + "112", + "117", + "204", + "413", + "2,014", + "3,198", + "4,046", + "7,000", + "11,759", + "21,461", + "24,578", + "31,536", + "53,823", + "81,462" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can retrieve all the revenues of Tesla since its creation.", + "difficulty": "intermediate", + "side_effects": [ + "tests if there is in fact an LLM attached" + ] + }, + "name": "TestRevenueRetrieval2", + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + }, + "id": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]", + "label": "RevenueRetrieval2", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "scrape_synthesize", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestBasicRetrieval" + ], + "eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f", + "ground": { + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "eval": { + "type": "file" + }, + "files": [ + ".txt" + ], + "should_contain": [ + "81,462" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can retrieve Tesla's revenue in 2022.", + "difficulty": "intermediate", + "side_effects": [] + }, + "name": "TestRevenueRetrieval", + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + }, + "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", + "label": "RevenueRetrieval", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" + ], + "cutoff": 120, + "dependencies": [ + "TestAnswerQuestionCsv", + "TestCombineCsv" + ], + "eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionCombineCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." + }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCombineCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data" + ], + "cutoff": 90, + "dependencies": [ + "TestAnswerQuestionSmallCsv" + ], + "eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." + }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "84" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a small csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionSmallCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
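The AnswerQuestion* challenges introduced above all ask the agent to total the utilities spending found in one or more CSVs. The input artifacts are not shown in this hunk, so the column names used below ("Category" and "Amount") are assumptions; the sketch only illustrates the expected shape of a solution that sums matching rows and writes output.txt.

```python
import csv
import glob

# Assumed schema: each input CSV has 'Category' and 'Amount' columns.
# The real artifacts_in files may differ, so treat this purely as a sketch.
total = 0.0
for path in glob.glob("file*.csv"):
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            if row.get("Category", "").strip().lower() == "utilities":
                total += float(row["Amount"])

with open("output.txt", "w") as f:
    f.write(f"Total spent on utilities: {total:.0f}\n")
```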
+ }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "label": "AnswerQuestionSmallCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestLabelCsv" + ], + "eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b", + "ground": { + "answer": "The csv data is combined", + "eval": { + "type": "file" + }, + "files": [ + "output.csv" + ], + "should_contain": [ + "Age,ID,Name,Occupation,Salary\n28,101,John,Engineer,80000\n34,102,Alice,Doctor,120000\n45,103,Bob,Lawyer,95000" + ] + }, + "info": { + "description": "Tests if the agent can combine data from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestCombineCsv", + "task": "The csvs 'file1.csv' and 'file2.csv' both have a column 'ID'. Combine these 2 csvs using the 'ID' column. Sort the rows by ID in ascending order and the columns alphabetically. Write the output in output.csv" + }, + "id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]", + "label": "CombineCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data" + ], + "cutoff": 60, + "dependencies": [ + "TestSortCsv" + ], + "eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac", + "ground": { + "answer": "The csv labelled", + "eval": { + "type": "file" + }, + "files": [ + "output.csv" + ], + "should_contain": [ + "Item, Color\nBanana, Yellow\nLeaf, Green\nSky, Blue\nSunflower, Yellow\nGrass, Green\nJeans, Blue\nLemon, Yellow\nTree, Green\nOcean, Blue\nDaisy, Yellow\nFern, Green" + ] + }, + "info": { + "description": "Tests if the agent can label data in a csv", + "difficulty": "basic", + "side_effects": [ + "" + ] + }, + "name": "TestLabelCsv", + "task": "The csv 'input.csv' has many items. create a 'Color' column for these items and classify them as either 'blue', 'green', or 'yellow' depending on what the most likely color is. Preserve the order of the rows. The color column should be the second column. Write the output in output.csv" + }, + "id": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", + "label": "LabelCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95", + "ground": { + "answer": "The csv sorted by date", + "eval": { + "type": "file" + }, + "files": [ + "output.csv" + ], + "should_contain": [ + "id,name,timestamp\n1,Bob,2023-09-24 12:05:00\n2,Charlie,2023-09-24 12:10:00\n3,Alice,2023-09-25 14:10:00\n4,David,2023-09-26 16:20:00" + ] + }, + "info": { + "description": "Tests if the agent can sort a csv", + "difficulty": "basic", + "side_effects": [ + "" + ] + }, + "name": "TestSortCsv", + "task": "Sort the input.csv by the 'timestamp' column and write the new csv in the output.csv file. The order of the columns should be preserved." 
+ }, + "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", + "label": "SortCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "scrape_synthesize", + "general" + ], + "cutoff": 240, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "895ae28a-4513-44ea-a872-0164771d1597", + "ground": { + "answer": "A report highlighting elements from the 2 files.", + "eval": { + "scoring": "binary", + "template": "question", + "type": "llm" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "Is the company mentioned in the output actively addressing or capitalizing on the challenges or trends listed?" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can generate content based on the content of 2 files.", + "difficulty": "basic", + "side_effects": [] + }, + "name": "TestSynthesizeInfo", + "task": "Create a brief report or summary highlighting how one or more companies from companies.txt are addressing or capitalizing on challenges or trends from challenges.txt. Write a file called output.txt." + }, + "id": "agbenchmark/generate_test.py::TestSynthesizeInfo::test_method[challenge_data0]", + "label": "SynthesizeInfo", + "shape": "dot" } ] } diff --git a/frontend/assets/scrape_synthesize_tree_structure.json b/frontend/assets/scrape_synthesize_tree_structure.json index 16bacf56..73460ef0 100644 --- a/frontend/assets/scrape_synthesize_tree_structure.json +++ b/frontend/assets/scrape_synthesize_tree_structure.json @@ -57,7 +57,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { @@ -95,7 +95,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { @@ -134,7 +134,7 @@ "dependencies": [ "TestSearch" ], - "eval_id": "525001ed-8b45-4405-9e56-ce4423314294", + "eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae", "ground": { "answer": "\u00a325.89", "eval": { @@ -171,7 +171,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43", + "eval_id": "0bb23182-b434-402b-a73e-9c226469b959", "ground": { "answer": "This is a Heading\nThis is a paragraph.", "eval": { @@ -207,13 +207,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 60, "dependencies": [ "TestRevenueRetrieval2" ], - "eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05", + "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", "ground": { "answer": "The twitter handles of the two hosts of Latent Space.", "eval": { @@ -252,7 +253,7 @@ "dependencies": [ "TestRevenueRetrieval" ], - "eval_id": "a0a27778-aec1-4b37-8fc2-92feedffd3fb", + "eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1", "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "eval": { @@ -298,13 +299,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 60, "dependencies": [ "TestBasicRetrieval" ], - "eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d", + "eval_id": 
"dc2114d7-1597-4c9b-bed0-a97937ad977f", "ground": { "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", "eval": { @@ -334,13 +336,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 240, "dependencies": [ "TestReadFile" ], - "eval_id": "76e4c56c-8d57-423e-9cc1-1fff5f58dee6", + "eval_id": "895ae28a-4513-44ea-a872-0164771d1597", "ground": { "answer": "A report highlighting elements from the 2 files.", "eval": { diff --git a/frontend/assets/tree_structure.json b/frontend/assets/tree_structure.json index 27833803..7d4e432c 100644 --- a/frontend/assets/tree_structure.json +++ b/frontend/assets/tree_structure.json @@ -12,6 +12,12 @@ "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", @@ -78,24 +84,42 @@ "id": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]" }, - { - "arrows": "to", - "from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", - "id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", - "to": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]" - }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", "to": 
"agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", @@ -117,7 +141,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { @@ -155,7 +179,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { @@ -187,13 +211,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 150, "dependencies": [ "TestUrlShortener" ], - "eval_id": "54c3d7e9-71d6-476b-b045-cf0aaf118f95", + "eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0", "ground": { "answer": "The correct python file for a TicTacToe game is written", "eval": { @@ -227,7 +252,7 @@ "dependencies": [ "TestThreeSum" ], - "eval_id": "0823b577-64f2-477b-856d-16726fe464b0", + "eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f", "ground": { "answer": "password_generator.py is created and satisfies the requirements.", "eval": { @@ -255,13 +280,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestPasswordGenerator" ], - "eval_id": "6ace62be-6c18-431a-947f-72fb20984b58", + "eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e", "ground": { "answer": "The correct python file is written and organizes the files accordingly", "eval": { @@ -289,13 +315,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 60, "dependencies": [ "TestWriteFile" ], - "eval_id": "29a10990-2584-4602-8b9d-c217f6edbc4f", + "eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4", "ground": { "answer": "The three_sum function coded properly.", "eval": { @@ -327,14 +354,15 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestTicTacToe", "TestReadFile" ], - "eval_id": "5a32418d-1c3a-4af1-8dc4-8d4c29bed21a", + "eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1", "ground": { "answer": "The implementation of battleship that passes all the tests.", "eval": { @@ -366,7 +394,7 @@ "dependencies": [ "TestFileOrganizer" ], - "eval_id": "41ca1035-ceca-4e0c-91ab-66ed0b350273", + "eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c", "ground": { "answer": "The correct python file for a basic url shortener CLI", "eval": { @@ -401,7 +429,7 @@ "dependencies": [ "TestSearch" ], - "eval_id": "525001ed-8b45-4405-9e56-ce4423314294", + "eval_id": 
"cd96e6b2-779d-4a4a-8367-d520023e27ae", "ground": { "answer": "\u00a325.89", "eval": { @@ -431,13 +459,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 60, "dependencies": [ "TestRevenueRetrieval2" ], - "eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05", + "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", "ground": { "answer": "The twitter handles of the two hosts of Latent Space.", "eval": { @@ -476,7 +505,7 @@ "dependencies": [ "TestRevenueRetrieval" ], - "eval_id": "a0a27778-aec1-4b37-8fc2-92feedffd3fb", + "eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1", "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "eval": { @@ -518,6 +547,43 @@ "label": "RevenueRetrieval2", "shape": "dot" }, + { + "color": "grey", + "data": { + "category": [ + "scrape_synthesize", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestBasicRetrieval" + ], + "eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f", + "ground": { + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "eval": { + "type": "file" + }, + "files": [ + ".txt" + ], + "should_contain": [ + "81,462" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can retrieve Tesla's revenue in 2022.", + "difficulty": "intermediate", + "side_effects": [] + }, + "name": "TestRevenueRetrieval", + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + }, + "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", + "label": "RevenueRetrieval", + "shape": "dot" + }, { "color": "grey", "data": { @@ -529,7 +595,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43", + "eval_id": "0bb23182-b434-402b-a73e-9c226469b959", "ground": { "answer": "This is a Heading\nThis is a paragraph.", "eval": { @@ -565,49 +631,90 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "data" ], - "cutoff": 60, + "cutoff": 90, "dependencies": [ - "TestBasicRetrieval" + "TestAnswerQuestionSmallCsv" ], - "eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d", + "eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732", "ground": { - "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "answer": "The correct amount spent on utilities.", "eval": { "type": "file" }, "files": [ - ".txt" + "output.txt" ], "should_contain": [ - "81,462" - ], - "should_not_contain": [] + "1861" + ] }, "info": { - "description": "Tests if the agent can retrieve Tesla's revenue in 2022.", + "description": "Tests if the agent can answer a question from a csv", "difficulty": "intermediate", - "side_effects": [] + "side_effects": [ + "" + ] }, - "name": "TestRevenueRetrieval", - "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + "name": "TestAnswerQuestionCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
}, - "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", - "label": "RevenueRetrieval", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCsv", "shape": "dot" }, { "color": "grey", "data": { "category": [ - "data" + "data", + "general" + ], + "cutoff": 120, + "dependencies": [ + "TestAnswerQuestionCsv", + "TestCombineCsv" + ], + "eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionCombineCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." + }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCombineCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestReadFile" ], - "eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15", + "eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95", "ground": { "answer": "The csv sorted by date", "eval": { @@ -638,13 +745,52 @@ "color": "grey", "data": { "category": [ - "data" + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "84" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a small csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionSmallCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." + }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "label": "AnswerQuestionSmallCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestLabelCsv" ], - "eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375", + "eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b", "ground": { "answer": "The csv data is combined", "eval": { @@ -681,7 +827,7 @@ "dependencies": [ "TestSortCsv" ], - "eval_id": "6c58e229-aa22-4c4f-a053-4a78931ad41e", + "eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac", "ground": { "answer": "The csv labelled", "eval": { @@ -712,13 +858,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 240, "dependencies": [ "TestReadFile" ], - "eval_id": "76e4c56c-8d57-423e-9cc1-1fff5f58dee6", + "eval_id": "895ae28a-4513-44ea-a872-0164771d1597", "ground": { "answer": "A report highlighting elements from the 2 files.", "eval": {