From 0e804e27dd66676c6ee2385bfbb9fe2d222d9753 Mon Sep 17 00:00:00 2001
From: merwanehamadi
Date: Thu, 28 Sep 2023 19:30:08 -0700
Subject: [PATCH] Add more data challenges (#5390)

Signed-off-by: Merwane Hamadi
---
 autogpts/forge/poetry.lock | 138 ++--
 .../challenges/abilities/read_file/data.json | 2 +-
 .../challenges/abilities/write_file/data.json | 2 +-
 .../alignment/1_distraction/data_draft.json | 1 -
 .../alignment/2_injection/data_draft.json | 1 -
 .../deprecated/1_tesla_revenue/data.json | 2 +-
 .../deprecated/2_specific/data.json | 2 +-
 .../deprecated/3_formatting/data.json | 2 +-
 .../adapatability/a1_debug/data.json | 2 +-
 .../adapatability/a2_tesla_revenue/data.json | 2 +-
 .../adapatability/a3_book_price/data.json | 2 +-
 .../deprecated/code/1_list_animals/data.json | 2 +-
 .../code/1_password_generator/data.json | 2 +-
 .../deprecated/code/1_return/data.json | 2 +-
 .../code/2_file_organizer/data.json | 2 +-
 .../deprecated/code/2_write/data.json | 2 +-
 .../deprecated/code/3_modify/data.json | 2 +-
 .../deprecated/code/4_tests/data.json | 2 +-
 .../deprecated/code/d2.1_guided/data.json | 2 +-
 .../deprecated/code/d2.2_vague/data.json | 2 +-
 .../deprecated/code/d2.3_import/data.json | 2 +-
 .../deprecated/code/d3.1_three_sum/data.json | 2 +-
 .../deprecated/code/d3_two_sum/data.json | 2 +-
 .../deprecated/content_gen/2_plan/data.json | 2 +-
 .../deprecated/d2.1_guided/data.json | 2 +-
 .../read_file/artifacts_in/file_to_read.txt | 1 -
 .../read_file/artifacts_out/file_to_check.txt | 1 -
 .../read_file/artifacts_out/output.txt | 1 -
 .../deprecated/interface/read_file/data.json | 31 -
 .../search/artifacts_out/random_file.txt | 2 -
 .../deprecated/interface/search/data.json | 36 -
 .../write_file/artifacts_out/random_file.txt | 1 -
 .../deprecated/interface/write_file/data.json | 30 -
 .../deprecated/memory/m1_id/data.json | 2 +-
 .../deprecated/memory/m2_multiple/data.json | 2 +-
 .../deprecated/memory/m3_noise/data.json | 2 +-
 .../deprecated/memory/m4_phrases/data.json | 2 +-
 .../retrieval/1_tesla_revenue/data.json | 2 +-
 .../deprecated/retrieval/2_specific/data.json | 2 +-
 .../retrieval/3_formatting/data.json | 2 +-
 .../retrieval/r1_book_price/data.json | 2 +-
 .../deprecated/retrieval/r3/data.json | 2 +-
 .../deprecated/safety/1_simple/data.json | 2 +-
 .../deprecated/safety/2_medium/data.json | 2 +-
 .../deprecated/safety/3_advanced/data.json | 2 +-
 .../deprecated/safety/4_hard/data.json | 2 +-
 .../deprecated/safety/s2_divergence/data.json | 2 +-
 .../safety/s3_instructions/data.json | 2 +-
 .../library/ethereum/check_price/data.json | 2 +-
 .../verticals/code/1_three_sum/data.json | 5 +-
 .../code/2_password_generator/data.json | 2 +-
 .../verticals/code/3_file_organizer/data.json | 5 +-
 .../verticals/code/4_url_shortener/data.json | 2 +-
 .../verticals/code/5_tic_tac_toe/data.json | 5 +-
 .../verticals/code/6_battleship/data.json | 5 +-
 .../verticals/data/1_sort_csv/data.json | 5 +-
 .../verticals/data/2_label_csv/data.json | 2 +-
 .../verticals/data/3_combine_csv/data.json | 5 +-
 .../artifacts_in/file1.csv | 12 +
 .../artifacts_out/output.txt | 1 +
 .../4_answer_question_small_csv/data.json | 32 +
 .../artifacts_in/file1.csv | 305 ++++++++
 .../artifacts_out/output.txt | 1 +
 .../data/5_answer_question_csv/data.json | 31 +
 .../artifacts_in/file1.csv | 305 ++++++++
 .../artifacts_in/file2.csv | 305 ++++++++
 .../artifacts_out/output.txt | 1 +
 .../6_answer_question_combine_csv/data.json | 33 +
 .../1_sort_csv/artifacts_in/input.csv | 5 -
 .../1_sort_csv/artifacts_out/output.csv | 5 -
 .../verticals/generalist/1_sort_csv/data.json | 31 -
 .../verticals/scrape/1_search/data.json | 2 +-
 .../verticals/scrape/2_book_price/data.json | 2 +-
 .../artifacts_out/random_file.txt | 0
 .../data.json | 5 +-
 .../scrape/4_revenue_retrieval_2/data.json | 2 +-
 .../scrape/5_get_information/data.json | 5 +-
 .../synthesize/1_basic_content_gen/data.json | 5 +-
 benchmark/frontend/public/graph.json | 237 ++++--
 benchmark/poetry.lock | 145 ++--
 benchmark/tests/test_benchmark_workflow.py | 4 +-
 frontend/assets/coding_tree_structure.json | 28 +-
 frontend/assets/data_tree_structure.json | 178 ++++-
 frontend/assets/general_tree_structure.json | 722 +++++++++++++++++-
 .../scrape_synthesize_tree_structure.json | 25 +-
 frontend/assets/tree_structure.json | 237 ++++--
 86 files changed, 2523 insertions(+), 496 deletions(-)
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/read_file/data.json
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/search/data.json
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt
 delete mode 100644 benchmark/agbenchmark/challenges/deprecated/interface/write_file/data.json
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_in/file1.csv
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_out/output.txt
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/data.json
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_in/file1.csv
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_out/output.txt
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/data.json
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file1.csv
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file2.csv
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_out/output.txt
 create mode 100644 benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/data.json
 delete mode 100644 benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_in/input.csv
 delete mode 100644 benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_out/output.csv
 delete mode 100644 benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/data.json
 rename benchmark/agbenchmark/challenges/verticals/scrape/{3_revenue_retrieval_2 => 3_revenue_retrieval}/artifacts_out/random_file.txt (100%)
 rename benchmark/agbenchmark/challenges/verticals/scrape/{3_revenue_retrieval_2 => 3_revenue_retrieval}/data.json (88%)

diff --git a/autogpts/forge/poetry.lock b/autogpts/forge/poetry.lock
index 696b4843..3a66cbe0 100644
--- a/autogpts/forge/poetry.lock
+++ b/autogpts/forge/poetry.lock
@@ -368,75 +368,63 @@
files = [ [[package]] name = "cffi" -version = "1.15.1" +version = "1.16.0" description = "Foreign Function Interface for Python calling C code." optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, - {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, - {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, - {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, - {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, 
- {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, - {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, - {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, - {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, - {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, - {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, - {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, - {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, - {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, - {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, - {file = 
"cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, - {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, - {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, - {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, - {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, - {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, + {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"}, + {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"}, + {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"}, + {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"}, + {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"}, + {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"}, + {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"}, + {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"}, + {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"}, + {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"}, + {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"}, + {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"}, + {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"}, + {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"}, + {file = 
"cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"}, + {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"}, + {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"}, + {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"}, + {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"}, + {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"}, + {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"}, + {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"}, + {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"}, + {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"}, + {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"}, + {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"}, + {file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"}, + {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"}, ] [package.dependencies] @@ -794,15 +782,19 @@ test-no-images = ["pytest", "pytest-cov", "wurlitzer"] [[package]] name = "cycler" -version = "0.11.0" +version = "0.12.0" description = "Composable style cycles" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"}, - {file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"}, + {file = "cycler-0.12.0-py3-none-any.whl", hash = "sha256:7896994252d006771357777d0251f3e34d266f4fa5f2c572247a80ab01440947"}, + {file = "cycler-0.12.0.tar.gz", hash = "sha256:8cc3a7b4861f91b1095157f9916f748549a617046e67eb7619abed9b34d2c94a"}, ] +[package.extras] +docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] +tests = ["pytest", "pytest-cov", "pytest-xdist"] + [[package]] name = "decorator" version = "5.1.1" @@ -3656,13 +3648,13 @@ anyio = ">=3.0.0" [[package]] name = "wcwidth" -version = "0.2.6" +version = "0.2.7" description = "Measures the displayed width of unicode strings in a terminal" optional = false python-versions = "*" files = [ - {file = "wcwidth-0.2.6-py2.py3-none-any.whl", hash = "sha256:795b138f6875577cd91bba52baf9e445cd5118fd32723b460e30a0af30ea230e"}, - {file = "wcwidth-0.2.6.tar.gz", hash = "sha256:a5220780a404dbe3353789870978e472cfe477761f06ee55077256e509b156d0"}, + {file = "wcwidth-0.2.7-py2.py3-none-any.whl", hash = "sha256:fabf3e32999d9b0dab7d19d845149f326f04fe29bac67709ee071dbd92640a36"}, + {file = "wcwidth-0.2.7.tar.gz", hash = "sha256:1b6d30a98ddd5ce9bbdb33658191fd2423fc9da203fe3ef1855407dcb7ee4e26"}, ] [[package]] diff --git a/benchmark/agbenchmark/challenges/abilities/read_file/data.json b/benchmark/agbenchmark/challenges/abilities/read_file/data.json index 63a2b4a4..74315965 100644 --- a/benchmark/agbenchmark/challenges/abilities/read_file/data.json +++ b/benchmark/agbenchmark/challenges/abilities/read_file/data.json @@ -9,7 +9,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { diff --git a/benchmark/agbenchmark/challenges/abilities/write_file/data.json b/benchmark/agbenchmark/challenges/abilities/write_file/data.json index e27590de..d7600a78 100644 --- a/benchmark/agbenchmark/challenges/abilities/write_file/data.json +++ b/benchmark/agbenchmark/challenges/abilities/write_file/data.json @@ -7,7 +7,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { diff --git a/benchmark/agbenchmark/challenges/alignment/1_distraction/data_draft.json b/benchmark/agbenchmark/challenges/alignment/1_distraction/data_draft.json index 34958140..f5eae494 100644 --- a/benchmark/agbenchmark/challenges/alignment/1_distraction/data_draft.json +++ 
b/benchmark/agbenchmark/challenges/alignment/1_distraction/data_draft.json @@ -7,7 +7,6 @@ "dependencies": [ "TestReadFile" ], - "eval_id": "fc6643da-8965-4f1d-b692-5ac8d56a3de8", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/alignment/2_injection/data_draft.json b/benchmark/agbenchmark/challenges/alignment/2_injection/data_draft.json index 09ee25f3..44ba9a3c 100644 --- a/benchmark/agbenchmark/challenges/alignment/2_injection/data_draft.json +++ b/benchmark/agbenchmark/challenges/alignment/2_injection/data_draft.json @@ -7,7 +7,6 @@ "dependencies": [ "TestRememberGoalSimple" ], - "eval_id": "ae4ef0b8-9fe1-4a24-beda-8cae5eb9b9e5", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/1_tesla_revenue/data.json b/benchmark/agbenchmark/challenges/deprecated/1_tesla_revenue/data.json index c87bc6e9..0f82bdce 100644 --- a/benchmark/agbenchmark/challenges/deprecated/1_tesla_revenue/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/1_tesla_revenue/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestBasicRetrieval" ], - "eval_id": "c2b3d19d-c179-4132-bf7b-b3f9dc629f58", + "eval_id": "2d64d7a5-d664-4b86-9921-0b5e3aa9cf91", "ground": { "answer": "It was $81.462 billion in 2022.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/2_specific/data.json b/benchmark/agbenchmark/challenges/deprecated/2_specific/data.json index 8e3a5228..b650d458 100644 --- a/benchmark/agbenchmark/challenges/deprecated/2_specific/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/2_specific/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval_1.0" ], - "eval_id": "1ab338b1-2213-4c21-8d14-17baf8237416", + "eval_id": "b79898bb-263a-4184-8e4d-0aa52838bfdb", "ground": { "answer": "It was $81.462 billion in 2022.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/3_formatting/data.json b/benchmark/agbenchmark/challenges/deprecated/3_formatting/data.json index 46883901..72c1e15a 100644 --- a/benchmark/agbenchmark/challenges/deprecated/3_formatting/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/3_formatting/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval1.1" ], - "eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d", + "eval_id": "838128f9-79ee-45cf-8a8f-c19b0d576a76", "ground": { "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json b/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json index 9b0b96e5..8328ca92 100644 --- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestDebugSimpleTypoWithGuidance" ], - "eval_id": "5d0b8017-8231-4494-be7c-188f1fac24e7", + "eval_id": "38671c68-89ea-4c51-92a5-1bc35a033c49", "ground": { "answer": "Same as the original DebugWithGuidance test - [0, 1] [2, 5] [0, 3]", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json b/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json index 8966bb93..bc95c48d 100644 --- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval1.0" ], - "eval_id": "09fed110-077a-4b99-8821-ed071977cebe", + "eval_id": "9d4894d8-6f7c-465a-bc91-ca79a21b6ca3", "ground": { "answer": "It was $81.462 billion in 2022.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json b/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json index 302e3eaf..55d5402e 100644 --- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestBasicRetrieval" ], - "eval_id": "7fc79ad5-fd08-451a-8782-5beeba6822f4", + "eval_id": "261ee06f-a7b0-4d5c-bf92-3197763caba6", "ground": { "answer": "\u00a325.89", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/1_list_animals/data.json b/benchmark/agbenchmark/challenges/deprecated/code/1_list_animals/data.json index da929c16..530f20c3 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/1_list_animals/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/1_list_animals/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestWritingCLIFileOrganizer" ], - "eval_id": "270543c8-e101-44e8-8c3f-c2317afd54de", + "eval_id": "94ef736e-c2f1-4fa9-8cbf-a1c0873ee1ee", "ground": { "answer": "A web app where we can list animals and have details about dogs.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/1_password_generator/data.json b/benchmark/agbenchmark/challenges/deprecated/code/1_password_generator/data.json index b6d501b2..01dd0afc 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/1_password_generator/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/1_password_generator/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "f5079f72-ba53-4273-b7e9-fbbd66279ce7", + "eval_id": "15686763-9be7-41e0-902a-80a99fd88089", "ground": { "answer": "password_generator.py is created and satisfies the requirements.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/1_return/data.json b/benchmark/agbenchmark/challenges/deprecated/code/1_return/data.json index 3e53fc7a..17f47ad4 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/1_return/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/1_return/data.json @@ -7,7 +7,7 @@ "dependencies": 
[ "TestReadFile" ], - "eval_id": "50ec32d0-f957-42ee-a01d-160c77aaf4ae", + "eval_id": "bb23fa8c-6df9-410e-8845-bb2d1ebe0c12", "ground": { "answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/2_file_organizer/data.json b/benchmark/agbenchmark/challenges/deprecated/code/2_file_organizer/data.json index c476dbdf..bc3b6253 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/2_file_organizer/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/2_file_organizer/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestPasswordGeneratorEasy" ], - "eval_id": "b1d6a2c5-6382-4e58-99e3-9ceb2cc02f28", + "eval_id": "d6bbefcc-0ee5-4190-b8a1-3721d016f849", "ground": { "answer": "The correct python file is written and organizes the files accordingly", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/2_write/data.json b/benchmark/agbenchmark/challenges/deprecated/code/2_write/data.json index 0265f679..379b19b5 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/2_write/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/2_write/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReturnCodeSimple" ], - "eval_id": "d36bf9f5-ee0f-4d84-b4d8-71f20e8454d5", + "eval_id": "a59a1904-e9d6-443b-adb7-2e1ff972843f", "ground": { "answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/3_modify/data.json b/benchmark/agbenchmark/challenges/deprecated/code/3_modify/data.json index 6f7ab5db..835ac004 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/3_modify/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/3_modify/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReturnCodeWrite" ], - "eval_id": "cffd9e7f-a1d5-478b-a04d-9504f372639a", + "eval_id": "092f3c8a-9723-4262-8e40-93d0cebba98a", "ground": { "answer": "def multiply_int(num, multiplier):\n return num * multiplier\n", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/4_tests/data.json b/benchmark/agbenchmark/challenges/deprecated/code/4_tests/data.json index c9805732..e199b64e 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/4_tests/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/4_tests/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReturnCodeModify" ], - "eval_id": "c1d35de4-3a0c-410f-95d2-745009de9ee8", + "eval_id": "d39b8ed1-5984-40b0-8de6-a1c5eec30bc7", "ground": { "answer": "Just a simple multiple by 2 function. 
Num is 4 so answer is 8", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/d2.1_guided/data.json b/benchmark/agbenchmark/challenges/deprecated/code/d2.1_guided/data.json index b32a9946..74351f7f 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/d2.1_guided/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/d2.1_guided/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReadFile" ], - "eval_id": "9984c549-b44d-40f7-8eff-1901ae35f050", + "eval_id": "a758335b-539b-4d8a-b90e-cf7036952294", "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/d2.2_vague/data.json b/benchmark/agbenchmark/challenges/deprecated/code/d2.2_vague/data.json index 339efec6..e1e34204 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/d2.2_vague/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/d2.2_vague/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestDebugSimpleTypoWithGuidance" ], - "eval_id": "3cebb926-ec58-4ad6-ab2e-a2d4a7f6f30d", + "eval_id": "1d171b68-0374-4b08-ae6a-c7223f89ecc1", "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/d2.3_import/data.json b/benchmark/agbenchmark/challenges/deprecated/code/d2.3_import/data.json index d30add17..10aaef53 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/d2.3_import/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/d2.3_import/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestDebugSimpleTypoWithoutGuidance" ], - "eval_id": "6fbabc52-c9b0-43e7-afbc-4b8d64c1e4da", + "eval_id": "f537c143-ab40-4a95-8cf2-ab90cb829ca5", "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/d3.1_three_sum/data.json b/benchmark/agbenchmark/challenges/deprecated/code/d3.1_three_sum/data.json index 761bc624..cecbfddc 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/d3.1_three_sum/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/d3.1_three_sum/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestFunctionCodeGeneration" ], - "eval_id": "8481ff39-9edf-49eb-ab88-ac5368523b71", + "eval_id": "a38396b8-8f61-49fc-a973-0876a4b6b5e9", "ground": { "answer": "The three_sum function coded properly.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/code/d3_two_sum/data.json b/benchmark/agbenchmark/challenges/deprecated/code/d3_two_sum/data.json index a6a5778c..e5a818a4 100644 --- a/benchmark/agbenchmark/challenges/deprecated/code/d3_two_sum/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/code/d3_two_sum/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestReturnCodeWrite" ], - "eval_id": "55a60811-a3a1-4a31-b939-77d52dac1749", + "eval_id": "c6703d23-7d2d-4b9b-a729-8014df9a7b4e", "ground": { "answer": "The two_sum function coded properly.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json b/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json index 2ab5a51b..94ecc1e6 100644 --- a/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "bd922943-ffc6-4471-b3f1-5bd8e875c1d8", + "eval_id": "6ff65567-eb1e-4c7d-8b7f-dfc91dc95ed1", "ground": { "answer": "1. Search for flights to Toronto.\n2. Select the cheapest flight.\n3. Book the flight.\n4. 
Pay for the flight.\n5. Confirm the flight.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json index 5f99d69b..0ef6f3dd 100644 --- a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReadFile" ], - "eval_id": "1ce0ccdd-cbe3-4000-a2a4-86d9c147fcfe", + "eval_id": "e3794d05-7617-4b95-9e96-d5f84309f66f", "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt b/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt deleted file mode 100644 index 980a0d5f..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_in/file_to_read.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt b/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt deleted file mode 100644 index 980a0d5f..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/file_to_check.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt b/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt deleted file mode 100644 index 980a0d5f..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/artifacts_out/output.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/data.json b/benchmark/agbenchmark/challenges/deprecated/interface/read_file/data.json deleted file mode 100644 index 8bc3be8f..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/read_file/data.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "category": [ - "interface" - ], - "cutoff": 60, - "dependencies": [ - "TestWriteFile" - ], - "eval_id": "11b6f96f-6712-4a70-98b7-882ee36d7f2a", - "ground": { - "answer": "The content of output.txt should be 'Hello World!'", - "eval": { - "type": "file" - }, - "files": [ - "output.txt" - ], - "should_contain": [ - "Hello World!" - ] - }, - "info": { - "description": "tests the ability for an agent to read a file.", - "difficulty": "interface", - "side_effects": [ - "" - ] - }, - "name": "ReadFile", - "task": "Read the file called file_to_read.txt and write its content to a file called output.txt" -} diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt b/benchmark/agbenchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt deleted file mode 100644 index 03566759..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/search/artifacts_out/random_file.txt +++ /dev/null @@ -1,2 +0,0 @@ -This is a Heading -This is a paragraph. 
\ No newline at end of file diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/search/data.json b/benchmark/agbenchmark/challenges/deprecated/interface/search/data.json deleted file mode 100644 index 5fea07e3..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/search/data.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "category": [ - "interface" - ], - "cutoff": 120, - "dependencies": [ - "TestWriteFile" - ], - "eval_id": "3af8ea29-ae35-45dd-ab14-7a995ba4fc93", - "ground": { - "answer": "This is a Heading\nThis is a paragraph.", - "eval": { - "type": "file" - }, - "files": [ - ".txt" - ], - "should_contain": [ - "Heading", - "paragraph" - ], - "should_not_contain": [ - "The", - "the" - ] - }, - "info": { - "description": "Tests if an llm can search", - "difficulty": "interface", - "side_effects": [ - "" - ] - }, - "name": "Search", - "task": "Open 'https://silennaihin.com/random/plain.html' and paste all of the text on the page in a .txt file" -} diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt b/benchmark/agbenchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt deleted file mode 100644 index 1f275fb9..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/write_file/artifacts_out/random_file.txt +++ /dev/null @@ -1 +0,0 @@ -Washington diff --git a/benchmark/agbenchmark/challenges/deprecated/interface/write_file/data.json b/benchmark/agbenchmark/challenges/deprecated/interface/write_file/data.json deleted file mode 100644 index b6406aa3..00000000 --- a/benchmark/agbenchmark/challenges/deprecated/interface/write_file/data.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "category": [ - "interface" - ], - "cutoff": 60, - "dependencies": [], - "eval_id": "5242ffc3-a7a9-4a92-a72a-2141f1206f76", - "ground": { - "answer": "The word 'Washington', printed to a .txt file named anything", - "eval": { - "type": "file" - }, - "files": [ - ".txt" - ], - "should_contain": [ - "Washington" - ], - "should_not_contain": [] - }, - "info": { - "description": "Tests the agents ability to write to a file", - "difficulty": "interface", - "side_effects": [ - "" - ] - }, - "name": "WriteFile", - "task": "Write the word 'Washington' to a .txt file" -} diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/data.json b/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/data.json index 8fbe53e0..de88ba64 100644 --- a/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/memory/m1_id/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestReadFile" ], - "eval_id": "0f0e9d4f-418d-4b43-9f05-20134dc77ed8", + "eval_id": "00227b7f-8952-4577-bfdb-c75db9f1fb19", "ground": { "answer": "2314", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/data.json b/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/data.json index 23707693..77fffcdf 100644 --- a/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/memory/m2_multiple/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestBasicMemory" ], - "eval_id": "ce6e588b-3ac6-45bb-892e-cfed7c7724bf", + "eval_id": "9c48c169-0cf0-46d0-9985-a31710bf398b", "ground": { "answer": "3145\n3791\n9317\n9471", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/data.json b/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/data.json index 
d4d4ae7c..850a9c5c 100644 --- a/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/memory/m3_noise/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRememberMultipleIds" ], - "eval_id": "29a88045-67cf-45c1-a509-9a5f53a61ad5", + "eval_id": "480eaff6-e287-4741-9fe0-a4634e0ad491", "ground": { "answer": "3145\n3791\n9317\n9471", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/data.json b/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/data.json index cb2651dd..0f62d5c0 100644 --- a/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/memory/m4_phrases/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRememberMultipleIdsWithNoise" ], - "eval_id": "1c5d4196-4171-4236-a944-d2905d2dc3c1", + "eval_id": "0c8cb5e5-7f7a-4475-977b-68ac2673d77a", "ground": { "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/1_tesla_revenue/data.json b/benchmark/agbenchmark/challenges/deprecated/retrieval/1_tesla_revenue/data.json index c8d9708f..2cd9ab20 100644 --- a/benchmark/agbenchmark/challenges/deprecated/retrieval/1_tesla_revenue/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/retrieval/1_tesla_revenue/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestBasicRetrieval" ], - "eval_id": "061eab17-1104-4ccf-ad28-6614cd1d12ea", + "eval_id": "c379905b-b7d7-49ea-89d8-9b0c113db75f", "ground": { "answer": "It was $81.462 billion in 2022.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/2_specific/data.json b/benchmark/agbenchmark/challenges/deprecated/retrieval/2_specific/data.json index c69293f7..17d61f03 100644 --- a/benchmark/agbenchmark/challenges/deprecated/retrieval/2_specific/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/retrieval/2_specific/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval1.0" ], - "eval_id": "85be4356-043c-46a3-91db-23f4bc9d32b0", + "eval_id": "d9a4b0ff-628c-42d2-99ad-ab6053f1c5bb", "ground": { "answer": "It was $81.462 billion in 2022.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/3_formatting/data.json b/benchmark/agbenchmark/challenges/deprecated/retrieval/3_formatting/data.json index 442a3616..e3720860 100644 --- a/benchmark/agbenchmark/challenges/deprecated/retrieval/3_formatting/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/retrieval/3_formatting/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval1.1" ], - "eval_id": "02c4f6ec-3246-4677-afb9-c9c836fd78a6", + "eval_id": "e92b86ba-63f3-4322-8f98-4970190d1e69", "ground": { "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/data.json b/benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/data.json index 77190c86..33507af8 100644 --- a/benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/retrieval/r1_book_price/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestSearch" ], - "eval_id": "2066032d-2228-41d6-b05d-420fff5b1ce0", + "eval_id": "8322a2c3-19e9-46ee-9ae0-6de0ae95becc", "ground": { "answer": "\u00a325.89", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/retrieval/r3/data.json b/benchmark/agbenchmark/challenges/deprecated/retrieval/r3/data.json index 725a75b4..6e46e85f 100644 --- a/benchmark/agbenchmark/challenges/deprecated/retrieval/r3/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/retrieval/r3/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval1.2" ], - "eval_id": "202ee93e-bae1-4889-9e9f-88fb0ed9fe00", + "eval_id": "77df2ad6-ae8f-42f0-9a94-fc92c9f88fdd", "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/1_simple/data.json b/benchmark/agbenchmark/challenges/deprecated/safety/1_simple/data.json index b2a6ad6c..a46cd5a5 100644 --- a/benchmark/agbenchmark/challenges/deprecated/safety/1_simple/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/safety/1_simple/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReadFile" ], - "eval_id": "c61408a4-387d-4731-9ece-2f77b968f0b8", + "eval_id": "d5a39a76-c804-4478-a022-9a808db6152a", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/2_medium/data.json b/benchmark/agbenchmark/challenges/deprecated/safety/2_medium/data.json index be9a925c..7672c539 100644 --- a/benchmark/agbenchmark/challenges/deprecated/safety/2_medium/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/safety/2_medium/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestRememberGoal_Simple" ], - "eval_id": "fc2e0963-2f22-49c1-b566-ab7022212687", + "eval_id": "aae6a6eb-fa86-498a-9178-b7be733c6ffc", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/3_advanced/data.json b/benchmark/agbenchmark/challenges/deprecated/safety/3_advanced/data.json index c3b01124..8ee39dc6 100644 --- a/benchmark/agbenchmark/challenges/deprecated/safety/3_advanced/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/safety/3_advanced/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestRememberGoal_Medium" ], - "eval_id": "de0f553d-194f-4853-9646-eb035133fd61", + "eval_id": "29241c0f-594f-4843-b0e2-8230cb8784fd", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/4_hard/data.json b/benchmark/agbenchmark/challenges/deprecated/safety/4_hard/data.json index cd629b8c..fb36a104 100644 --- a/benchmark/agbenchmark/challenges/deprecated/safety/4_hard/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/safety/4_hard/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestRememberGoal_Advanced" ], - "eval_id": 
"6e79c281-cda3-4604-b60d-3629bbc5faba", + "eval_id": "290272fa-36e1-4c75-b58f-eb76f4a938b7", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data.json b/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data.json index 726d05cf..0f4e31c9 100644 --- a/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/safety/s2_divergence/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestReadFile" ], - "eval_id": "7bfe83c5-ba16-4fba-9edb-d0e8837c0abc", + "eval_id": "0bc68658-389f-4427-94af-9b925df7afe4", "ground": { "answer": "All of the elements should be contained in the output files", "eval": { diff --git a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data.json b/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data.json index 98ffe5e4..6d8f2240 100644 --- a/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/safety/s3_instructions/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestWrite5FilesWithArray" ], - "eval_id": "54c4fd15-e8e5-4783-94b6-2e25ef8ef388", + "eval_id": "0609fd6e-a753-4f50-89a0-0dc81ec58994", "ground": { "answer": "It should copy the above text with no loss", "eval": { diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/data.json b/benchmark/agbenchmark/challenges/library/ethereum/check_price/data.json index 2cc26141..b3890052 100644 --- a/benchmark/agbenchmark/challenges/library/ethereum/check_price/data.json +++ b/benchmark/agbenchmark/challenges/library/ethereum/check_price/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "c7372901-5fc0-4431-8c87-302fe3583ab7", + "eval_id": "d14d6a59-a355-424c-a24b-a8aca580e32c", "ground": { "answer": "Should be a number within a 50 USD range of the actual price according to coingecko.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/data.json b/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/data.json index 8994f892..f89c90c6 100644 --- a/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/data.json +++ b/benchmark/agbenchmark/challenges/verticals/code/1_three_sum/data.json @@ -1,12 +1,13 @@ { "category": [ - "coding" + "coding", + "general" ], "cutoff": 60, "dependencies": [ "TestWriteFile" ], - "eval_id": "29a10990-2584-4602-8b9d-c217f6edbc4f", + "eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4", "ground": { "answer": "The three_sum function coded properly.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json b/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json index b5a471af..a08200e5 100644 --- a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json +++ b/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestThreeSum" ], - "eval_id": "0823b577-64f2-477b-856d-16726fe464b0", + "eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f", "ground": { "answer": "password_generator.py is created and satisfies the requirements.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/data.json b/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/data.json index ba56a9f9..c732990e 100644 --- 
a/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/data.json +++ b/benchmark/agbenchmark/challenges/verticals/code/3_file_organizer/data.json @@ -1,12 +1,13 @@ { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestPasswordGenerator" ], - "eval_id": "6ace62be-6c18-431a-947f-72fb20984b58", + "eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e", "ground": { "answer": "The correct python file is written and organizes the files accordingly", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/data.json b/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/data.json index 08e9aa90..e3953140 100644 --- a/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/data.json +++ b/benchmark/agbenchmark/challenges/verticals/code/4_url_shortener/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestFileOrganizer" ], - "eval_id": "41ca1035-ceca-4e0c-91ab-66ed0b350273", + "eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c", "ground": { "answer": "The correct python file for a basic url shortener CLI", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/data.json b/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/data.json index e0c27ecb..63f19ce6 100644 --- a/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/data.json +++ b/benchmark/agbenchmark/challenges/verticals/code/5_tic_tac_toe/data.json @@ -1,12 +1,13 @@ { "category": [ - "coding" + "coding", + "general" ], "cutoff": 150, "dependencies": [ "TestUrlShortener" ], - "eval_id": "54c3d7e9-71d6-476b-b045-cf0aaf118f95", + "eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0", "ground": { "answer": "The correct python file for a TicTacToe game is written", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/data.json b/benchmark/agbenchmark/challenges/verticals/code/6_battleship/data.json index a61e4a05..023a7b8f 100644 --- a/benchmark/agbenchmark/challenges/verticals/code/6_battleship/data.json +++ b/benchmark/agbenchmark/challenges/verticals/code/6_battleship/data.json @@ -1,13 +1,14 @@ { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestTicTacToe", "TestReadFile" ], - "eval_id": "5a32418d-1c3a-4af1-8dc4-8d4c29bed21a", + "eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1", "ground": { "answer": "The implementation of battleship that passes all the tests.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/data/1_sort_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/1_sort_csv/data.json index 8515af89..00370108 100644 --- a/benchmark/agbenchmark/challenges/verticals/data/1_sort_csv/data.json +++ b/benchmark/agbenchmark/challenges/verticals/data/1_sort_csv/data.json @@ -1,12 +1,13 @@ { "category": [ - "data" + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestReadFile" ], - "eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15", + "eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95", "ground": { "answer": "The csv sorted by date", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json index d190b5c5..7a952159 100644 --- a/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json +++ b/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestSortCsv" ], - "eval_id": "6c58e229-aa22-4c4f-a053-4a78931ad41e", + "eval_id": 
"6e2bf1f0-6842-4704-8ed1-b17c2065bbac", "ground": { "answer": "The csv labelled", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/data/3_combine_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/3_combine_csv/data.json index 68578206..3964785f 100644 --- a/benchmark/agbenchmark/challenges/verticals/data/3_combine_csv/data.json +++ b/benchmark/agbenchmark/challenges/verticals/data/3_combine_csv/data.json @@ -1,12 +1,13 @@ { "category": [ - "data" + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestLabelCsv" ], - "eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375", + "eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b", "ground": { "answer": "The csv data is combined", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_in/file1.csv b/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_in/file1.csv new file mode 100644 index 00000000..55de8371 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_in/file1.csv @@ -0,0 +1,12 @@ +Date Description Amount Category +2023-01-01 Grocery Store 52.3 Groceries +2023-01-02 Pharmacy 12.5 Healthcare +2023-01-03 Gas Station 29.1 Transportation +2023-01-04 Water 19 Utilities +2023-01-05 Grocery Store 60.25 Groceries +2023-01-06 Coffee Shop 4.5 Dining +2023-01-07 Cinema Tickets 20 Entertainment +2023-01-08 Book Store 30.4 Shopping +2023-01-09 Restaurant Dinner 55.8 Dining +2023-01-10 Electric Bill 65.35 Utilities +2023-01-11 Grocery Store 45.1 Groceries diff --git a/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_out/output.txt b/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_out/output.txt new file mode 100644 index 00000000..871727de --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/artifacts_out/output.txt @@ -0,0 +1 @@ +84 diff --git a/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/data.json new file mode 100644 index 00000000..695fc6d2 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/4_answer_question_small_csv/data.json @@ -0,0 +1,32 @@ +{ + "category": [ + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "84" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a small csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "AnswerQuestionSmallCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
+} diff --git a/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_in/file1.csv b/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_in/file1.csv new file mode 100644 index 00000000..1915dfaa --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_in/file1.csv @@ -0,0 +1,305 @@ +Date Description Amount Category +2023-01-01 Grocery Store 52.3 Groceries +2023-01-02 Pharmacy 12.5 Healthcare +2023-01-03 Gas Station 29.1 Transportation +2023-01-04 Cinema Tickets 19 Entertainment +2023-01-05 Grocery Store 60.25 Groceries +2023-01-06 Coffee Shop 4.5 Dining +2023-01-07 Cinema Tickets 20 Entertainment +2023-01-08 Book Store 30.4 Shopping +2023-01-09 Restaurant Dinner 55.8 Dining +2023-01-10 Electric Bill 65.35 Utilities +2023-01-11 Grocery Store 45.1 Groceries +2023-01-12 Clothing Store 100.2 Shopping +2023-01-13 Pharmacy 20.3 Healthcare +2023-01-14 Coffee Shop 4.5 Dining +2023-01-15 Restaurant Dinner 50 Dining +2023-01-16 Gas Station 32.1 Transportation +2023-01-17 Online Shopping 80 Shopping +2023-01-18 Water Bill 20.35 Utilities +2023-01-19 Grocery Store 55.6 Groceries +2023-01-20 Gas Station 28 Transportation +2023-01-21 Pharmacy 15.4 Healthcare +2023-01-22 Phone Bill 40 Utilities +2023-01-23 Cinema Tickets 20 Entertainment +2023-01-24 Coffee Shop 5.5 Dining +2023-01-25 Book Purchase 14 Shopping +2023-01-26 Restaurant Lunch 30 Dining +2023-01-27 Public Transport 20 Transportation +2023-01-28 Grocery Store 58.25 Groceries +2023-01-29 Online Shopping 70 Shopping +2023-01-30 Grocery Store 62.1 Groceries +2023-01-31 Medical Prescription 10.4 Healthcare +2023-02-01 Gas Station 33 Transportation +2023-02-02 Coffee Shop 6 Dining +2023-02-03 Cinema Tickets 22 Entertainment +2023-02-04 Book Store 28.4 Shopping +2023-02-05 Internet Bill 50 Utilities +2023-02-06 Grocery Store 60.1 Groceries +2023-02-07 Clothing Store 120 Shopping +2023-02-08 Grocery Store 58.25 Groceries +2023-02-09 Coffee Shop 4.5 Dining +2023-02-10 Electric Bill 70 Utilities +2023-02-11 Grocery Store 50.1 Groceries +2023-02-12 Public Transport 18 Transportation +2023-02-13 Pharmacy 24 Healthcare +2023-02-14 Restaurant Dinner 60 Dining +2023-02-15 Medical Prescription 11.4 Healthcare +2023-02-16 Gas Station 30 Transportation +2023-02-17 Online Shopping 85 Shopping +2023-02-18 Water Bill 18 Utilities +2023-02-19 Grocery Store 53.6 Groceries +2023-02-20 Public Transport 22 Transportation +2023-02-21 Pharmacy 10 Healthcare +2023-02-22 Phone Bill 42 Utilities +2023-02-23 Cinema Tickets 24 Entertainment +2023-02-24 Coffee Shop 6 Dining +2023-02-25 Book Purchase 16 Shopping +2023-02-26 Restaurant Lunch 28 Dining +2023-02-27 Gas Station 34 Transportation +2023-02-28 Grocery Store 56 Groceries +2023-03-01 Online Shopping 90 Groceries +2023-03-02 Dentist Appointment 130 Healthcare +2023-03-03 Grocery Store 63.45 Groceries +2023-03-04 Cinema Tickets 21 Entertainment +2023-03-05 Coffee Shop 5.8 Dining +2023-03-06 Electric Bill 67.5 Utilities +2023-03-07 Gas Station 31.2 Transportation +2023-03-08 Restaurant Dinner 58 Dining +2023-03-09 Pharmacy 18.3 Healthcare +2023-03-10 Grocery Store 64.7 Groceries +2023-03-11 Book Store 25.4 Shopping +2023-03-12 Online Shopping 78 Shopping +2023-03-13 Coffee Shop 6.5 Dining +2023-03-14 Museum Tickets 15 Entertainment +2023-03-15 Internet Bill 52 Utilities +2023-03-16 Public Transport 19.5 Transportation +2023-03-17 Clothing Store 105.6 Shopping +2023-03-18 Phone Bill 41 Utilities +2023-03-19 Coffee Shop 5 Dining 
+2023-03-20 Grocery Store 59.2 Groceries +2023-03-21 Gas Station 29.8 Transportation +2023-03-22 Restaurant Lunch 32 Dining +2023-03-23 Pharmacy 16.5 Healthcare +2023-03-24 Concert Tickets 50 Entertainment +2023-03-25 Coffee Shop 5.5 Dining +2023-03-26 Grocery Store 61.8 Groceries +2023-03-27 Online Shopping 82 Shopping +2023-03-28 Water Bill 19.35 Utilities +2023-03-29 Public Transport 21 Transportation +2023-03-30 Book Purchase 17 Shopping +2023-03-31 Grocery Store 60 Groceries +2023-04-01 Cinema Tickets 23 Entertainment +2023-04-02 Pharmacy 17.4 Healthcare +2023-04-03 Gas Station 33.5 Transportation +2023-04-04 Restaurant Dinner 56.7 Dining +2023-04-05 Grocery Store 65.3 Groceries +2023-04-06 Coffee Shop 5.9 Dining +2023-04-07 Online Shopping 87 Shopping +2023-04-08 Electric Bill 69 Utilities +2023-04-09 Clothing Store 112.5 Shopping +2023-04-10 Grocery Store 57.4 Groceries +2023-04-11 Book Store 26.3 Shopping +2023-04-12 Gas Station 30.9 Transportation +2023-04-13 Coffee Shop 6.8 Dining +2023-04-14 Zoo Tickets 24 Entertainment +2023-04-15 Internet Bill 53 Utilities +2023-04-16 Public Transport 20.5 Transportation +2023-04-17 Restaurant Lunch 34 Dining +2023-04-18 Phone Bill 43 Utilities +2023-04-19 Coffee Shop 5.2 Dining +2023-04-20 Grocery Store 58.9 Groceries +2023-04-21 Pharmacy 14.7 Healthcare +2023-04-22 Cinema Tickets 25 Entertainment +2023-04-23 Online Shopping 90 Shopping +2023-04-24 Gas Station 31.4 Transportation +2023-04-25 Water Bill 21 Utilities +2023-04-26 Grocery Store 62.5 Groceries +2023-04-27 Coffee Shop 5.7 Dining +2023-04-28 Book Purchase 18.5 Shopping +2023-04-29 Public Transport 22 Transportation +2023-04-30 Grocery Store 63 Groceries +2023-05-01 Theater Tickets 45 Entertainment +2023-05-02 Dentist Appointment 135 Healthcare +2023-05-03 Gas Station 32.2 Transportation +2023-05-04 Restaurant Dinner 59 Dining +2023-05-05 Grocery Store 66.1 Groceries +2023-05-06 Coffee Shop 6 Dining +2023-05-07 Online Shopping 89 Shopping +2023-05-08 Electric Bill 70.5 Utilities +2023-05-09 Clothing Store 110 Shopping +2023-05-10 Grocery Store 59.7 Groceries +2023-05-11 Coffee Shop 6.1 Dining +2023-05-12 Book Store 29.2 Shopping +2023-05-13 Gas Station 29.9 Transportation +2023-05-14 Museum Tickets 16 Entertainment +2023-05-15 Internet Bill 52.5 Utilities +2023-05-16 Public Transport 21.3 Transportation +2023-05-17 Restaurant Lunch 35.4 Dining +2023-05-18 Phone Bill 43.5 Utilities +2023-05-19 Grocery Store 64.8 Groceries +2023-05-20 Pharmacy 15.2 Healthcare +2023-05-21 Cinema Tickets 26 Entertainment +2023-05-22 Coffee Shop 6.3 Dining +2023-05-23 Gas Station 30.8 Transportation +2023-05-24 Online Shopping 92.5 Shopping +2023-05-25 Water Bill 20.5 Utilities +2023-05-26 Grocery Store 61.9 Groceries +2023-05-27 Public Transport 23 Transportation +2023-05-28 Book Purchase 19 Shopping +2023-05-29 Coffee Shop 5.9 Dining +2023-05-30 Restaurant Dinner 57.8 Dining +2023-05-31 Grocery Store 66.7 Groceries +2023-06-01 Theater Tickets 47 Entertainment +2023-06-02 Dentist Appointment 140 Healthcare +2023-06-03 Gas Station 31.6 Transportation +2023-06-04 Coffee Shop 6.4 Dining +2023-06-05 Online Shopping 94 Shopping +2023-06-06 Electric Bill 72 Utilities +2023-06-07 Restaurant Lunch 36 Dining +2023-06-08 Grocery Store 65.3 Groceries +2023-06-09 Pharmacy 17 Healthcare +2023-06-10 Cinema Tickets 27.5 Entertainment +2023-06-11 Public Transport 21.5 Transportation +2023-06-12 Book Store 30 Shopping +2023-06-13 Gas Station 28.7 Transportation +2023-06-14 Coffee Shop 6.6 Dining +2023-06-15 Internet Bill 
53.5 Utilities +2023-06-16 Zoo Tickets 28 Entertainment +2023-06-17 Grocery Store 67.4 Groceries +2023-06-18 Phone Bill 44 Utilities +2023-06-19 Restaurant Dinner 60 Dining +2023-06-20 Coffee Shop 6.7 Dining +2023-06-21 Public Transport 22.5 Transportation +2023-06-22 Online Shopping 96 Shopping +2023-06-23 Gas Station 32.4 Transportation +2023-06-24 Cinema Tickets 29 Entertainment +2023-06-25 Book Purchase 20 Shopping +2023-06-26 Grocery Store 68.3 Groceries +2023-06-27 Water Bill 22 Utilities +2023-06-28 Pharmacy 18.5 Healthcare +2023-06-29 Restaurant Lunch 37 Dining +2023-06-30 Coffee Shop 7 Dining +2023-07-01 Grocery Store 69.5 Groceries +2023-07-02 Theater Tickets 49 Entertainment +2023-07-03 Gas Station 33.2 Transportation +2023-07-04 Park Picnic 40 Dining +2023-07-05 Electric Bill 73.5 Utilities +2023-07-06 Clothing Store 120 Shopping +2023-07-07 Online Shopping 98 Shopping +2023-07-08 Grocery Store 70.6 Groceries +2023-07-09 Coffee Shop 7.1 Dining +2023-07-10 Internet Bill 54 Utilities +2023-07-11 Public Transport 23.5 Transportation +2023-07-12 Museum Tickets 18 Entertainment +2023-07-13 Book Store 31 Shopping +2023-07-14 Gas Station 29.9 Transportation +2023-07-15 Coffee Shop 7.2 Dining +2023-07-16 Restaurant Dinner 62 Dining +2023-07-17 Grocery Store 71.8 Groceries +2023-07-18 Phone Bill 45 Utilities +2023-07-19 Zoo Tickets 30 Entertainment +2023-07-20 Coffee Shop 7.3 Dining +2023-07-21 Public Transport 24 Transportation +2023-07-22 Online Shopping 99.5 Shopping +2023-07-23 Gas Station 34 Transportation +2023-07-24 Cinema Tickets 31 Entertainment +2023-07-25 Book Purchase 21.5 Shopping +2023-07-26 Grocery Store 72.9 Groceries +2023-07-27 Water Bill 23.5 Utilities +2023-07-28 Pharmacy 19.5 Healthcare +2023-07-29 Restaurant Lunch 38.5 Dining +2023-07-30 Coffee Shop 7.4 Dining +2023-07-31 Grocery Store 73.7 Groceries +2023-08-01 Theater Tickets 50 Entertainment +2023-08-02 Gas Station 34.5 Transportation +2023-08-03 Restaurant Dinner 63.5 Dining +2023-08-04 Online Shopping 101 Shopping +2023-08-05 Electric Bill 75 Utilities +2023-08-06 Grocery Store 74.6 Groceries +2023-08-07 Coffee Shop 7.5 Dining +2023-08-08 Phone Bill 46 Utilities +2023-08-09 Public Transport 24.5 Transportation +2023-08-10 Cinema Tickets 32.5 Entertainment +2023-08-11 Book Store 32 Shopping +2023-08-12 Gas Station 35 Transportation +2023-08-13 Coffee Shop 7.6 Dining +2023-08-14 Park Picnic 42 Dining +2023-08-15 Internet Bill 55 Utilities +2023-08-16 Grocery Store 76.3 Groceries +2023-08-17 Clothing Store 125 Shopping +2023-08-18 Pharmacy 20.5 Healthcare +2023-08-19 Restaurant Lunch 40 Dining +2023-08-20 Coffee Shop 7.7 Dining +2023-08-21 Museum Tickets 19 Entertainment +2023-08-22 Public Transport 25 Transportation +2023-08-23 Online Shopping 103 Shopping +2023-08-24 Grocery Store 77.8 Groceries +2023-08-25 Water Bill 24.5 Utilities +2023-08-26 Zoo Tickets 32 Entertainment +2023-08-27 Coffee Shop 7.8 Dining +2023-08-28 Gas Station 35.5 Transportation +2023-08-29 Book Purchase 23 Shopping +2023-08-30 Grocery Store 78.9 Groceries +2023-08-31 Cinema Tickets 34 Entertainment +2023-09-01 Theater Tickets 52 Entertainment +2023-09-02 Gas Station 36 Transportation +2023-09-03 Restaurant Dinner 65 Dining +2023-09-04 Online Shopping 105 Shopping +2023-09-05 Electric Bill 76.5 Utilities +2023-09-06 Grocery Store 79.6 Groceries +2023-09-07 Coffee Shop 8 Dining +2023-09-08 Phone Bill 47 Utilities +2023-09-09 Public Transport 26 Transportation +2023-09-10 Cinema Tickets 35.5 Entertainment +2023-09-11 Book Store 33 Shopping 
+2023-09-12 Gas Station 36.5 Transportation +2023-09-13 Coffee Shop 8.2 Dining +2023-09-14 Park Picnic 44 Dining +2023-09-15 Internet Bill 56 Utilities +2023-09-16 Grocery Store 80.4 Groceries +2023-09-17 Clothing Store 130 Shopping +2023-09-18 Pharmacy 21.5 Healthcare +2023-09-19 Restaurant Lunch 41.5 Dining +2023-09-20 Coffee Shop 8.4 Dining +2023-09-21 Museum Tickets 20 Entertainment +2023-09-22 Public Transport 26.5 Transportation +2023-09-23 Online Shopping 107 Shopping +2023-09-24 Grocery Store 81.3 Groceries +2023-09-25 Water Bill 25.5 Utilities +2023-09-26 Zoo Tickets 33.5 Entertainment +2023-09-27 Coffee Shop 8.6 Dining +2023-09-28 Gas Station 37.5 Transportation +2023-09-29 Book Purchase 24.5 Shopping +2023-09-30 Grocery Store 82.7 Groceries +2023-10-01 Cinema Tickets 36 Entertainment +2023-10-02 Theater Tickets 54 Entertainment +2023-10-03 Gas Station 38 Transportation +2023-10-04 Restaurant Dinner 66.5 Dining +2023-10-05 Online Shopping 109 Shopping +2023-10-06 Electric Bill 78 Utilities +2023-10-07 Grocery Store 83.9 Groceries +2023-10-08 Coffee Shop 8.8 Dining +2023-10-09 Phone Bill 48 Utilities +2023-10-10 Public Transport 27.5 Transportation +2023-10-11 Cinema Tickets 37.5 Entertainment +2023-10-12 Book Store 34.5 Shopping +2023-10-13 Gas Station 39.5 Transportation +2023-10-14 Coffee Shop 9 Dining +2023-10-15 Park Picnic 46 Dining +2023-10-16 Internet Bill 57.5 Utilities +2023-10-17 Grocery Store 85.2 Groceries +2023-10-18 Clothing Store 135 Shopping +2023-10-19 Pharmacy 22.5 Healthcare +2023-10-20 Restaurant Lunch 43 Dining +2023-10-21 Coffee Shop 9.2 Dining +2023-10-22 Museum Tickets 21.5 Entertainment +2023-10-23 Public Transport 28 Transportation +2023-10-24 Online Shopping 111 Shopping +2023-10-25 Grocery Store 86.5 Groceries +2023-10-26 Water Bill 26.5 Utilities +2023-10-27 Zoo Tickets 35 Entertainment +2023-10-28 Coffee Shop 9.4 Dining +2023-10-29 Gas Station 40.5 Transportation +2023-10-30 Book Purchase 26 Shopping +2023-10-31 Grocery Store 88 Groceries diff --git a/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_out/output.txt b/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_out/output.txt new file mode 100644 index 00000000..e4869438 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/artifacts_out/output.txt @@ -0,0 +1 @@ +1861.55 diff --git a/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/data.json new file mode 100644 index 00000000..24b7179d --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/5_answer_question_csv/data.json @@ -0,0 +1,31 @@ +{ + "category": [ + "data" + ], + "cutoff": 90, + "dependencies": [ + "TestAnswerQuestionSmallCsv" + ], + "eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "AnswerQuestionCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
+} diff --git a/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file1.csv b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file1.csv new file mode 100644 index 00000000..7c6eddd6 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file1.csv @@ -0,0 +1,305 @@ +Category ID +Dining 6 +Dining 9 +Dining 14 +Dining 15 +Dining 24 +Dining 26 +Dining 33 +Dining 40 +Dining 45 +Dining 55 +Dining 57 +Dining 64 +Dining 67 +Dining 72 +Dining 78 +Dining 81 +Dining 84 +Dining 94 +Dining 96 +Dining 103 +Dining 107 +Dining 109 +Dining 117 +Dining 124 +Dining 126 +Dining 131 +Dining 137 +Dining 142 +Dining 149 +Dining 150 +Dining 155 +Dining 158 +Dining 165 +Dining 170 +Dining 171 +Dining 180 +Dining 181 +Dining 185 +Dining 190 +Dining 196 +Dining 197 +Dining 201 +Dining 210 +Dining 211 +Dining 215 +Dining 219 +Dining 225 +Dining 226 +Dining 231 +Dining 232 +Dining 239 +Dining 246 +Dining 250 +Dining 256 +Dining 257 +Dining 262 +Dining 263 +Dining 270 +Dining 277 +Dining 281 +Dining 287 +Dining 288 +Dining 293 +Dining 294 +Dining 301 +Entertainment 4 +Entertainment 7 +Entertainment 23 +Entertainment 34 +Entertainment 54 +Entertainment 63 +Entertainment 73 +Entertainment 83 +Entertainment 91 +Entertainment 104 +Entertainment 112 +Entertainment 121 +Entertainment 134 +Entertainment 141 +Entertainment 152 +Entertainment 161 +Entertainment 167 +Entertainment 175 +Entertainment 183 +Entertainment 193 +Entertainment 200 +Entertainment 205 +Entertainment 213 +Entertainment 222 +Entertainment 233 +Entertainment 238 +Entertainment 243 +Entertainment 244 +Entertainment 253 +Entertainment 264 +Entertainment 269 +Entertainment 274 +Entertainment 275 +Entertainment 284 +Entertainment 295 +Entertainment 300 +Groceries 1 +Groceries 5 +Groceries 11 +Groceries 19 +Groceries 28 +Groceries 30 +Groceries 37 +Groceries 39 +Groceries 42 +Groceries 50 +Groceries 59 +Groceries 60 +Groceries 62 +Groceries 69 +Groceries 79 +Groceries 85 +Groceries 90 +Groceries 95 +Groceries 100 +Groceries 110 +Groceries 116 +Groceries 120 +Groceries 125 +Groceries 130 +Groceries 139 +Groceries 146 +Groceries 151 +Groceries 159 +Groceries 168 +Groceries 177 +Groceries 182 +Groceries 189 +Groceries 198 +Groceries 207 +Groceries 212 +Groceries 218 +Groceries 228 +Groceries 236 +Groceries 242 +Groceries 249 +Groceries 259 +Groceries 267 +Groceries 273 +Groceries 280 +Groceries 290 +Groceries 298 +Groceries 304 +Healthcare 2 +Healthcare 13 +Healthcare 21 +Healthcare 31 +Healthcare 44 +Healthcare 46 +Healthcare 52 +Healthcare 61 +Healthcare 68 +Healthcare 82 +Healthcare 92 +Healthcare 111 +Healthcare 122 +Healthcare 140 +Healthcare 153 +Healthcare 160 +Healthcare 179 +Healthcare 209 +Healthcare 230 +Healthcare 261 +Healthcare 292 +Shopping 8 +Shopping 12 +Shopping 17 +Shopping 25 +Shopping 29 +Shopping 35 +Shopping 38 +Shopping 48 +Shopping 56 +Shopping 70 +Shopping 71 +Shopping 76 +Shopping 86 +Shopping 89 +Shopping 97 +Shopping 99 +Shopping 101 +Shopping 113 +Shopping 118 +Shopping 127 +Shopping 129 +Shopping 132 +Shopping 144 +Shopping 148 +Shopping 156 +Shopping 163 +Shopping 173 +Shopping 176 +Shopping 187 +Shopping 188 +Shopping 194 +Shopping 203 +Shopping 206 +Shopping 216 +Shopping 223 +Shopping 229 +Shopping 235 +Shopping 241 +Shopping 247 +Shopping 254 +Shopping 260 +Shopping 266 +Shopping 272 +Shopping 278 +Shopping 285 +Shopping 291 +Shopping 297 +Shopping 303 +Transportation 3 +Transportation 16 
+Transportation 20 +Transportation 27 +Transportation 32 +Transportation 43 +Transportation 47 +Transportation 51 +Transportation 58 +Transportation 66 +Transportation 75 +Transportation 80 +Transportation 88 +Transportation 93 +Transportation 102 +Transportation 106 +Transportation 114 +Transportation 119 +Transportation 123 +Transportation 133 +Transportation 136 +Transportation 143 +Transportation 147 +Transportation 154 +Transportation 162 +Transportation 164 +Transportation 172 +Transportation 174 +Transportation 184 +Transportation 192 +Transportation 195 +Transportation 202 +Transportation 204 +Transportation 214 +Transportation 221 +Transportation 224 +Transportation 234 +Transportation 240 +Transportation 245 +Transportation 252 +Transportation 255 +Transportation 265 +Transportation 271 +Transportation 276 +Transportation 283 +Transportation 286 +Transportation 296 +Transportation 302 +Utilities 10 +Utilities 18 +Utilities 22 +Utilities 36 +Utilities 41 +Utilities 49 +Utilities 53 +Utilities 65 +Utilities 74 +Utilities 77 +Utilities 87 +Utilities 98 +Utilities 105 +Utilities 108 +Utilities 115 +Utilities 128 +Utilities 135 +Utilities 138 +Utilities 145 +Utilities 157 +Utilities 166 +Utilities 169 +Utilities 178 +Utilities 186 +Utilities 191 +Utilities 199 +Utilities 208 +Utilities 217 +Utilities 220 +Utilities 227 +Utilities 237 +Utilities 248 +Utilities 251 +Utilities 258 +Utilities 268 +Utilities 279 +Utilities 282 +Utilities 289 +Utilities 299 diff --git a/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file2.csv b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file2.csv new file mode 100644 index 00000000..e95eba53 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_in/file2.csv @@ -0,0 +1,305 @@ +Date Description Amount ID +2023-01-01 Grocery Store 52.3 1 +2023-01-02 Pharmacy 12.5 2 +2023-01-03 Gas Station 29.1 3 +2023-01-04 Cinema Tickets 19 4 +2023-01-05 Grocery Store 60.25 5 +2023-01-06 Coffee Shop 4.5 6 +2023-01-07 Cinema Tickets 20 7 +2023-01-08 Book Store 30.4 8 +2023-01-09 Restaurant Dinner 55.8 9 +2023-01-10 Electric Bill 65.35 10 +2023-01-11 Grocery Store 45.1 11 +2023-01-12 Clothing Store 100.2 12 +2023-01-13 Pharmacy 20.3 13 +2023-01-14 Coffee Shop 4.5 14 +2023-01-15 Restaurant Dinner 50 15 +2023-01-16 Gas Station 32.1 16 +2023-01-17 Online Shopping 80 17 +2023-01-18 Water Bill 20.35 18 +2023-01-19 Grocery Store 55.6 19 +2023-01-20 Gas Station 28 20 +2023-01-21 Pharmacy 15.4 21 +2023-01-22 Phone Bill 40 22 +2023-01-23 Cinema Tickets 20 23 +2023-01-24 Coffee Shop 5.5 24 +2023-01-25 Book Purchase 14 25 +2023-01-26 Restaurant Lunch 30 26 +2023-01-27 Public Transport 20 27 +2023-01-28 Grocery Store 58.25 28 +2023-01-29 Online Shopping 70 29 +2023-01-30 Grocery Store 62.1 30 +2023-01-31 Medical Prescription 10.4 31 +2023-02-01 Gas Station 33 32 +2023-02-02 Coffee Shop 6 33 +2023-02-03 Cinema Tickets 22 34 +2023-02-04 Book Store 28.4 35 +2023-02-05 Internet Bill 50 36 +2023-02-06 Grocery Store 60.1 37 +2023-02-07 Clothing Store 120 38 +2023-02-08 Grocery Store 58.25 39 +2023-02-09 Coffee Shop 4.5 40 +2023-02-10 Electric Bill 70 41 +2023-02-11 Grocery Store 50.1 42 +2023-02-12 Public Transport 18 43 +2023-02-13 Pharmacy 24 44 +2023-02-14 Restaurant Dinner 60 45 +2023-02-15 Medical Prescription 11.4 46 +2023-02-16 Gas Station 30 47 +2023-02-17 Online Shopping 85 48 +2023-02-18 Water Bill 18 49 +2023-02-19 Grocery Store 53.6 50 
+2023-02-20 Public Transport 22 51 +2023-02-21 Pharmacy 10 52 +2023-02-22 Phone Bill 42 53 +2023-02-23 Cinema Tickets 24 54 +2023-02-24 Coffee Shop 6 55 +2023-02-25 Book Purchase 16 56 +2023-02-26 Restaurant Lunch 28 57 +2023-02-27 Gas Station 34 58 +2023-02-28 Grocery Store 56 59 +2023-03-01 Online Shopping 90 60 +2023-03-02 Dentist Appointment 130 61 +2023-03-03 Grocery Store 63.45 62 +2023-03-04 Cinema Tickets 21 63 +2023-03-05 Coffee Shop 5.8 64 +2023-03-06 Electric Bill 67.5 65 +2023-03-07 Gas Station 31.2 66 +2023-03-08 Restaurant Dinner 58 67 +2023-03-09 Pharmacy 18.3 68 +2023-03-10 Grocery Store 64.7 69 +2023-03-11 Book Store 25.4 70 +2023-03-12 Online Shopping 78 71 +2023-03-13 Coffee Shop 6.5 72 +2023-03-14 Museum Tickets 15 73 +2023-03-15 Internet Bill 52 74 +2023-03-16 Public Transport 19.5 75 +2023-03-17 Clothing Store 105.6 76 +2023-03-18 Phone Bill 41 77 +2023-03-19 Coffee Shop 5 78 +2023-03-20 Grocery Store 59.2 79 +2023-03-21 Gas Station 29.8 80 +2023-03-22 Restaurant Lunch 32 81 +2023-03-23 Pharmacy 16.5 82 +2023-03-24 Concert Tickets 50 83 +2023-03-25 Coffee Shop 5.5 84 +2023-03-26 Grocery Store 61.8 85 +2023-03-27 Online Shopping 82 86 +2023-03-28 Water Bill 19.35 87 +2023-03-29 Public Transport 21 88 +2023-03-30 Book Purchase 17 89 +2023-03-31 Grocery Store 60 90 +2023-04-01 Cinema Tickets 23 91 +2023-04-02 Pharmacy 17.4 92 +2023-04-03 Gas Station 33.5 93 +2023-04-04 Restaurant Dinner 56.7 94 +2023-04-05 Grocery Store 65.3 95 +2023-04-06 Coffee Shop 5.9 96 +2023-04-07 Online Shopping 87 97 +2023-04-08 Electric Bill 69 98 +2023-04-09 Clothing Store 112.5 99 +2023-04-10 Grocery Store 57.4 100 +2023-04-11 Book Store 26.3 101 +2023-04-12 Gas Station 30.9 102 +2023-04-13 Coffee Shop 6.8 103 +2023-04-14 Zoo Tickets 24 104 +2023-04-15 Internet Bill 53 105 +2023-04-16 Public Transport 20.5 106 +2023-04-17 Restaurant Lunch 34 107 +2023-04-18 Phone Bill 43 108 +2023-04-19 Coffee Shop 5.2 109 +2023-04-20 Grocery Store 58.9 110 +2023-04-21 Pharmacy 14.7 111 +2023-04-22 Cinema Tickets 25 112 +2023-04-23 Online Shopping 90 113 +2023-04-24 Gas Station 31.4 114 +2023-04-25 Water Bill 21 115 +2023-04-26 Grocery Store 62.5 116 +2023-04-27 Coffee Shop 5.7 117 +2023-04-28 Book Purchase 18.5 118 +2023-04-29 Public Transport 22 119 +2023-04-30 Grocery Store 63 120 +2023-05-01 Theater Tickets 45 121 +2023-05-02 Dentist Appointment 135 122 +2023-05-03 Gas Station 32.2 123 +2023-05-04 Restaurant Dinner 59 124 +2023-05-05 Grocery Store 66.1 125 +2023-05-06 Coffee Shop 6 126 +2023-05-07 Online Shopping 89 127 +2023-05-08 Electric Bill 70.5 128 +2023-05-09 Clothing Store 110 129 +2023-05-10 Grocery Store 59.7 130 +2023-05-11 Coffee Shop 6.1 131 +2023-05-12 Book Store 29.2 132 +2023-05-13 Gas Station 29.9 133 +2023-05-14 Museum Tickets 16 134 +2023-05-15 Internet Bill 52.5 135 +2023-05-16 Public Transport 21.3 136 +2023-05-17 Restaurant Lunch 35.4 137 +2023-05-18 Phone Bill 43.5 138 +2023-05-19 Grocery Store 64.8 139 +2023-05-20 Pharmacy 15.2 140 +2023-05-21 Cinema Tickets 26 141 +2023-05-22 Coffee Shop 6.3 142 +2023-05-23 Gas Station 30.8 143 +2023-05-24 Online Shopping 92.5 144 +2023-05-25 Water Bill 20.5 145 +2023-05-26 Grocery Store 61.9 146 +2023-05-27 Public Transport 23 147 +2023-05-28 Book Purchase 19 148 +2023-05-29 Coffee Shop 5.9 149 +2023-05-30 Restaurant Dinner 57.8 150 +2023-05-31 Grocery Store 66.7 151 +2023-06-01 Theater Tickets 47 152 +2023-06-02 Dentist Appointment 140 153 +2023-06-03 Gas Station 31.6 154 +2023-06-04 Coffee Shop 6.4 155 +2023-06-05 Online Shopping 94 156 
+2023-06-06 Electric Bill 72 157 +2023-06-07 Restaurant Lunch 36 158 +2023-06-08 Grocery Store 65.3 159 +2023-06-09 Pharmacy 17 160 +2023-06-10 Cinema Tickets 27.5 161 +2023-06-11 Public Transport 21.5 162 +2023-06-12 Book Store 30 163 +2023-06-13 Gas Station 28.7 164 +2023-06-14 Coffee Shop 6.6 165 +2023-06-15 Internet Bill 53.5 166 +2023-06-16 Zoo Tickets 28 167 +2023-06-17 Grocery Store 67.4 168 +2023-06-18 Phone Bill 44 169 +2023-06-19 Restaurant Dinner 60 170 +2023-06-20 Coffee Shop 6.7 171 +2023-06-21 Public Transport 22.5 172 +2023-06-22 Online Shopping 96 173 +2023-06-23 Gas Station 32.4 174 +2023-06-24 Cinema Tickets 29 175 +2023-06-25 Book Purchase 20 176 +2023-06-26 Grocery Store 68.3 177 +2023-06-27 Water Bill 22 178 +2023-06-28 Pharmacy 18.5 179 +2023-06-29 Restaurant Lunch 37 180 +2023-06-30 Coffee Shop 7 181 +2023-07-01 Grocery Store 69.5 182 +2023-07-02 Theater Tickets 49 183 +2023-07-03 Gas Station 33.2 184 +2023-07-04 Park Picnic 40 185 +2023-07-05 Electric Bill 73.5 186 +2023-07-06 Clothing Store 120 187 +2023-07-07 Online Shopping 98 188 +2023-07-08 Grocery Store 70.6 189 +2023-07-09 Coffee Shop 7.1 190 +2023-07-10 Internet Bill 54 191 +2023-07-11 Public Transport 23.5 192 +2023-07-12 Museum Tickets 18 193 +2023-07-13 Book Store 31 194 +2023-07-14 Gas Station 29.9 195 +2023-07-15 Coffee Shop 7.2 196 +2023-07-16 Restaurant Dinner 62 197 +2023-07-17 Grocery Store 71.8 198 +2023-07-18 Phone Bill 45 199 +2023-07-19 Zoo Tickets 30 200 +2023-07-20 Coffee Shop 7.3 201 +2023-07-21 Public Transport 24 202 +2023-07-22 Online Shopping 99.5 203 +2023-07-23 Gas Station 34 204 +2023-07-24 Cinema Tickets 31 205 +2023-07-25 Book Purchase 21.5 206 +2023-07-26 Grocery Store 72.9 207 +2023-07-27 Water Bill 23.5 208 +2023-07-28 Pharmacy 19.5 209 +2023-07-29 Restaurant Lunch 38.5 210 +2023-07-30 Coffee Shop 7.4 211 +2023-07-31 Grocery Store 73.7 212 +2023-08-01 Theater Tickets 50 213 +2023-08-02 Gas Station 34.5 214 +2023-08-03 Restaurant Dinner 63.5 215 +2023-08-04 Online Shopping 101 216 +2023-08-05 Electric Bill 75 217 +2023-08-06 Grocery Store 74.6 218 +2023-08-07 Coffee Shop 7.5 219 +2023-08-08 Phone Bill 46 220 +2023-08-09 Public Transport 24.5 221 +2023-08-10 Cinema Tickets 32.5 222 +2023-08-11 Book Store 32 223 +2023-08-12 Gas Station 35 224 +2023-08-13 Coffee Shop 7.6 225 +2023-08-14 Park Picnic 42 226 +2023-08-15 Internet Bill 55 227 +2023-08-16 Grocery Store 76.3 228 +2023-08-17 Clothing Store 125 229 +2023-08-18 Pharmacy 20.5 230 +2023-08-19 Restaurant Lunch 40 231 +2023-08-20 Coffee Shop 7.7 232 +2023-08-21 Museum Tickets 19 233 +2023-08-22 Public Transport 25 234 +2023-08-23 Online Shopping 103 235 +2023-08-24 Grocery Store 77.8 236 +2023-08-25 Water Bill 24.5 237 +2023-08-26 Zoo Tickets 32 238 +2023-08-27 Coffee Shop 7.8 239 +2023-08-28 Gas Station 35.5 240 +2023-08-29 Book Purchase 23 241 +2023-08-30 Grocery Store 78.9 242 +2023-08-31 Cinema Tickets 34 243 +2023-09-01 Theater Tickets 52 244 +2023-09-02 Gas Station 36 245 +2023-09-03 Restaurant Dinner 65 246 +2023-09-04 Online Shopping 105 247 +2023-09-05 Electric Bill 76.5 248 +2023-09-06 Grocery Store 79.6 249 +2023-09-07 Coffee Shop 8 250 +2023-09-08 Phone Bill 47 251 +2023-09-09 Public Transport 26 252 +2023-09-10 Cinema Tickets 35.5 253 +2023-09-11 Book Store 33 254 +2023-09-12 Gas Station 36.5 255 +2023-09-13 Coffee Shop 8.2 256 +2023-09-14 Park Picnic 44 257 +2023-09-15 Internet Bill 56 258 +2023-09-16 Grocery Store 80.4 259 +2023-09-17 Clothing Store 130 260 +2023-09-18 Pharmacy 21.5 261 +2023-09-19 Restaurant Lunch 
41.5 262 +2023-09-20 Coffee Shop 8.4 263 +2023-09-21 Museum Tickets 20 264 +2023-09-22 Public Transport 26.5 265 +2023-09-23 Online Shopping 107 266 +2023-09-24 Grocery Store 81.3 267 +2023-09-25 Water Bill 25.5 268 +2023-09-26 Zoo Tickets 33.5 269 +2023-09-27 Coffee Shop 8.6 270 +2023-09-28 Gas Station 37.5 271 +2023-09-29 Book Purchase 24.5 272 +2023-09-30 Grocery Store 82.7 273 +2023-10-01 Cinema Tickets 36 274 +2023-10-02 Theater Tickets 54 275 +2023-10-03 Gas Station 38 276 +2023-10-04 Restaurant Dinner 66.5 277 +2023-10-05 Online Shopping 109 278 +2023-10-06 Electric Bill 78 279 +2023-10-07 Grocery Store 83.9 280 +2023-10-08 Coffee Shop 8.8 281 +2023-10-09 Phone Bill 48 282 +2023-10-10 Public Transport 27.5 283 +2023-10-11 Cinema Tickets 37.5 284 +2023-10-12 Book Store 34.5 285 +2023-10-13 Gas Station 39.5 286 +2023-10-14 Coffee Shop 9 287 +2023-10-15 Park Picnic 46 288 +2023-10-16 Internet Bill 57.5 289 +2023-10-17 Grocery Store 85.2 290 +2023-10-18 Clothing Store 135 291 +2023-10-19 Pharmacy 22.5 292 +2023-10-20 Restaurant Lunch 43 293 +2023-10-21 Coffee Shop 9.2 294 +2023-10-22 Museum Tickets 21.5 295 +2023-10-23 Public Transport 28 296 +2023-10-24 Online Shopping 111 297 +2023-10-25 Grocery Store 86.5 298 +2023-10-26 Water Bill 26.5 299 +2023-10-27 Zoo Tickets 35 300 +2023-10-28 Coffee Shop 9.4 301 +2023-10-29 Gas Station 40.5 302 +2023-10-30 Book Purchase 26 303 +2023-10-31 Grocery Store 88 304 diff --git a/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_out/output.txt b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_out/output.txt new file mode 100644 index 00000000..e4869438 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/artifacts_out/output.txt @@ -0,0 +1 @@ +1861.55 diff --git a/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/data.json new file mode 100644 index 00000000..9f83f3a2 --- /dev/null +++ b/benchmark/agbenchmark/challenges/verticals/data/6_answer_question_combine_csv/data.json @@ -0,0 +1,33 @@ +{ + "category": [ + "data", + "general" + ], + "cutoff": 120, + "dependencies": [ + "TestAnswerQuestionCsv", + "TestCombineCsv" + ], + "eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "AnswerQuestionCombineCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
+} diff --git a/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_in/input.csv b/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_in/input.csv deleted file mode 100644 index a52510f1..00000000 --- a/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_in/input.csv +++ /dev/null @@ -1,5 +0,0 @@ -id,name,timestamp -3,Alice,2023-09-25 14:10:00 -1,Bob,2023-09-24 12:05:00 -2,Charlie,2023-09-24 12:10:00 -4,David,2023-09-26 16:20:00 diff --git a/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_out/output.csv b/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_out/output.csv deleted file mode 100644 index 6cac7733..00000000 --- a/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/artifacts_out/output.csv +++ /dev/null @@ -1,5 +0,0 @@ -id,name,timestamp -1,Bob,2023-09-24 12:05:00 -2,Charlie,2023-09-24 12:10:00 -3,Alice,2023-09-25 14:10:00 -4,David,2023-09-26 16:20:00 diff --git a/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/data.json b/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/data.json deleted file mode 100644 index 8515af89..00000000 --- a/benchmark/agbenchmark/challenges/verticals/generalist/1_sort_csv/data.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "category": [ - "data" - ], - "cutoff": 60, - "dependencies": [ - "TestReadFile" - ], - "eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15", - "ground": { - "answer": "The csv sorted by date", - "eval": { - "type": "file" - }, - "files": [ - "output.csv" - ], - "should_contain": [ - "id,name,timestamp\n1,Bob,2023-09-24 12:05:00\n2,Charlie,2023-09-24 12:10:00\n3,Alice,2023-09-25 14:10:00\n4,David,2023-09-26 16:20:00" - ] - }, - "info": { - "description": "Tests if the agent can sort a csv", - "difficulty": "basic", - "side_effects": [ - "" - ] - }, - "name": "SortCsv", - "task": "Sort the input.csv by the 'timestamp' column and write the new csv in the output.csv file. The order of the columns should be preserved." 
-} diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/1_search/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/1_search/data.json index 884a583e..2c52ddb6 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/1_search/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/1_search/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43", + "eval_id": "0bb23182-b434-402b-a73e-9c226469b959", "ground": { "answer": "This is a Heading\nThis is a paragraph.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/2_book_price/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/2_book_price/data.json index 328d52e7..53f14a07 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/2_book_price/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/2_book_price/data.json @@ -7,7 +7,7 @@ "dependencies": [ "TestSearch" ], - "eval_id": "525001ed-8b45-4405-9e56-ce4423314294", + "eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae", "ground": { "answer": "\u00a325.89", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval_2/artifacts_out/random_file.txt b/benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval/artifacts_out/random_file.txt similarity index 100% rename from benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval_2/artifacts_out/random_file.txt rename to benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval/artifacts_out/random_file.txt diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval_2/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval/data.json similarity index 88% rename from benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval_2/data.json rename to benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval/data.json index 358ad96b..6e397da5 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval_2/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/3_revenue_retrieval/data.json @@ -1,12 +1,13 @@ { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 60, "dependencies": [ "TestBasicRetrieval" ], - "eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d", + "eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f", "ground": { "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json index e2b55f8f..0a9aec55 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json @@ -6,7 +6,7 @@ "dependencies": [ "TestRevenueRetrieval" ], - "eval_id": "a0a27778-aec1-4b37-8fc2-92feedffd3fb", + "eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1", "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json index e02c489d..bec0b9c8 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json @@ -1,12 +1,13 @@ { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 60, "dependencies": [ "TestRevenueRetrieval2" ], - "eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05", + "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", "ground": { "answer": "The twitter handles of the two hosts of Latent Space.", "eval": { diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json index 94fd3ba2..4b6c7073 100644 --- a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json +++ b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json @@ -1,12 +1,13 @@ { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 240, "dependencies": [ "TestReadFile" ], - "eval_id": "76e4c56c-8d57-423e-9cc1-1fff5f58dee6", + "eval_id": "895ae28a-4513-44ea-a872-0164771d1597", "ground": { "answer": "A report highlighting elements from the 2 files.", "eval": { diff --git a/benchmark/frontend/public/graph.json b/benchmark/frontend/public/graph.json index 27833803..7d4e432c 100644 --- a/benchmark/frontend/public/graph.json +++ b/benchmark/frontend/public/graph.json @@ -12,6 +12,12 @@ "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", @@ -78,24 +84,42 @@ "id": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]", "to": 
"agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]" }, - { - "arrows": "to", - "from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", - "id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", - "to": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]" - }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", @@ -117,7 +141,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { @@ -155,7 +179,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { @@ -187,13 +211,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 150, "dependencies": [ "TestUrlShortener" ], - "eval_id": "54c3d7e9-71d6-476b-b045-cf0aaf118f95", 
+ "eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0", "ground": { "answer": "The correct python file for a TicTacToe game is written", "eval": { @@ -227,7 +252,7 @@ "dependencies": [ "TestThreeSum" ], - "eval_id": "0823b577-64f2-477b-856d-16726fe464b0", + "eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f", "ground": { "answer": "password_generator.py is created and satisfies the requirements.", "eval": { @@ -255,13 +280,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestPasswordGenerator" ], - "eval_id": "6ace62be-6c18-431a-947f-72fb20984b58", + "eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e", "ground": { "answer": "The correct python file is written and organizes the files accordingly", "eval": { @@ -289,13 +315,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 60, "dependencies": [ "TestWriteFile" ], - "eval_id": "29a10990-2584-4602-8b9d-c217f6edbc4f", + "eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4", "ground": { "answer": "The three_sum function coded properly.", "eval": { @@ -327,14 +354,15 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestTicTacToe", "TestReadFile" ], - "eval_id": "5a32418d-1c3a-4af1-8dc4-8d4c29bed21a", + "eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1", "ground": { "answer": "The implementation of battleship that passes all the tests.", "eval": { @@ -366,7 +394,7 @@ "dependencies": [ "TestFileOrganizer" ], - "eval_id": "41ca1035-ceca-4e0c-91ab-66ed0b350273", + "eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c", "ground": { "answer": "The correct python file for a basic url shortener CLI", "eval": { @@ -401,7 +429,7 @@ "dependencies": [ "TestSearch" ], - "eval_id": "525001ed-8b45-4405-9e56-ce4423314294", + "eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae", "ground": { "answer": "\u00a325.89", "eval": { @@ -431,13 +459,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 60, "dependencies": [ "TestRevenueRetrieval2" ], - "eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05", + "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", "ground": { "answer": "The twitter handles of the two hosts of Latent Space.", "eval": { @@ -476,7 +505,7 @@ "dependencies": [ "TestRevenueRetrieval" ], - "eval_id": "a0a27778-aec1-4b37-8fc2-92feedffd3fb", + "eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1", "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "eval": { @@ -518,6 +547,43 @@ "label": "RevenueRetrieval2", "shape": "dot" }, + { + "color": "grey", + "data": { + "category": [ + "scrape_synthesize", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestBasicRetrieval" + ], + "eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f", + "ground": { + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "eval": { + "type": "file" + }, + "files": [ + ".txt" + ], + "should_contain": [ + "81,462" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can retrieve Tesla's revenue in 2022.", + "difficulty": "intermediate", + "side_effects": [] + }, + "name": "TestRevenueRetrieval", + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + }, + "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", + "label": "RevenueRetrieval", + "shape": "dot" + }, { "color": "grey", "data": { @@ -529,7 +595,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43", + "eval_id": "0bb23182-b434-402b-a73e-9c226469b959", "ground": { "answer": "This is a Heading\nThis is a paragraph.", "eval": { @@ -565,49 +631,90 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "data" ], - "cutoff": 60, + "cutoff": 90, "dependencies": [ - "TestBasicRetrieval" + "TestAnswerQuestionSmallCsv" ], - "eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d", + "eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732", "ground": { - "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "answer": "The correct amount spent on utilities.", "eval": { "type": "file" }, "files": [ - ".txt" + "output.txt" ], "should_contain": [ - "81,462" - ], - "should_not_contain": [] + "1861" + ] }, "info": { - "description": "Tests if the agent can retrieve Tesla's revenue in 2022.", + "description": "Tests if the agent can answer a question from a csv", "difficulty": "intermediate", - "side_effects": [] + "side_effects": [ + "" + ] }, - "name": "TestRevenueRetrieval", - "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + "name": "TestAnswerQuestionCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." }, - "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", - "label": "RevenueRetrieval", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCsv", "shape": "dot" }, { "color": "grey", "data": { "category": [ - "data" + "data", + "general" + ], + "cutoff": 120, + "dependencies": [ + "TestAnswerQuestionCsv", + "TestCombineCsv" + ], + "eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionCombineCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
+ }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCombineCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestReadFile" ], - "eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15", + "eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95", "ground": { "answer": "The csv sorted by date", "eval": { @@ -638,13 +745,52 @@ "color": "grey", "data": { "category": [ - "data" + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "84" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a small csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionSmallCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." + }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "label": "AnswerQuestionSmallCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestLabelCsv" ], - "eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375", + "eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b", "ground": { "answer": "The csv data is combined", "eval": { @@ -681,7 +827,7 @@ "dependencies": [ "TestSortCsv" ], - "eval_id": "6c58e229-aa22-4c4f-a053-4a78931ad41e", + "eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac", "ground": { "answer": "The csv labelled", "eval": { @@ -712,13 +858,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 240, "dependencies": [ "TestReadFile" ], - "eval_id": "76e4c56c-8d57-423e-9cc1-1fff5f58dee6", + "eval_id": "895ae28a-4513-44ea-a872-0164771d1597", "ground": { "answer": "A report highlighting elements from the 2 files.", "eval": { diff --git a/benchmark/poetry.lock b/benchmark/poetry.lock index e60acb3a..dc2cdaca 100644 --- a/benchmark/poetry.lock +++ b/benchmark/poetry.lock @@ -295,75 +295,63 @@ files = [ [[package]] name = "cffi" -version = "1.15.1" +version = "1.16.0" description = "Foreign Function Interface for Python calling C code." 
optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, - {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, - {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, - {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, - {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, - {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, - {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, - {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, - {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, - {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, - {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, - {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, - {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, - {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, - {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, - {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, - {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, - {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, - {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, + {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"}, + {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"}, + {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"}, + {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"}, + {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"}, + {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"}, + {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"}, + {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"}, + {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"}, + {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"}, + {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"}, + {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"}, + {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"}, + {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"}, + {file = 
"cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"}, + {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"}, + {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"}, + {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"}, + {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"}, + {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"}, + {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"}, + {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"}, + {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"}, + {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"}, + {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"}, + {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"}, + 
{file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"}, + {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"}, ] [package.dependencies] @@ -620,15 +608,19 @@ test-no-images = ["pytest", "pytest-cov", "wurlitzer"] [[package]] name = "cycler" -version = "0.11.0" +version = "0.12.0" description = "Composable style cycles" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"}, - {file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"}, + {file = "cycler-0.12.0-py3-none-any.whl", hash = "sha256:7896994252d006771357777d0251f3e34d266f4fa5f2c572247a80ab01440947"}, + {file = "cycler-0.12.0.tar.gz", hash = "sha256:8cc3a7b4861f91b1095157f9916f748549a617046e67eb7619abed9b34d2c94a"}, ] +[package.extras] +docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] +tests = ["pytest", "pytest-cov", "pytest-xdist"] + [[package]] name = "decorator" version = "5.1.1" @@ -890,20 +882,19 @@ test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mypy", "pre-commit" [[package]] name = "google-auth" -version = "2.23.1" +version = "2.23.2" description = "Google Authentication Library" optional = false python-versions = ">=3.7" files = [ - {file = "google-auth-2.23.1.tar.gz", hash = "sha256:d38bdf4fa1e7c5a35e574861bce55784fd08afadb4e48f99f284f1e487ce702d"}, - {file = "google_auth-2.23.1-py2.py3-none-any.whl", hash = "sha256:9800802266366a2a87890fb2d04923fc0c0d4368af0b86db18edd94a62386ea1"}, + {file = "google-auth-2.23.2.tar.gz", hash = "sha256:5a9af4be520ba33651471a0264eead312521566f44631cbb621164bc30c8fd40"}, + {file = "google_auth-2.23.2-py2.py3-none-any.whl", hash = "sha256:c2e253347579d483004f17c3bd0bf92e611ef6c7ba24d41c5c59f2e7aeeaf088"}, ] [package.dependencies] cachetools = ">=2.0.0,<6.0" pyasn1-modules = ">=0.2.1" rsa = ">=3.1.4,<5" -urllib3 = ">=2.0.5" [package.extras] aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "requests (>=2.20.0,<3.0.0.dev0)"] @@ -2765,13 +2756,13 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess [[package]] name = "wcwidth" -version = "0.2.6" +version = "0.2.7" description = "Measures the displayed width of unicode strings in a terminal" optional = false python-versions = "*" files = [ - {file = "wcwidth-0.2.6-py2.py3-none-any.whl", hash = "sha256:795b138f6875577cd91bba52baf9e445cd5118fd32723b460e30a0af30ea230e"}, - {file = "wcwidth-0.2.6.tar.gz", hash = "sha256:a5220780a404dbe3353789870978e472cfe477761f06ee55077256e509b156d0"}, + {file = "wcwidth-0.2.7-py2.py3-none-any.whl", hash = "sha256:fabf3e32999d9b0dab7d19d845149f326f04fe29bac67709ee071dbd92640a36"}, + {file = "wcwidth-0.2.7.tar.gz", hash = "sha256:1b6d30a98ddd5ce9bbdb33658191fd2423fc9da203fe3ef1855407dcb7ee4e26"}, ] [[package]] diff --git a/benchmark/tests/test_benchmark_workflow.py b/benchmark/tests/test_benchmark_workflow.py index 92fbdbbd..700d42a8 100644 --- a/benchmark/tests/test_benchmark_workflow.py +++ b/benchmark/tests/test_benchmark_workflow.py @@ -12,14 +12,14 @@ import time "eval_id, input_text, expected_artifact_length, test_name, should_be_successful", [ ( - "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "f219f3d3-a41b-45a9-a3d0-389832086ee8", "Write the word 'Washington' to a .txt file", 0, "WriteFile", True, ), ( - 
"261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "Read the file called file_to_read.txt and write its content to a file called output.txt", 1, "ReadFile", diff --git a/frontend/assets/coding_tree_structure.json b/frontend/assets/coding_tree_structure.json index 371bae30..54972b46 100644 --- a/frontend/assets/coding_tree_structure.json +++ b/frontend/assets/coding_tree_structure.json @@ -63,7 +63,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { @@ -101,7 +101,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { @@ -133,13 +133,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 150, "dependencies": [ "TestUrlShortener" ], - "eval_id": "54c3d7e9-71d6-476b-b045-cf0aaf118f95", + "eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0", "ground": { "answer": "The correct python file for a TicTacToe game is written", "eval": { @@ -173,7 +174,7 @@ "dependencies": [ "TestFileOrganizer" ], - "eval_id": "41ca1035-ceca-4e0c-91ab-66ed0b350273", + "eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c", "ground": { "answer": "The correct python file for a basic url shortener CLI", "eval": { @@ -201,13 +202,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestPasswordGenerator" ], - "eval_id": "6ace62be-6c18-431a-947f-72fb20984b58", + "eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e", "ground": { "answer": "The correct python file is written and organizes the files accordingly", "eval": { @@ -241,7 +243,7 @@ "dependencies": [ "TestThreeSum" ], - "eval_id": "0823b577-64f2-477b-856d-16726fe464b0", + "eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f", "ground": { "answer": "password_generator.py is created and satisfies the requirements.", "eval": { @@ -269,13 +271,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 60, "dependencies": [ "TestWriteFile" ], - "eval_id": "29a10990-2584-4602-8b9d-c217f6edbc4f", + "eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4", "ground": { "answer": "The three_sum function coded properly.", "eval": { @@ -307,14 +310,15 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestTicTacToe", "TestReadFile" ], - "eval_id": "5a32418d-1c3a-4af1-8dc4-8d4c29bed21a", + "eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1", "ground": { "answer": "The implementation of battleship that passes all the tests.", "eval": { diff --git a/frontend/assets/data_tree_structure.json b/frontend/assets/data_tree_structure.json index bd11dd5b..e48905a8 100644 --- a/frontend/assets/data_tree_structure.json +++ b/frontend/assets/data_tree_structure.json @@ -6,11 +6,29 @@ "id": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "id": 
"agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", - "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", - "to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]" + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" }, { "arrows": "to", @@ -23,6 +41,12 @@ "from": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]" } ], "nodes": [ @@ -39,7 +63,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { @@ -77,7 +101,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { @@ -111,48 +135,126 @@ "category": [ "data" ], - "cutoff": 60, + "cutoff": 90, "dependencies": [ - "TestReadFile" + "TestAnswerQuestionSmallCsv" ], - "eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15", + "eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732", "ground": { - "answer": "The csv sorted by date", + "answer": "The correct amount spent on utilities.", "eval": { "type": "file" }, "files": [ - "output.csv" + "output.txt" ], "should_contain": [ - "id,name,timestamp\n1,Bob,2023-09-24 12:05:00\n2,Charlie,2023-09-24 12:10:00\n3,Alice,2023-09-25 14:10:00\n4,David,2023-09-26 16:20:00" + "1861" ] }, "info": { - "description": 
"Tests if the agent can sort a csv", - "difficulty": "basic", + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", "side_effects": [ "" ] }, - "name": "TestSortCsv", - "task": "Sort the input.csv by the 'timestamp' column and write the new csv in the output.csv file. The order of the columns should be preserved." + "name": "TestAnswerQuestionCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." }, - "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", - "label": "SortCsv", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCsv", "shape": "dot" }, { "color": "grey", "data": { "category": [ - "data" + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "84" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a small csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionSmallCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." + }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "label": "AnswerQuestionSmallCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" + ], + "cutoff": 120, + "dependencies": [ + "TestAnswerQuestionCsv", + "TestCombineCsv" + ], + "eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionCombineCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
+ }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCombineCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestLabelCsv" ], - "eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375", + "eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b", "ground": { "answer": "The csv data is combined", "eval": { @@ -189,7 +291,7 @@ "dependencies": [ "TestSortCsv" ], - "eval_id": "6c58e229-aa22-4c4f-a053-4a78931ad41e", + "eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac", "ground": { "answer": "The csv labelled", "eval": { @@ -215,6 +317,44 @@ "id": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", "label": "LabelCsv", "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95", + "ground": { + "answer": "The csv sorted by date", + "eval": { + "type": "file" + }, + "files": [ + "output.csv" + ], + "should_contain": [ + "id,name,timestamp\n1,Bob,2023-09-24 12:05:00\n2,Charlie,2023-09-24 12:10:00\n3,Alice,2023-09-25 14:10:00\n4,David,2023-09-26 16:20:00" + ] + }, + "info": { + "description": "Tests if the agent can sort a csv", + "difficulty": "basic", + "side_effects": [ + "" + ] + }, + "name": "TestSortCsv", + "task": "Sort the input.csv by the 'timestamp' column and write the new csv in the output.csv file. The order of the columns should be preserved." + }, + "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", + "label": "SortCsv", + "shape": "dot" } ] } diff --git a/frontend/assets/general_tree_structure.json b/frontend/assets/general_tree_structure.json index 8c331555..85cdc2ec 100644 --- a/frontend/assets/general_tree_structure.json +++ b/frontend/assets/general_tree_structure.json @@ -6,6 +6,48 @@ "id": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestUrlShortener::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestUrlShortener::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestTicTacToe::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestTicTacToe::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestFileOrganizer::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestFileOrganizer::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestUrlShortener::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestUrlShortener::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestPasswordGenerator::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestPasswordGenerator::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestFileOrganizer::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestFileOrganizer::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0]", + "id": 
"agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestPasswordGenerator::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestPasswordGenerator::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBattleship::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestBattleship::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestTicTacToe::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestTicTacToe::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBattleship::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestBattleship::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", @@ -17,6 +59,72 @@ "from": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "id": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "id": 
"agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSynthesizeInfo::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestSynthesizeInfo::test_method[challenge_data0]" } ], "nodes": [ @@ -33,7 +141,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { @@ -71,7 +179,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { @@ -99,6 +207,217 @@ "label": "WriteFile", "shape": "dot" }, + { + "color": "grey", + "data": { + "category": [ + "coding", + "general" + ], + "cutoff": 150, + "dependencies": [ + "TestUrlShortener" + ], + "eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0", + "ground": { + "answer": "The correct python file for a TicTacToe game is written", + "eval": { + "type": "python" + }, + "files": [ + "test.py" + ], + "should_contain": [], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can create Tic-Tac-Toe game", + 
"difficulty": "basic", + "side_effects": [] + }, + "name": "TestTicTacToe", + "task": "Build a Tic-Tac-Toe game using a python CLI. Here are the specifications.\n\nThe Grid: The game board is a 3x3 grid, consisting of 3 rows and 3 columns, creating a total of 9 squares.\n\nPlayers: There are two players. One player uses the number \"1\", and the other player uses the number \"2\".\n\nTaking Turns: Players take turns to put their respective numbers (\"1\" or \"2\") in an empty square of the grid. Once a player has placed their number in a square, it cannot be changed or removed.\n\nObjective: The goal is to get three of your numbers in a row, either horizontally, vertically, or diagonally.\n\nEnd of the Game: The game concludes in one of two ways: One player gets three of their numbers in a row (horizontally, vertically, or diagonally) and is declared the winner.\nAll squares on the grid are filled, and no player has three in a row. This situation is a \"draw\" or a \"tie\".\n\nTechnical specifications:\nBuild a file called tic_tac_toe.py. This file will be called through command lines. You will have to prompt users for their move. Player 1 will always start.\nPlayers will input their move in the following format: \"x,y\" where x and y represent the location in the grid (0,0 is top left, 2,2 is bottom right).\n\nYour primary requirement is to halt the game when appropriate and to print only one of these three exact sentences:\n\n\"Player 1 won!\"\n\"Player 2 won!\"\n\"Draw\"\n\nEdge cases: A player can send an incorrect location. Either the location is incorrect or the square is already filled. In this case, this counts as doing nothing, and the player gets prompted for new locations again.\n\n\nYou will be expected to create a python file called tic_tac_toe.py that will run through command lines by using ```python tic_tac_toe.py```.\n\nHere is an example of how your tic_tac_toe.py game will be tested.\n```\nprocess = subprocess.Popen(\n ['python', 'tic_tac_toe.py'],\n stdout=subprocess.PIPE,\n text=True\n)\n\noutput, _ = process.communicate('\\n'.join([\"0,0\", \"1,0\", \"0,1\", \"1,1\", \"0,2\"]))\n\nassert \"Player 1 won!\" in output\n```" + }, + "id": "agbenchmark/generate_test.py::TestTicTacToe::test_method[challenge_data0]", + "label": "TicTacToe", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "coding" + ], + "cutoff": 150, + "dependencies": [ + "TestFileOrganizer" + ], + "eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c", + "ground": { + "answer": "The correct python file for a basic url shortener CLI", + "eval": { + "type": "python" + }, + "files": [ + "test.py" + ], + "should_contain": [], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can create a URL shortener.", + "difficulty": "basic", + "side_effects": [] + }, + "name": "TestUrlShortener", + "task": "Build a basic URL shortener using a python CLI. Here are the specifications.\n\nFunctionality: The program should have two primary functionalities.\n\nShorten a given URL.\nRetrieve the original URL from a shortened URL.\n\nCLI: The command-line interface should accept a URL as its first input. It should be able to determine if the url is a shortened url or not. If the url is not shortened, it will display ONLY the shortened url, otherwise, it will display ONLY the original unshortened URL. Afterwards, it should prompt the user for another URL to process.\n\nTechnical specifications:\nBuild a file called url_shortener.py. 
This file will be called through command lines.\n\nEdge cases:\nFor the sake of simplicity, there will be no edge cases, you can assume the input is always correct and the user immediately passes the shortened version of the url he just shortened.\n\nYou will be expected to create a python file called url_shortener.py that will run through command lines by using python url_shortener.py.\n\nThe url_shortener.py will be tested this way:\n```\nimport unittest\nfrom url_shortener import shorten_url, retrieve_url\n\nclass TestURLShortener(unittest.TestCase):\n def test_url_retrieval(self):\n # Shorten the URL to get its shortened form\n shortened_url = shorten_url('https://www.example.com')\n\n # Retrieve the original URL using the shortened URL directly\n retrieved_url = retrieve_url(shortened_url)\n\n self.assertEqual(retrieved_url, 'https://www.example.com', \"Retrieved URL does not match the original!\")\n\nif __name__ == \"__main__\":\n unittest.main()\n```" + }, + "id": "agbenchmark/generate_test.py::TestUrlShortener::test_method[challenge_data0]", + "label": "UrlShortener", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "coding", + "general" + ], + "cutoff": 90, + "dependencies": [ + "TestPasswordGenerator" + ], + "eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e", + "ground": { + "answer": "The correct python file is written and organizes the files accordingly", + "eval": { + "type": "python" + }, + "files": [ + "test.py" + ], + "should_contain": [], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can create a file organizer.", + "difficulty": "basic", + "side_effects": [] + }, + "name": "TestFileOrganizer", + "task": "Create a file organizer CLI tool in Python that sorts files in a directory based on their file types (e.g., images, documents, audio) and moves them into these corresponding folders: 'images', 'documents', 'audio'. The entry point will be a python file that can be run this way: python organize_files.py --directory_path=YOUR_DIRECTORY_PATH" + }, + "id": "agbenchmark/generate_test.py::TestFileOrganizer::test_method[challenge_data0]", + "label": "FileOrganizer", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "coding" + ], + "cutoff": 90, + "dependencies": [ + "TestThreeSum" + ], + "eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f", + "ground": { + "answer": "password_generator.py is created and satisfies the requirements.", + "eval": { + "type": "python" + }, + "files": [ + "test.py" + ], + "should_contain": [], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can create a random password generator.", + "difficulty": "basic", + "side_effects": [] + }, + "name": "TestPasswordGenerator", + "task": "Create a random password generator. The password should have between 8 and 16 characters and should contain letters, numbers and symbols. The password should be printed to the console. The entry point will be a python file that can be run this way: python password_generator.py [--len x] where x is the length of the password. If no length is specified, the password should be 8 characters long. The password_generator can also be imported as a module and called as password = password_generator.generate_password(len=x). Any invalid input should raise a ValueError." 
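As an illustration of the PasswordGenerator task just described, a minimal sketch of a conforming `password_generator.py` could look like the following. The exact character mix beyond "letters, numbers and symbols" and the validation rules beyond the 8-16 length bound are assumptions, not part of the challenge data.

```python
import argparse
import random
import string


def generate_password(len: int = 8) -> str:
    # The task spells out the keyword name `len`, even though it shadows the builtin.
    if not isinstance(len, int) or not 8 <= len <= 16:
        raise ValueError("len must be an integer between 8 and 16")
    pools = [string.ascii_letters, string.digits, string.punctuation]
    # Guarantee at least one letter, one digit and one symbol, then fill the rest randomly.
    chars = [random.choice(pool) for pool in pools]
    chars += [random.choice("".join(pools)) for _ in range(len - 3)]
    random.shuffle(chars)
    return "".join(chars)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--len", type=int, default=8)
    print(generate_password(parser.parse_args().len))
```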
+ }, + "id": "agbenchmark/generate_test.py::TestPasswordGenerator::test_method[challenge_data0]", + "label": "PasswordGenerator", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "coding", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestWriteFile" + ], + "eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4", + "ground": { + "answer": "The three_sum function coded properly.", + "eval": { + "type": "python" + }, + "files": [ + "test.py" + ], + "should_contain": [ + "[0, 1, 2]", + "[0, 2, 5]", + "[0, 2, 3]" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can create the three_sum function.", + "difficulty": "basic", + "side_effects": [] + }, + "name": "TestThreeSum", + "task": "Create a three_sum function in a file called sample_code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2]." + }, + "id": "agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0]", + "label": "ThreeSum", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "coding", + "general" + ], + "cutoff": 90, + "dependencies": [ + "TestTicTacToe", + "TestReadFile" + ], + "eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1", + "ground": { + "answer": "The implementation of battleship that passes all the tests.", + "eval": { + "type": "pytest" + }, + "files": [], + "should_contain": [], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can create a Battleship.", + "difficulty": "expert", + "side_effects": [] + }, + "name": "TestBattleship", + "task": "Build a battleship game\n\nSpecifications:\n\nOverview: Battleship is a two-player strategy game where each player places their fleet of ships on a grid and tries to sink the opponent's fleet by guessing their locations.\nPlayers take turns calling out a row and column, attempting to name a square containing one of the opponent's ships.\n\nThe Grid: Each player's grid is a 10x10 grid, identified by rows (using numbers 1-10) and columns (using letters A-J).\n\nShips:\n\nCarrier - 5 squares\nBattleship - 4 squares\nCruiser - 3 squares\nSubmarine - 3 squares\nDestroyer - 2 squares\nEach ship occupies contiguous squares on the grid, arranged either horizontally or vertically.\n\nSetup:\n\nAt the start of the game, each player places their fleet on their grid. This setup is hidden from the opponent.\nThe game begins with Player 1, followed by Player 2, and so on.\nTaking Turns:\n\nOn a player's turn, they announce a grid square (e.g., \"D5\").\nThe opponent announces whether that square is a \"hit\" (if there's a part of a ship on that square) or \"miss\" (if the square is empty).\nIf a player hits a square occupied by a ship, they get another turn to guess. This continues until they make a miss, at which point their turn ends.\nIf a player hits all the squares occupied by a ship, the opponent must announce the sinking of that specific ship, e.g., \"You sank my Battleship!\"\n\nObjective: The goal is to sink all of your opponent's ships before they sink yours.\n\nEnd of the Game: The game ends when one player has sunk all of the opponent's ships. 
The winner is the player who sinks all the opposing fleet first.\n\nTechnical details:\nIn your root folder you will find an abstract class that defines the public interface of the Battleship class you will have to build:\n```\nfrom abc import ABC, abstractmethod\nfrom typing import Optional\n\nfrom pydantic import BaseModel, validator\n\n\n# Models for the request and response payloads\nclass ShipPlacement(BaseModel):\n ship_type: str\n start: dict # {\"row\": int, \"column\": str}\n direction: str\n\n @validator(\"start\")\n def validate_start(cls, start):\n row, column = start.get(\"row\"), start.get(\"column\")\n\n if not (1 <= row <= 10):\n raise ValueError(\"Row must be between 1 and 10 inclusive.\")\n\n if column not in list(\"ABCDEFGHIJ\"):\n raise ValueError(\"Column must be one of A, B, C, D, E, F, G, H, I, J.\")\n\n return start\n\n\nclass Turn(BaseModel):\n target: dict # {\"row\": int, \"column\": str}\n\n\nclass TurnResponse(BaseModel):\n result: str\n ship_type: Optional[str] # This would be None if the result is a miss\n\n\nclass GameStatus(BaseModel):\n is_game_over: bool\n winner: Optional[str]\n\n\nfrom typing import List\n\n\nclass Game(BaseModel):\n game_id: str\n players: List[str]\n board: dict # This could represent the state of the game board, you might need to flesh this out further\n ships: List[ShipPlacement] # List of ship placements for this game\n turns: List[Turn] # List of turns that have been taken\n\n\nclass AbstractBattleship(ABC):\n SHIP_LENGTHS = {\n \"carrier\": 5,\n \"battleship\": 4,\n \"cruiser\": 3,\n \"submarine\": 3,\n \"destroyer\": 2,\n }\n\n @abstractmethod\n def create_ship_placement(self, game_id: str, placement: ShipPlacement) -> None:\n \"\"\"\n Place a ship on the grid.\n \"\"\"\n pass\n\n @abstractmethod\n def create_turn(self, game_id: str, turn: Turn) -> TurnResponse:\n \"\"\"\n Players take turns to target a grid cell.\n \"\"\"\n pass\n\n @abstractmethod\n def get_game_status(self, game_id: str) -> GameStatus:\n \"\"\"\n Check if the game is over and get the winner if there's one.\n \"\"\"\n pass\n\n @abstractmethod\n def get_winner(self, game_id: str) -> str:\n \"\"\"\n Get the winner of the game.\n \"\"\"\n pass\n\n @abstractmethod\n def get_game(self) -> Game:\n \"\"\"\n Retrieve the state of the game.\n \"\"\"\n pass\n\n @abstractmethod\n def delete_game(self, game_id: str) -> None:\n \"\"\"\n Delete a game given its ID.\n \"\"\"\n pass\n\n @abstractmethod\n def create_game(self, game_id: str) -> None:\n \"\"\"\n Create a new game.\n \"\"\"\n pass\n\n```\nAt any moment you can run ```pytest``` to execute the tests.\nYou have two types of test: \n- positive tests => test the battleship game being used in ideal conditions\n- negative tests => tests the battleship game behaviour when used incorrectly\n\nSuccess criteria:\n- you will need to write a file called battleship.py that implements the abstract Battleship class.\n- this class will have to pass all the tests.\n- you're not allowed to modify any other file than the battleship.py. You can add other files as long as the main entrypoint is the battleship class." 
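For the much smaller ThreeSum challenge listed earlier in this tree, a sketch of the expected `sample_code.py` might look like the following; the brute-force O(n³) search is an illustrative choice for clarity, not a reference solution, since the benchmark only checks the returned indices.

```python
from typing import List, Optional


def three_sum(nums: List[int], target: int) -> Optional[List[int]]:
    """Return indices of three distinct elements of nums that sum to target."""
    n = len(nums)
    for i in range(n):
        for j in range(i + 1, n):
            for k in range(j + 1, n):
                if nums[i] + nums[j] + nums[k] == target:
                    return [i, j, k]
    return None  # the task guarantees exactly one solution, so this should not be reached


if __name__ == "__main__":
    print(three_sum([2, 7, 11, 15], 20))  # -> [0, 1, 2]
```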
+ }, + "id": "agbenchmark/generate_test.py::TestBattleship::test_method[challenge_data0]", + "label": "Battleship", + "shape": "dot" + }, { "color": "grey", "data": { @@ -110,7 +429,7 @@ "dependencies": [ "TestSearch" ], - "eval_id": "525001ed-8b45-4405-9e56-ce4423314294", + "eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae", "ground": { "answer": "\u00a325.89", "eval": { @@ -147,7 +466,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43", + "eval_id": "0bb23182-b434-402b-a73e-9c226469b959", "ground": { "answer": "This is a Heading\nThis is a paragraph.", "eval": { @@ -178,6 +497,401 @@ "id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", "label": "Search", "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "scrape_synthesize", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestRevenueRetrieval2" + ], + "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", + "ground": { + "answer": "The twitter handles of the two hosts of Latent Space.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "swyx", + "FanaHOVA" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can retrieve twitter handles given a vague description.", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestTestGetInformation", + "task": "Write the twitter handle of the two hosts of Latent Space to a file called output.txt" + }, + "id": "agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]", + "label": "TestGetInformation", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "scrape_synthesize" + ], + "cutoff": 60, + "dependencies": [ + "TestRevenueRetrieval" + ], + "eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1", + "ground": { + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "eval": { + "type": "file" + }, + "files": [ + ".txt" + ], + "should_contain": [ + "15", + "112", + "117", + "204", + "413", + "2,014", + "3,198", + "4,046", + "7,000", + "11,759", + "21,461", + "24,578", + "31,536", + "53,823", + "81,462" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can retrieve all the revenues of Tesla since its creation.", + "difficulty": "intermediate", + "side_effects": [ + "tests if there is in fact an LLM attached" + ] + }, + "name": "TestRevenueRetrieval2", + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + }, + "id": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]", + "label": "RevenueRetrieval2", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "scrape_synthesize", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestBasicRetrieval" + ], + "eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f", + "ground": { + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "eval": { + "type": "file" + }, + "files": [ + ".txt" + ], + "should_contain": [ + "81,462" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can retrieve Tesla's revenue in 2022.", + "difficulty": "intermediate", + "side_effects": [] + }, + "name": "TestRevenueRetrieval", + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + }, + "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", + "label": "RevenueRetrieval", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" + ], + "cutoff": 120, + "dependencies": [ + "TestAnswerQuestionCsv", + "TestCombineCsv" + ], + "eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionCombineCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." + }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCombineCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data" + ], + "cutoff": 90, + "dependencies": [ + "TestAnswerQuestionSmallCsv" + ], + "eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." + }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "84" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a small csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionSmallCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
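The AnswerQuestion* challenges introduced above all ask the agent to total the utilities spending found in one or more CSVs. The input artifacts are not shown in this hunk, so the column names used below ("Category" and "Amount") are assumptions; the sketch only illustrates the expected shape of a solution that sums matching rows and writes output.txt.

```python
import csv
import glob

# Assumed schema: each input CSV has 'Category' and 'Amount' columns.
# The real artifacts_in files may differ, so treat this purely as a sketch.
total = 0.0
for path in glob.glob("file*.csv"):
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            if row.get("Category", "").strip().lower() == "utilities":
                total += float(row["Amount"])

with open("output.txt", "w") as f:
    f.write(f"Total spent on utilities: {total:.0f}\n")
```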
+ }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "label": "AnswerQuestionSmallCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestLabelCsv" + ], + "eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b", + "ground": { + "answer": "The csv data is combined", + "eval": { + "type": "file" + }, + "files": [ + "output.csv" + ], + "should_contain": [ + "Age,ID,Name,Occupation,Salary\n28,101,John,Engineer,80000\n34,102,Alice,Doctor,120000\n45,103,Bob,Lawyer,95000" + ] + }, + "info": { + "description": "Tests if the agent can combine data from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestCombineCsv", + "task": "The csvs 'file1.csv' and 'file2.csv' both have a column 'ID'. Combine these 2 csvs using the 'ID' column. Sort the rows by ID in ascending order and the columns alphabetically. Write the output in output.csv" + }, + "id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]", + "label": "CombineCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data" + ], + "cutoff": 60, + "dependencies": [ + "TestSortCsv" + ], + "eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac", + "ground": { + "answer": "The csv labelled", + "eval": { + "type": "file" + }, + "files": [ + "output.csv" + ], + "should_contain": [ + "Item, Color\nBanana, Yellow\nLeaf, Green\nSky, Blue\nSunflower, Yellow\nGrass, Green\nJeans, Blue\nLemon, Yellow\nTree, Green\nOcean, Blue\nDaisy, Yellow\nFern, Green" + ] + }, + "info": { + "description": "Tests if the agent can label data in a csv", + "difficulty": "basic", + "side_effects": [ + "" + ] + }, + "name": "TestLabelCsv", + "task": "The csv 'input.csv' has many items. create a 'Color' column for these items and classify them as either 'blue', 'green', or 'yellow' depending on what the most likely color is. Preserve the order of the rows. The color column should be the second column. Write the output in output.csv" + }, + "id": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", + "label": "LabelCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95", + "ground": { + "answer": "The csv sorted by date", + "eval": { + "type": "file" + }, + "files": [ + "output.csv" + ], + "should_contain": [ + "id,name,timestamp\n1,Bob,2023-09-24 12:05:00\n2,Charlie,2023-09-24 12:10:00\n3,Alice,2023-09-25 14:10:00\n4,David,2023-09-26 16:20:00" + ] + }, + "info": { + "description": "Tests if the agent can sort a csv", + "difficulty": "basic", + "side_effects": [ + "" + ] + }, + "name": "TestSortCsv", + "task": "Sort the input.csv by the 'timestamp' column and write the new csv in the output.csv file. The order of the columns should be preserved." 
+ }, + "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", + "label": "SortCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "scrape_synthesize", + "general" + ], + "cutoff": 240, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "895ae28a-4513-44ea-a872-0164771d1597", + "ground": { + "answer": "A report highlighting elements from the 2 files.", + "eval": { + "scoring": "binary", + "template": "question", + "type": "llm" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "Is the company mentioned in the output actively addressing or capitalizing on the challenges or trends listed?" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can generate content based on the content of 2 files.", + "difficulty": "basic", + "side_effects": [] + }, + "name": "TestSynthesizeInfo", + "task": "Create a brief report or summary highlighting how one or more companies from companies.txt are addressing or capitalizing on challenges or trends from challenges.txt. Write a file called output.txt." + }, + "id": "agbenchmark/generate_test.py::TestSynthesizeInfo::test_method[challenge_data0]", + "label": "SynthesizeInfo", + "shape": "dot" } ] } diff --git a/frontend/assets/scrape_synthesize_tree_structure.json b/frontend/assets/scrape_synthesize_tree_structure.json index 16bacf56..73460ef0 100644 --- a/frontend/assets/scrape_synthesize_tree_structure.json +++ b/frontend/assets/scrape_synthesize_tree_structure.json @@ -57,7 +57,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { @@ -95,7 +95,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { @@ -134,7 +134,7 @@ "dependencies": [ "TestSearch" ], - "eval_id": "525001ed-8b45-4405-9e56-ce4423314294", + "eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae", "ground": { "answer": "\u00a325.89", "eval": { @@ -171,7 +171,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43", + "eval_id": "0bb23182-b434-402b-a73e-9c226469b959", "ground": { "answer": "This is a Heading\nThis is a paragraph.", "eval": { @@ -207,13 +207,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 60, "dependencies": [ "TestRevenueRetrieval2" ], - "eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05", + "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", "ground": { "answer": "The twitter handles of the two hosts of Latent Space.", "eval": { @@ -252,7 +253,7 @@ "dependencies": [ "TestRevenueRetrieval" ], - "eval_id": "a0a27778-aec1-4b37-8fc2-92feedffd3fb", + "eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1", "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "eval": { @@ -298,13 +299,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 60, "dependencies": [ "TestBasicRetrieval" ], - "eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d", + "eval_id": 
"dc2114d7-1597-4c9b-bed0-a97937ad977f", "ground": { "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", "eval": { @@ -334,13 +336,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 240, "dependencies": [ "TestReadFile" ], - "eval_id": "76e4c56c-8d57-423e-9cc1-1fff5f58dee6", + "eval_id": "895ae28a-4513-44ea-a872-0164771d1597", "ground": { "answer": "A report highlighting elements from the 2 files.", "eval": { diff --git a/frontend/assets/tree_structure.json b/frontend/assets/tree_structure.json index 27833803..7d4e432c 100644 --- a/frontend/assets/tree_structure.json +++ b/frontend/assets/tree_structure.json @@ -12,6 +12,12 @@ "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", @@ -78,24 +84,42 @@ "id": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]" }, - { - "arrows": "to", - "from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", - "id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", - "to": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]" - }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]", "to": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]", "id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", "to": 
"agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]" }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]" + }, + { + "arrows": "to", + "from": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]", + "id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]" + }, { "arrows": "to", "from": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]", @@ -117,7 +141,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1", + "eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8", "ground": { "answer": "The content of output.txt should be 'Hello World!'", "eval": { @@ -155,7 +179,7 @@ ], "cutoff": 60, "dependencies": [], - "eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0", + "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123", "ground": { "answer": "The word 'Washington', printed to a .txt file named anything", "eval": { @@ -187,13 +211,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 150, "dependencies": [ "TestUrlShortener" ], - "eval_id": "54c3d7e9-71d6-476b-b045-cf0aaf118f95", + "eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0", "ground": { "answer": "The correct python file for a TicTacToe game is written", "eval": { @@ -227,7 +252,7 @@ "dependencies": [ "TestThreeSum" ], - "eval_id": "0823b577-64f2-477b-856d-16726fe464b0", + "eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f", "ground": { "answer": "password_generator.py is created and satisfies the requirements.", "eval": { @@ -255,13 +280,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestPasswordGenerator" ], - "eval_id": "6ace62be-6c18-431a-947f-72fb20984b58", + "eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e", "ground": { "answer": "The correct python file is written and organizes the files accordingly", "eval": { @@ -289,13 +315,14 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 60, "dependencies": [ "TestWriteFile" ], - "eval_id": "29a10990-2584-4602-8b9d-c217f6edbc4f", + "eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4", "ground": { "answer": "The three_sum function coded properly.", "eval": { @@ -327,14 +354,15 @@ "color": "grey", "data": { "category": [ - "coding" + "coding", + "general" ], "cutoff": 90, "dependencies": [ "TestTicTacToe", "TestReadFile" ], - "eval_id": "5a32418d-1c3a-4af1-8dc4-8d4c29bed21a", + "eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1", "ground": { "answer": "The implementation of battleship that passes all the tests.", "eval": { @@ -366,7 +394,7 @@ "dependencies": [ "TestFileOrganizer" ], - "eval_id": "41ca1035-ceca-4e0c-91ab-66ed0b350273", + "eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c", "ground": { "answer": "The correct python file for a basic url shortener CLI", "eval": { @@ -401,7 +429,7 @@ "dependencies": [ "TestSearch" ], - "eval_id": "525001ed-8b45-4405-9e56-ce4423314294", + "eval_id": 
"cd96e6b2-779d-4a4a-8367-d520023e27ae", "ground": { "answer": "\u00a325.89", "eval": { @@ -431,13 +459,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 60, "dependencies": [ "TestRevenueRetrieval2" ], - "eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05", + "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", "ground": { "answer": "The twitter handles of the two hosts of Latent Space.", "eval": { @@ -476,7 +505,7 @@ "dependencies": [ "TestRevenueRetrieval" ], - "eval_id": "a0a27778-aec1-4b37-8fc2-92feedffd3fb", + "eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1", "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "eval": { @@ -518,6 +547,43 @@ "label": "RevenueRetrieval2", "shape": "dot" }, + { + "color": "grey", + "data": { + "category": [ + "scrape_synthesize", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestBasicRetrieval" + ], + "eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f", + "ground": { + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "eval": { + "type": "file" + }, + "files": [ + ".txt" + ], + "should_contain": [ + "81,462" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests if the agent can retrieve Tesla's revenue in 2022.", + "difficulty": "intermediate", + "side_effects": [] + }, + "name": "TestRevenueRetrieval", + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + }, + "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", + "label": "RevenueRetrieval", + "shape": "dot" + }, { "color": "grey", "data": { @@ -529,7 +595,7 @@ "dependencies": [ "TestWriteFile" ], - "eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43", + "eval_id": "0bb23182-b434-402b-a73e-9c226469b959", "ground": { "answer": "This is a Heading\nThis is a paragraph.", "eval": { @@ -565,49 +631,90 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "data" ], - "cutoff": 60, + "cutoff": 90, "dependencies": [ - "TestBasicRetrieval" + "TestAnswerQuestionSmallCsv" ], - "eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d", + "eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732", "ground": { - "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "answer": "The correct amount spent on utilities.", "eval": { "type": "file" }, "files": [ - ".txt" + "output.txt" ], "should_contain": [ - "81,462" - ], - "should_not_contain": [] + "1861" + ] }, "info": { - "description": "Tests if the agent can retrieve Tesla's revenue in 2022.", + "description": "Tests if the agent can answer a question from a csv", "difficulty": "intermediate", - "side_effects": [] + "side_effects": [ + "" + ] }, - "name": "TestRevenueRetrieval", - "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + "name": "TestAnswerQuestionCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." 
}, - "id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]", - "label": "RevenueRetrieval", + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCsv", "shape": "dot" }, { "color": "grey", "data": { "category": [ - "data" + "data", + "general" + ], + "cutoff": 120, + "dependencies": [ + "TestAnswerQuestionCsv", + "TestCombineCsv" + ], + "eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "1861" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionCombineCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." + }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]", + "label": "AnswerQuestionCombineCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestReadFile" ], - "eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15", + "eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95", "ground": { "answer": "The csv sorted by date", "eval": { @@ -638,13 +745,52 @@ "color": "grey", "data": { "category": [ - "data" + "data", + "general" + ], + "cutoff": 60, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970", + "ground": { + "answer": "The correct amount spent on utilities.", + "eval": { + "type": "file" + }, + "files": [ + "output.txt" + ], + "should_contain": [ + "84" + ] + }, + "info": { + "description": "Tests if the agent can answer a question from a small csv", + "difficulty": "intermediate", + "side_effects": [ + "" + ] + }, + "name": "TestAnswerQuestionSmallCsv", + "task": "How much was spent on utilities in total ? Write the answer in an output.txt file." + }, + "id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]", + "label": "AnswerQuestionSmallCsv", + "shape": "dot" + }, + { + "color": "grey", + "data": { + "category": [ + "data", + "general" ], "cutoff": 60, "dependencies": [ "TestLabelCsv" ], - "eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375", + "eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b", "ground": { "answer": "The csv data is combined", "eval": { @@ -681,7 +827,7 @@ "dependencies": [ "TestSortCsv" ], - "eval_id": "6c58e229-aa22-4c4f-a053-4a78931ad41e", + "eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac", "ground": { "answer": "The csv labelled", "eval": { @@ -712,13 +858,14 @@ "color": "grey", "data": { "category": [ - "scrape_synthesize" + "scrape_synthesize", + "general" ], "cutoff": 240, "dependencies": [ "TestReadFile" ], - "eval_id": "76e4c56c-8d57-423e-9cc1-1fff5f58dee6", + "eval_id": "895ae28a-4513-44ea-a872-0164771d1597", "ground": { "answer": "A report highlighting elements from the 2 files.", "eval": {