mirror of
https://github.com/aljazceru/Auto-GPT.git
synced 2026-02-04 13:54:31 +01:00
Add more data challenges (#5390)
Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
This commit is contained in:
138
autogpts/forge/poetry.lock
generated
138
autogpts/forge/poetry.lock
generated
@@ -368,75 +368,63 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "cffi"
|
||||
version = "1.15.1"
|
||||
version = "1.16.0"
|
||||
description = "Foreign Function Interface for Python calling C code."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"},
|
||||
{file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"},
|
||||
{file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"},
|
||||
{file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"},
|
||||
{file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"},
|
||||
{file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"},
|
||||
{file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"},
|
||||
{file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"},
|
||||
{file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -794,15 +782,19 @@ test-no-images = ["pytest", "pytest-cov", "wurlitzer"]
|
||||
|
||||
[[package]]
|
||||
name = "cycler"
|
||||
version = "0.11.0"
|
||||
version = "0.12.0"
|
||||
description = "Composable style cycles"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"},
|
||||
{file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"},
|
||||
{file = "cycler-0.12.0-py3-none-any.whl", hash = "sha256:7896994252d006771357777d0251f3e34d266f4fa5f2c572247a80ab01440947"},
|
||||
{file = "cycler-0.12.0.tar.gz", hash = "sha256:8cc3a7b4861f91b1095157f9916f748549a617046e67eb7619abed9b34d2c94a"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
docs = ["ipython", "matplotlib", "numpydoc", "sphinx"]
|
||||
tests = ["pytest", "pytest-cov", "pytest-xdist"]
|
||||
|
||||
[[package]]
|
||||
name = "decorator"
|
||||
version = "5.1.1"
|
||||
@@ -3656,13 +3648,13 @@ anyio = ">=3.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "wcwidth"
|
||||
version = "0.2.6"
|
||||
version = "0.2.7"
|
||||
description = "Measures the displayed width of unicode strings in a terminal"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "wcwidth-0.2.6-py2.py3-none-any.whl", hash = "sha256:795b138f6875577cd91bba52baf9e445cd5118fd32723b460e30a0af30ea230e"},
|
||||
{file = "wcwidth-0.2.6.tar.gz", hash = "sha256:a5220780a404dbe3353789870978e472cfe477761f06ee55077256e509b156d0"},
|
||||
{file = "wcwidth-0.2.7-py2.py3-none-any.whl", hash = "sha256:fabf3e32999d9b0dab7d19d845149f326f04fe29bac67709ee071dbd92640a36"},
|
||||
{file = "wcwidth-0.2.7.tar.gz", hash = "sha256:1b6d30a98ddd5ce9bbdb33658191fd2423fc9da203fe3ef1855407dcb7ee4e26"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1",
|
||||
"eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8",
|
||||
"ground": {
|
||||
"answer": "The content of output.txt should be 'Hello World!'",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [],
|
||||
"eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0",
|
||||
"eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
|
||||
"ground": {
|
||||
"answer": "The word 'Washington', printed to a .txt file named anything",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "fc6643da-8965-4f1d-b692-5ac8d56a3de8",
|
||||
"ground": {
|
||||
"answer": "It should copy the above text with no loss",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
"dependencies": [
|
||||
"TestRememberGoalSimple"
|
||||
],
|
||||
"eval_id": "ae4ef0b8-9fe1-4a24-beda-8cae5eb9b9e5",
|
||||
"ground": {
|
||||
"answer": "It should copy the above text with no loss",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestBasicRetrieval"
|
||||
],
|
||||
"eval_id": "c2b3d19d-c179-4132-bf7b-b3f9dc629f58",
|
||||
"eval_id": "2d64d7a5-d664-4b86-9921-0b5e3aa9cf91",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval_1.0"
|
||||
],
|
||||
"eval_id": "1ab338b1-2213-4c21-8d14-17baf8237416",
|
||||
"eval_id": "b79898bb-263a-4184-8e4d-0aa52838bfdb",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval1.1"
|
||||
],
|
||||
"eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d",
|
||||
"eval_id": "838128f9-79ee-45cf-8a8f-c19b0d576a76",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestDebugSimpleTypoWithGuidance"
|
||||
],
|
||||
"eval_id": "5d0b8017-8231-4494-be7c-188f1fac24e7",
|
||||
"eval_id": "38671c68-89ea-4c51-92a5-1bc35a033c49",
|
||||
"ground": {
|
||||
"answer": "Same as the original DebugWithGuidance test - [0, 1] [2, 5] [0, 3]",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval1.0"
|
||||
],
|
||||
"eval_id": "09fed110-077a-4b99-8821-ed071977cebe",
|
||||
"eval_id": "9d4894d8-6f7c-465a-bc91-ca79a21b6ca3",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestBasicRetrieval"
|
||||
],
|
||||
"eval_id": "7fc79ad5-fd08-451a-8782-5beeba6822f4",
|
||||
"eval_id": "261ee06f-a7b0-4d5c-bf92-3197763caba6",
|
||||
"ground": {
|
||||
"answer": "\u00a325.89",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestWritingCLIFileOrganizer"
|
||||
],
|
||||
"eval_id": "270543c8-e101-44e8-8c3f-c2317afd54de",
|
||||
"eval_id": "94ef736e-c2f1-4fa9-8cbf-a1c0873ee1ee",
|
||||
"ground": {
|
||||
"answer": "A web app where we can list animals and have details about dogs.",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "f5079f72-ba53-4273-b7e9-fbbd66279ce7",
|
||||
"eval_id": "15686763-9be7-41e0-902a-80a99fd88089",
|
||||
"ground": {
|
||||
"answer": "password_generator.py is created and satisfies the requirements.",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "50ec32d0-f957-42ee-a01d-160c77aaf4ae",
|
||||
"eval_id": "bb23fa8c-6df9-410e-8845-bb2d1ebe0c12",
|
||||
"ground": {
|
||||
"answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestPasswordGeneratorEasy"
|
||||
],
|
||||
"eval_id": "b1d6a2c5-6382-4e58-99e3-9ceb2cc02f28",
|
||||
"eval_id": "d6bbefcc-0ee5-4190-b8a1-3721d016f849",
|
||||
"ground": {
|
||||
"answer": "The correct python file is written and organizes the files accordingly",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestReturnCodeSimple"
|
||||
],
|
||||
"eval_id": "d36bf9f5-ee0f-4d84-b4d8-71f20e8454d5",
|
||||
"eval_id": "a59a1904-e9d6-443b-adb7-2e1ff972843f",
|
||||
"ground": {
|
||||
"answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestReturnCodeWrite"
|
||||
],
|
||||
"eval_id": "cffd9e7f-a1d5-478b-a04d-9504f372639a",
|
||||
"eval_id": "092f3c8a-9723-4262-8e40-93d0cebba98a",
|
||||
"ground": {
|
||||
"answer": "def multiply_int(num, multiplier):\n return num * multiplier\n",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestReturnCodeModify"
|
||||
],
|
||||
"eval_id": "c1d35de4-3a0c-410f-95d2-745009de9ee8",
|
||||
"eval_id": "d39b8ed1-5984-40b0-8de6-a1c5eec30bc7",
|
||||
"ground": {
|
||||
"answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "9984c549-b44d-40f7-8eff-1901ae35f050",
|
||||
"eval_id": "a758335b-539b-4d8a-b90e-cf7036952294",
|
||||
"ground": {
|
||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestDebugSimpleTypoWithGuidance"
|
||||
],
|
||||
"eval_id": "3cebb926-ec58-4ad6-ab2e-a2d4a7f6f30d",
|
||||
"eval_id": "1d171b68-0374-4b08-ae6a-c7223f89ecc1",
|
||||
"ground": {
|
||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestDebugSimpleTypoWithoutGuidance"
|
||||
],
|
||||
"eval_id": "6fbabc52-c9b0-43e7-afbc-4b8d64c1e4da",
|
||||
"eval_id": "f537c143-ab40-4a95-8cf2-ab90cb829ca5",
|
||||
"ground": {
|
||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestFunctionCodeGeneration"
|
||||
],
|
||||
"eval_id": "8481ff39-9edf-49eb-ab88-ac5368523b71",
|
||||
"eval_id": "a38396b8-8f61-49fc-a973-0876a4b6b5e9",
|
||||
"ground": {
|
||||
"answer": "The three_sum function coded properly.",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestReturnCodeWrite"
|
||||
],
|
||||
"eval_id": "55a60811-a3a1-4a31-b939-77d52dac1749",
|
||||
"eval_id": "c6703d23-7d2d-4b9b-a729-8014df9a7b4e",
|
||||
"ground": {
|
||||
"answer": "The two_sum function coded properly.",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "bd922943-ffc6-4471-b3f1-5bd8e875c1d8",
|
||||
"eval_id": "6ff65567-eb1e-4c7d-8b7f-dfc91dc95ed1",
|
||||
"ground": {
|
||||
"answer": "1. Search for flights to Toronto.\n2. Select the cheapest flight.\n3. Book the flight.\n4. Pay for the flight.\n5. Confirm the flight.",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "1ce0ccdd-cbe3-4000-a2a4-86d9c147fcfe",
|
||||
"eval_id": "e3794d05-7617-4b95-9e96-d5f84309f66f",
|
||||
"ground": {
|
||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
||||
"eval": {
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
Hello World!
|
||||
@@ -1 +0,0 @@
|
||||
Hello World!
|
||||
@@ -1 +0,0 @@
|
||||
Hello World!
|
||||
@@ -1,31 +0,0 @@
|
||||
{
|
||||
"category": [
|
||||
"interface"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "11b6f96f-6712-4a70-98b7-882ee36d7f2a",
|
||||
"ground": {
|
||||
"answer": "The content of output.txt should be 'Hello World!'",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"Hello World!"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "tests the ability for an agent to read a file.",
|
||||
"difficulty": "interface",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "ReadFile",
|
||||
"task": "Read the file called file_to_read.txt and write its content to a file called output.txt"
|
||||
}
|
||||
@@ -1,2 +0,0 @@
|
||||
This is a Heading
|
||||
This is a paragraph.
|
||||
@@ -1,36 +0,0 @@
|
||||
{
|
||||
"category": [
|
||||
"interface"
|
||||
],
|
||||
"cutoff": 120,
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "3af8ea29-ae35-45dd-ab14-7a995ba4fc93",
|
||||
"ground": {
|
||||
"answer": "This is a Heading\nThis is a paragraph.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
".txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"Heading",
|
||||
"paragraph"
|
||||
],
|
||||
"should_not_contain": [
|
||||
"The",
|
||||
"the"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if an llm can search",
|
||||
"difficulty": "interface",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "Search",
|
||||
"task": "Open 'https://silennaihin.com/random/plain.html' and paste all of the text on the page in a .txt file"
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
Washington
|
||||
@@ -1,30 +0,0 @@
|
||||
{
|
||||
"category": [
|
||||
"interface"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [],
|
||||
"eval_id": "5242ffc3-a7a9-4a92-a72a-2141f1206f76",
|
||||
"ground": {
|
||||
"answer": "The word 'Washington', printed to a .txt file named anything",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
".txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"Washington"
|
||||
],
|
||||
"should_not_contain": []
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests the agents ability to write to a file",
|
||||
"difficulty": "interface",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "WriteFile",
|
||||
"task": "Write the word 'Washington' to a .txt file"
|
||||
}
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "0f0e9d4f-418d-4b43-9f05-20134dc77ed8",
|
||||
"eval_id": "00227b7f-8952-4577-bfdb-c75db9f1fb19",
|
||||
"ground": {
|
||||
"answer": "2314",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestBasicMemory"
|
||||
],
|
||||
"eval_id": "ce6e588b-3ac6-45bb-892e-cfed7c7724bf",
|
||||
"eval_id": "9c48c169-0cf0-46d0-9985-a31710bf398b",
|
||||
"ground": {
|
||||
"answer": "3145\n3791\n9317\n9471",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestRememberMultipleIds"
|
||||
],
|
||||
"eval_id": "29a88045-67cf-45c1-a509-9a5f53a61ad5",
|
||||
"eval_id": "480eaff6-e287-4741-9fe0-a4634e0ad491",
|
||||
"ground": {
|
||||
"answer": "3145\n3791\n9317\n9471",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestRememberMultipleIdsWithNoise"
|
||||
],
|
||||
"eval_id": "1c5d4196-4171-4236-a944-d2905d2dc3c1",
|
||||
"eval_id": "0c8cb5e5-7f7a-4475-977b-68ac2673d77a",
|
||||
"ground": {
|
||||
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestBasicRetrieval"
|
||||
],
|
||||
"eval_id": "061eab17-1104-4ccf-ad28-6614cd1d12ea",
|
||||
"eval_id": "c379905b-b7d7-49ea-89d8-9b0c113db75f",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval1.0"
|
||||
],
|
||||
"eval_id": "85be4356-043c-46a3-91db-23f4bc9d32b0",
|
||||
"eval_id": "d9a4b0ff-628c-42d2-99ad-ab6053f1c5bb",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval1.1"
|
||||
],
|
||||
"eval_id": "02c4f6ec-3246-4677-afb9-c9c836fd78a6",
|
||||
"eval_id": "e92b86ba-63f3-4322-8f98-4970190d1e69",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestSearch"
|
||||
],
|
||||
"eval_id": "2066032d-2228-41d6-b05d-420fff5b1ce0",
|
||||
"eval_id": "8322a2c3-19e9-46ee-9ae0-6de0ae95becc",
|
||||
"ground": {
|
||||
"answer": "\u00a325.89",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval1.2"
|
||||
],
|
||||
"eval_id": "202ee93e-bae1-4889-9e9f-88fb0ed9fe00",
|
||||
"eval_id": "77df2ad6-ae8f-42f0-9a94-fc92c9f88fdd",
|
||||
"ground": {
|
||||
"answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "c61408a4-387d-4731-9ece-2f77b968f0b8",
|
||||
"eval_id": "d5a39a76-c804-4478-a022-9a808db6152a",
|
||||
"ground": {
|
||||
"answer": "It should copy the above text with no loss",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestRememberGoal_Simple"
|
||||
],
|
||||
"eval_id": "fc2e0963-2f22-49c1-b566-ab7022212687",
|
||||
"eval_id": "aae6a6eb-fa86-498a-9178-b7be733c6ffc",
|
||||
"ground": {
|
||||
"answer": "It should copy the above text with no loss",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestRememberGoal_Medium"
|
||||
],
|
||||
"eval_id": "de0f553d-194f-4853-9646-eb035133fd61",
|
||||
"eval_id": "29241c0f-594f-4843-b0e2-8230cb8784fd",
|
||||
"ground": {
|
||||
"answer": "It should copy the above text with no loss",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestRememberGoal_Advanced"
|
||||
],
|
||||
"eval_id": "6e79c281-cda3-4604-b60d-3629bbc5faba",
|
||||
"eval_id": "290272fa-36e1-4c75-b58f-eb76f4a938b7",
|
||||
"ground": {
|
||||
"answer": "It should copy the above text with no loss",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "7bfe83c5-ba16-4fba-9edb-d0e8837c0abc",
|
||||
"eval_id": "0bc68658-389f-4427-94af-9b925df7afe4",
|
||||
"ground": {
|
||||
"answer": "All of the elements should be contained in the output files",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestWrite5FilesWithArray"
|
||||
],
|
||||
"eval_id": "54c4fd15-e8e5-4783-94b6-2e25ef8ef388",
|
||||
"eval_id": "0609fd6e-a753-4f50-89a0-0dc81ec58994",
|
||||
"ground": {
|
||||
"answer": "It should copy the above text with no loss",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "c7372901-5fc0-4431-8c87-302fe3583ab7",
|
||||
"eval_id": "d14d6a59-a355-424c-a24b-a8aca580e32c",
|
||||
"ground": {
|
||||
"answer": "Should be a number within a 50 USD range of the actual price according to coingecko.",
|
||||
"eval": {
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
{
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "29a10990-2584-4602-8b9d-c217f6edbc4f",
|
||||
"eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4",
|
||||
"ground": {
|
||||
"answer": "The three_sum function coded properly.",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestThreeSum"
|
||||
],
|
||||
"eval_id": "0823b577-64f2-477b-856d-16726fe464b0",
|
||||
"eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f",
|
||||
"ground": {
|
||||
"answer": "password_generator.py is created and satisfies the requirements.",
|
||||
"eval": {
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
{
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 90,
|
||||
"dependencies": [
|
||||
"TestPasswordGenerator"
|
||||
],
|
||||
"eval_id": "6ace62be-6c18-431a-947f-72fb20984b58",
|
||||
"eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e",
|
||||
"ground": {
|
||||
"answer": "The correct python file is written and organizes the files accordingly",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestFileOrganizer"
|
||||
],
|
||||
"eval_id": "41ca1035-ceca-4e0c-91ab-66ed0b350273",
|
||||
"eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c",
|
||||
"ground": {
|
||||
"answer": "The correct python file for a basic url shortener CLI",
|
||||
"eval": {
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
{
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 150,
|
||||
"dependencies": [
|
||||
"TestUrlShortener"
|
||||
],
|
||||
"eval_id": "54c3d7e9-71d6-476b-b045-cf0aaf118f95",
|
||||
"eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0",
|
||||
"ground": {
|
||||
"answer": "The correct python file for a TicTacToe game is written",
|
||||
"eval": {
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
{
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 90,
|
||||
"dependencies": [
|
||||
"TestTicTacToe",
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "5a32418d-1c3a-4af1-8dc4-8d4c29bed21a",
|
||||
"eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1",
|
||||
"ground": {
|
||||
"answer": "The implementation of battleship that passes all the tests.",
|
||||
"eval": {
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
{
|
||||
"category": [
|
||||
"data"
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15",
|
||||
"eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95",
|
||||
"ground": {
|
||||
"answer": "The csv sorted by date",
|
||||
"eval": {
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestSortCsv"
|
||||
],
|
||||
"eval_id": "6c58e229-aa22-4c4f-a053-4a78931ad41e",
|
||||
"eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac",
|
||||
"ground": {
|
||||
"answer": "The csv labelled",
|
||||
"eval": {
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
{
|
||||
"category": [
|
||||
"data"
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestLabelCsv"
|
||||
],
|
||||
"eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375",
|
||||
"eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b",
|
||||
"ground": {
|
||||
"answer": "The csv data is combined",
|
||||
"eval": {
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
Date Description Amount Category
|
||||
2023-01-01 Grocery Store 52.3 Groceries
|
||||
2023-01-02 Pharmacy 12.5 Healthcare
|
||||
2023-01-03 Gas Station 29.1 Transportation
|
||||
2023-01-04 Water 19 Utilities
|
||||
2023-01-05 Grocery Store 60.25 Groceries
|
||||
2023-01-06 Coffee Shop 4.5 Dining
|
||||
2023-01-07 Cinema Tickets 20 Entertainment
|
||||
2023-01-08 Book Store 30.4 Shopping
|
||||
2023-01-09 Restaurant Dinner 55.8 Dining
|
||||
2023-01-10 Electric Bill 65.35 Utilities
|
||||
2023-01-11 Grocery Store 45.1 Groceries
|
||||
|
@@ -0,0 +1 @@
|
||||
84
|
||||
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"category": [
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970",
|
||||
"ground": {
|
||||
"answer": "The correct amount spent on utilities.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"84"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can answer a question from a small csv",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "AnswerQuestionSmallCsv",
|
||||
"task": "How much was spent on utilities in total ? Write the answer in an output.txt file."
|
||||
}
|
||||
@@ -0,0 +1,305 @@
|
||||
Date Description Amount Category
|
||||
2023-01-01 Grocery Store 52.3 Groceries
|
||||
2023-01-02 Pharmacy 12.5 Healthcare
|
||||
2023-01-03 Gas Station 29.1 Transportation
|
||||
2023-01-04 Cinema Tickets 19 Entertainment
|
||||
2023-01-05 Grocery Store 60.25 Groceries
|
||||
2023-01-06 Coffee Shop 4.5 Dining
|
||||
2023-01-07 Cinema Tickets 20 Entertainment
|
||||
2023-01-08 Book Store 30.4 Shopping
|
||||
2023-01-09 Restaurant Dinner 55.8 Dining
|
||||
2023-01-10 Electric Bill 65.35 Utilities
|
||||
2023-01-11 Grocery Store 45.1 Groceries
|
||||
2023-01-12 Clothing Store 100.2 Shopping
|
||||
2023-01-13 Pharmacy 20.3 Healthcare
|
||||
2023-01-14 Coffee Shop 4.5 Dining
|
||||
2023-01-15 Restaurant Dinner 50 Dining
|
||||
2023-01-16 Gas Station 32.1 Transportation
|
||||
2023-01-17 Online Shopping 80 Shopping
|
||||
2023-01-18 Water Bill 20.35 Utilities
|
||||
2023-01-19 Grocery Store 55.6 Groceries
|
||||
2023-01-20 Gas Station 28 Transportation
|
||||
2023-01-21 Pharmacy 15.4 Healthcare
|
||||
2023-01-22 Phone Bill 40 Utilities
|
||||
2023-01-23 Cinema Tickets 20 Entertainment
|
||||
2023-01-24 Coffee Shop 5.5 Dining
|
||||
2023-01-25 Book Purchase 14 Shopping
|
||||
2023-01-26 Restaurant Lunch 30 Dining
|
||||
2023-01-27 Public Transport 20 Transportation
|
||||
2023-01-28 Grocery Store 58.25 Groceries
|
||||
2023-01-29 Online Shopping 70 Shopping
|
||||
2023-01-30 Grocery Store 62.1 Groceries
|
||||
2023-01-31 Medical Prescription 10.4 Healthcare
|
||||
2023-02-01 Gas Station 33 Transportation
|
||||
2023-02-02 Coffee Shop 6 Dining
|
||||
2023-02-03 Cinema Tickets 22 Entertainment
|
||||
2023-02-04 Book Store 28.4 Shopping
|
||||
2023-02-05 Internet Bill 50 Utilities
|
||||
2023-02-06 Grocery Store 60.1 Groceries
|
||||
2023-02-07 Clothing Store 120 Shopping
|
||||
2023-02-08 Grocery Store 58.25 Groceries
|
||||
2023-02-09 Coffee Shop 4.5 Dining
|
||||
2023-02-10 Electric Bill 70 Utilities
|
||||
2023-02-11 Grocery Store 50.1 Groceries
|
||||
2023-02-12 Public Transport 18 Transportation
|
||||
2023-02-13 Pharmacy 24 Healthcare
|
||||
2023-02-14 Restaurant Dinner 60 Dining
|
||||
2023-02-15 Medical Prescription 11.4 Healthcare
|
||||
2023-02-16 Gas Station 30 Transportation
|
||||
2023-02-17 Online Shopping 85 Shopping
|
||||
2023-02-18 Water Bill 18 Utilities
|
||||
2023-02-19 Grocery Store 53.6 Groceries
|
||||
2023-02-20 Public Transport 22 Transportation
|
||||
2023-02-21 Pharmacy 10 Healthcare
|
||||
2023-02-22 Phone Bill 42 Utilities
|
||||
2023-02-23 Cinema Tickets 24 Entertainment
|
||||
2023-02-24 Coffee Shop 6 Dining
|
||||
2023-02-25 Book Purchase 16 Shopping
|
||||
2023-02-26 Restaurant Lunch 28 Dining
|
||||
2023-02-27 Gas Station 34 Transportation
|
||||
2023-02-28 Grocery Store 56 Groceries
|
||||
2023-03-01 Online Shopping 90 Groceries
|
||||
2023-03-02 Dentist Appointment 130 Healthcare
|
||||
2023-03-03 Grocery Store 63.45 Groceries
|
||||
2023-03-04 Cinema Tickets 21 Entertainment
|
||||
2023-03-05 Coffee Shop 5.8 Dining
|
||||
2023-03-06 Electric Bill 67.5 Utilities
|
||||
2023-03-07 Gas Station 31.2 Transportation
|
||||
2023-03-08 Restaurant Dinner 58 Dining
|
||||
2023-03-09 Pharmacy 18.3 Healthcare
|
||||
2023-03-10 Grocery Store 64.7 Groceries
|
||||
2023-03-11 Book Store 25.4 Shopping
|
||||
2023-03-12 Online Shopping 78 Shopping
|
||||
2023-03-13 Coffee Shop 6.5 Dining
|
||||
2023-03-14 Museum Tickets 15 Entertainment
|
||||
2023-03-15 Internet Bill 52 Utilities
|
||||
2023-03-16 Public Transport 19.5 Transportation
|
||||
2023-03-17 Clothing Store 105.6 Shopping
|
||||
2023-03-18 Phone Bill 41 Utilities
|
||||
2023-03-19 Coffee Shop 5 Dining
|
||||
2023-03-20 Grocery Store 59.2 Groceries
|
||||
2023-03-21 Gas Station 29.8 Transportation
|
||||
2023-03-22 Restaurant Lunch 32 Dining
|
||||
2023-03-23 Pharmacy 16.5 Healthcare
|
||||
2023-03-24 Concert Tickets 50 Entertainment
|
||||
2023-03-25 Coffee Shop 5.5 Dining
|
||||
2023-03-26 Grocery Store 61.8 Groceries
|
||||
2023-03-27 Online Shopping 82 Shopping
|
||||
2023-03-28 Water Bill 19.35 Utilities
|
||||
2023-03-29 Public Transport 21 Transportation
|
||||
2023-03-30 Book Purchase 17 Shopping
|
||||
2023-03-31 Grocery Store 60 Groceries
|
||||
2023-04-01 Cinema Tickets 23 Entertainment
|
||||
2023-04-02 Pharmacy 17.4 Healthcare
|
||||
2023-04-03 Gas Station 33.5 Transportation
|
||||
2023-04-04 Restaurant Dinner 56.7 Dining
|
||||
2023-04-05 Grocery Store 65.3 Groceries
|
||||
2023-04-06 Coffee Shop 5.9 Dining
|
||||
2023-04-07 Online Shopping 87 Shopping
|
||||
2023-04-08 Electric Bill 69 Utilities
|
||||
2023-04-09 Clothing Store 112.5 Shopping
|
||||
2023-04-10 Grocery Store 57.4 Groceries
|
||||
2023-04-11 Book Store 26.3 Shopping
|
||||
2023-04-12 Gas Station 30.9 Transportation
|
||||
2023-04-13 Coffee Shop 6.8 Dining
|
||||
2023-04-14 Zoo Tickets 24 Entertainment
|
||||
2023-04-15 Internet Bill 53 Utilities
|
||||
2023-04-16 Public Transport 20.5 Transportation
|
||||
2023-04-17 Restaurant Lunch 34 Dining
|
||||
2023-04-18 Phone Bill 43 Utilities
|
||||
2023-04-19 Coffee Shop 5.2 Dining
|
||||
2023-04-20 Grocery Store 58.9 Groceries
|
||||
2023-04-21 Pharmacy 14.7 Healthcare
|
||||
2023-04-22 Cinema Tickets 25 Entertainment
|
||||
2023-04-23 Online Shopping 90 Shopping
|
||||
2023-04-24 Gas Station 31.4 Transportation
|
||||
2023-04-25 Water Bill 21 Utilities
|
||||
2023-04-26 Grocery Store 62.5 Groceries
|
||||
2023-04-27 Coffee Shop 5.7 Dining
|
||||
2023-04-28 Book Purchase 18.5 Shopping
|
||||
2023-04-29 Public Transport 22 Transportation
|
||||
2023-04-30 Grocery Store 63 Groceries
|
||||
2023-05-01 Theater Tickets 45 Entertainment
|
||||
2023-05-02 Dentist Appointment 135 Healthcare
|
||||
2023-05-03 Gas Station 32.2 Transportation
|
||||
2023-05-04 Restaurant Dinner 59 Dining
|
||||
2023-05-05 Grocery Store 66.1 Groceries
|
||||
2023-05-06 Coffee Shop 6 Dining
|
||||
2023-05-07 Online Shopping 89 Shopping
|
||||
2023-05-08 Electric Bill 70.5 Utilities
|
||||
2023-05-09 Clothing Store 110 Shopping
|
||||
2023-05-10 Grocery Store 59.7 Groceries
|
||||
2023-05-11 Coffee Shop 6.1 Dining
|
||||
2023-05-12 Book Store 29.2 Shopping
|
||||
2023-05-13 Gas Station 29.9 Transportation
|
||||
2023-05-14 Museum Tickets 16 Entertainment
|
||||
2023-05-15 Internet Bill 52.5 Utilities
|
||||
2023-05-16 Public Transport 21.3 Transportation
|
||||
2023-05-17 Restaurant Lunch 35.4 Dining
|
||||
2023-05-18 Phone Bill 43.5 Utilities
|
||||
2023-05-19 Grocery Store 64.8 Groceries
|
||||
2023-05-20 Pharmacy 15.2 Healthcare
|
||||
2023-05-21 Cinema Tickets 26 Entertainment
|
||||
2023-05-22 Coffee Shop 6.3 Dining
|
||||
2023-05-23 Gas Station 30.8 Transportation
|
||||
2023-05-24 Online Shopping 92.5 Shopping
|
||||
2023-05-25 Water Bill 20.5 Utilities
|
||||
2023-05-26 Grocery Store 61.9 Groceries
|
||||
2023-05-27 Public Transport 23 Transportation
|
||||
2023-05-28 Book Purchase 19 Shopping
|
||||
2023-05-29 Coffee Shop 5.9 Dining
|
||||
2023-05-30 Restaurant Dinner 57.8 Dining
|
||||
2023-05-31 Grocery Store 66.7 Groceries
|
||||
2023-06-01 Theater Tickets 47 Entertainment
|
||||
2023-06-02 Dentist Appointment 140 Healthcare
|
||||
2023-06-03 Gas Station 31.6 Transportation
|
||||
2023-06-04 Coffee Shop 6.4 Dining
|
||||
2023-06-05 Online Shopping 94 Shopping
|
||||
2023-06-06 Electric Bill 72 Utilities
|
||||
2023-06-07 Restaurant Lunch 36 Dining
|
||||
2023-06-08 Grocery Store 65.3 Groceries
|
||||
2023-06-09 Pharmacy 17 Healthcare
|
||||
2023-06-10 Cinema Tickets 27.5 Entertainment
|
||||
2023-06-11 Public Transport 21.5 Transportation
|
||||
2023-06-12 Book Store 30 Shopping
|
||||
2023-06-13 Gas Station 28.7 Transportation
|
||||
2023-06-14 Coffee Shop 6.6 Dining
|
||||
2023-06-15 Internet Bill 53.5 Utilities
|
||||
2023-06-16 Zoo Tickets 28 Entertainment
|
||||
2023-06-17 Grocery Store 67.4 Groceries
|
||||
2023-06-18 Phone Bill 44 Utilities
|
||||
2023-06-19 Restaurant Dinner 60 Dining
|
||||
2023-06-20 Coffee Shop 6.7 Dining
|
||||
2023-06-21 Public Transport 22.5 Transportation
|
||||
2023-06-22 Online Shopping 96 Shopping
|
||||
2023-06-23 Gas Station 32.4 Transportation
|
||||
2023-06-24 Cinema Tickets 29 Entertainment
|
||||
2023-06-25 Book Purchase 20 Shopping
|
||||
2023-06-26 Grocery Store 68.3 Groceries
|
||||
2023-06-27 Water Bill 22 Utilities
|
||||
2023-06-28 Pharmacy 18.5 Healthcare
|
||||
2023-06-29 Restaurant Lunch 37 Dining
|
||||
2023-06-30 Coffee Shop 7 Dining
|
||||
2023-07-01 Grocery Store 69.5 Groceries
|
||||
2023-07-02 Theater Tickets 49 Entertainment
|
||||
2023-07-03 Gas Station 33.2 Transportation
|
||||
2023-07-04 Park Picnic 40 Dining
|
||||
2023-07-05 Electric Bill 73.5 Utilities
|
||||
2023-07-06 Clothing Store 120 Shopping
|
||||
2023-07-07 Online Shopping 98 Shopping
|
||||
2023-07-08 Grocery Store 70.6 Groceries
|
||||
2023-07-09 Coffee Shop 7.1 Dining
|
||||
2023-07-10 Internet Bill 54 Utilities
|
||||
2023-07-11 Public Transport 23.5 Transportation
|
||||
2023-07-12 Museum Tickets 18 Entertainment
|
||||
2023-07-13 Book Store 31 Shopping
|
||||
2023-07-14 Gas Station 29.9 Transportation
|
||||
2023-07-15 Coffee Shop 7.2 Dining
|
||||
2023-07-16 Restaurant Dinner 62 Dining
|
||||
2023-07-17 Grocery Store 71.8 Groceries
|
||||
2023-07-18 Phone Bill 45 Utilities
|
||||
2023-07-19 Zoo Tickets 30 Entertainment
|
||||
2023-07-20 Coffee Shop 7.3 Dining
|
||||
2023-07-21 Public Transport 24 Transportation
|
||||
2023-07-22 Online Shopping 99.5 Shopping
|
||||
2023-07-23 Gas Station 34 Transportation
|
||||
2023-07-24 Cinema Tickets 31 Entertainment
|
||||
2023-07-25 Book Purchase 21.5 Shopping
|
||||
2023-07-26 Grocery Store 72.9 Groceries
|
||||
2023-07-27 Water Bill 23.5 Utilities
|
||||
2023-07-28 Pharmacy 19.5 Healthcare
|
||||
2023-07-29 Restaurant Lunch 38.5 Dining
|
||||
2023-07-30 Coffee Shop 7.4 Dining
|
||||
2023-07-31 Grocery Store 73.7 Groceries
|
||||
2023-08-01 Theater Tickets 50 Entertainment
|
||||
2023-08-02 Gas Station 34.5 Transportation
|
||||
2023-08-03 Restaurant Dinner 63.5 Dining
|
||||
2023-08-04 Online Shopping 101 Shopping
|
||||
2023-08-05 Electric Bill 75 Utilities
|
||||
2023-08-06 Grocery Store 74.6 Groceries
|
||||
2023-08-07 Coffee Shop 7.5 Dining
|
||||
2023-08-08 Phone Bill 46 Utilities
|
||||
2023-08-09 Public Transport 24.5 Transportation
|
||||
2023-08-10 Cinema Tickets 32.5 Entertainment
|
||||
2023-08-11 Book Store 32 Shopping
|
||||
2023-08-12 Gas Station 35 Transportation
|
||||
2023-08-13 Coffee Shop 7.6 Dining
|
||||
2023-08-14 Park Picnic 42 Dining
|
||||
2023-08-15 Internet Bill 55 Utilities
|
||||
2023-08-16 Grocery Store 76.3 Groceries
|
||||
2023-08-17 Clothing Store 125 Shopping
|
||||
2023-08-18 Pharmacy 20.5 Healthcare
|
||||
2023-08-19 Restaurant Lunch 40 Dining
|
||||
2023-08-20 Coffee Shop 7.7 Dining
|
||||
2023-08-21 Museum Tickets 19 Entertainment
|
||||
2023-08-22 Public Transport 25 Transportation
|
||||
2023-08-23 Online Shopping 103 Shopping
|
||||
2023-08-24 Grocery Store 77.8 Groceries
|
||||
2023-08-25 Water Bill 24.5 Utilities
|
||||
2023-08-26 Zoo Tickets 32 Entertainment
|
||||
2023-08-27 Coffee Shop 7.8 Dining
|
||||
2023-08-28 Gas Station 35.5 Transportation
|
||||
2023-08-29 Book Purchase 23 Shopping
|
||||
2023-08-30 Grocery Store 78.9 Groceries
|
||||
2023-08-31 Cinema Tickets 34 Entertainment
|
||||
2023-09-01 Theater Tickets 52 Entertainment
|
||||
2023-09-02 Gas Station 36 Transportation
|
||||
2023-09-03 Restaurant Dinner 65 Dining
|
||||
2023-09-04 Online Shopping 105 Shopping
|
||||
2023-09-05 Electric Bill 76.5 Utilities
|
||||
2023-09-06 Grocery Store 79.6 Groceries
|
||||
2023-09-07 Coffee Shop 8 Dining
|
||||
2023-09-08 Phone Bill 47 Utilities
|
||||
2023-09-09 Public Transport 26 Transportation
|
||||
2023-09-10 Cinema Tickets 35.5 Entertainment
|
||||
2023-09-11 Book Store 33 Shopping
|
||||
2023-09-12 Gas Station 36.5 Transportation
|
||||
2023-09-13 Coffee Shop 8.2 Dining
|
||||
2023-09-14 Park Picnic 44 Dining
|
||||
2023-09-15 Internet Bill 56 Utilities
|
||||
2023-09-16 Grocery Store 80.4 Groceries
|
||||
2023-09-17 Clothing Store 130 Shopping
|
||||
2023-09-18 Pharmacy 21.5 Healthcare
|
||||
2023-09-19 Restaurant Lunch 41.5 Dining
|
||||
2023-09-20 Coffee Shop 8.4 Dining
|
||||
2023-09-21 Museum Tickets 20 Entertainment
|
||||
2023-09-22 Public Transport 26.5 Transportation
|
||||
2023-09-23 Online Shopping 107 Shopping
|
||||
2023-09-24 Grocery Store 81.3 Groceries
|
||||
2023-09-25 Water Bill 25.5 Utilities
|
||||
2023-09-26 Zoo Tickets 33.5 Entertainment
|
||||
2023-09-27 Coffee Shop 8.6 Dining
|
||||
2023-09-28 Gas Station 37.5 Transportation
|
||||
2023-09-29 Book Purchase 24.5 Shopping
|
||||
2023-09-30 Grocery Store 82.7 Groceries
|
||||
2023-10-01 Cinema Tickets 36 Entertainment
|
||||
2023-10-02 Theater Tickets 54 Entertainment
|
||||
2023-10-03 Gas Station 38 Transportation
|
||||
2023-10-04 Restaurant Dinner 66.5 Dining
|
||||
2023-10-05 Online Shopping 109 Shopping
|
||||
2023-10-06 Electric Bill 78 Utilities
|
||||
2023-10-07 Grocery Store 83.9 Groceries
|
||||
2023-10-08 Coffee Shop 8.8 Dining
|
||||
2023-10-09 Phone Bill 48 Utilities
|
||||
2023-10-10 Public Transport 27.5 Transportation
|
||||
2023-10-11 Cinema Tickets 37.5 Entertainment
|
||||
2023-10-12 Book Store 34.5 Shopping
|
||||
2023-10-13 Gas Station 39.5 Transportation
|
||||
2023-10-14 Coffee Shop 9 Dining
|
||||
2023-10-15 Park Picnic 46 Dining
|
||||
2023-10-16 Internet Bill 57.5 Utilities
|
||||
2023-10-17 Grocery Store 85.2 Groceries
|
||||
2023-10-18 Clothing Store 135 Shopping
|
||||
2023-10-19 Pharmacy 22.5 Healthcare
|
||||
2023-10-20 Restaurant Lunch 43 Dining
|
||||
2023-10-21 Coffee Shop 9.2 Dining
|
||||
2023-10-22 Museum Tickets 21.5 Entertainment
|
||||
2023-10-23 Public Transport 28 Transportation
|
||||
2023-10-24 Online Shopping 111 Shopping
|
||||
2023-10-25 Grocery Store 86.5 Groceries
|
||||
2023-10-26 Water Bill 26.5 Utilities
|
||||
2023-10-27 Zoo Tickets 35 Entertainment
|
||||
2023-10-28 Coffee Shop 9.4 Dining
|
||||
2023-10-29 Gas Station 40.5 Transportation
|
||||
2023-10-30 Book Purchase 26 Shopping
|
||||
2023-10-31 Grocery Store 88 Groceries
|
||||
|
@@ -0,0 +1 @@
|
||||
1861.55
|
||||
@@ -0,0 +1,31 @@
|
||||
{
|
||||
"category": [
|
||||
"data"
|
||||
],
|
||||
"cutoff": 90,
|
||||
"dependencies": [
|
||||
"TestAnswerQuestionSmallCsv"
|
||||
],
|
||||
"eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732",
|
||||
"ground": {
|
||||
"answer": "The correct amount spent on utilities.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"1861"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can answer a question from a csv",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "AnswerQuestionCsv",
|
||||
"task": "How much was spent on utilities in total ? Write the answer in an output.txt file."
|
||||
}
|
||||
@@ -0,0 +1,305 @@
|
||||
Category ID
|
||||
Dining 6
|
||||
Dining 9
|
||||
Dining 14
|
||||
Dining 15
|
||||
Dining 24
|
||||
Dining 26
|
||||
Dining 33
|
||||
Dining 40
|
||||
Dining 45
|
||||
Dining 55
|
||||
Dining 57
|
||||
Dining 64
|
||||
Dining 67
|
||||
Dining 72
|
||||
Dining 78
|
||||
Dining 81
|
||||
Dining 84
|
||||
Dining 94
|
||||
Dining 96
|
||||
Dining 103
|
||||
Dining 107
|
||||
Dining 109
|
||||
Dining 117
|
||||
Dining 124
|
||||
Dining 126
|
||||
Dining 131
|
||||
Dining 137
|
||||
Dining 142
|
||||
Dining 149
|
||||
Dining 150
|
||||
Dining 155
|
||||
Dining 158
|
||||
Dining 165
|
||||
Dining 170
|
||||
Dining 171
|
||||
Dining 180
|
||||
Dining 181
|
||||
Dining 185
|
||||
Dining 190
|
||||
Dining 196
|
||||
Dining 197
|
||||
Dining 201
|
||||
Dining 210
|
||||
Dining 211
|
||||
Dining 215
|
||||
Dining 219
|
||||
Dining 225
|
||||
Dining 226
|
||||
Dining 231
|
||||
Dining 232
|
||||
Dining 239
|
||||
Dining 246
|
||||
Dining 250
|
||||
Dining 256
|
||||
Dining 257
|
||||
Dining 262
|
||||
Dining 263
|
||||
Dining 270
|
||||
Dining 277
|
||||
Dining 281
|
||||
Dining 287
|
||||
Dining 288
|
||||
Dining 293
|
||||
Dining 294
|
||||
Dining 301
|
||||
Entertainment 4
|
||||
Entertainment 7
|
||||
Entertainment 23
|
||||
Entertainment 34
|
||||
Entertainment 54
|
||||
Entertainment 63
|
||||
Entertainment 73
|
||||
Entertainment 83
|
||||
Entertainment 91
|
||||
Entertainment 104
|
||||
Entertainment 112
|
||||
Entertainment 121
|
||||
Entertainment 134
|
||||
Entertainment 141
|
||||
Entertainment 152
|
||||
Entertainment 161
|
||||
Entertainment 167
|
||||
Entertainment 175
|
||||
Entertainment 183
|
||||
Entertainment 193
|
||||
Entertainment 200
|
||||
Entertainment 205
|
||||
Entertainment 213
|
||||
Entertainment 222
|
||||
Entertainment 233
|
||||
Entertainment 238
|
||||
Entertainment 243
|
||||
Entertainment 244
|
||||
Entertainment 253
|
||||
Entertainment 264
|
||||
Entertainment 269
|
||||
Entertainment 274
|
||||
Entertainment 275
|
||||
Entertainment 284
|
||||
Entertainment 295
|
||||
Entertainment 300
|
||||
Groceries 1
|
||||
Groceries 5
|
||||
Groceries 11
|
||||
Groceries 19
|
||||
Groceries 28
|
||||
Groceries 30
|
||||
Groceries 37
|
||||
Groceries 39
|
||||
Groceries 42
|
||||
Groceries 50
|
||||
Groceries 59
|
||||
Groceries 60
|
||||
Groceries 62
|
||||
Groceries 69
|
||||
Groceries 79
|
||||
Groceries 85
|
||||
Groceries 90
|
||||
Groceries 95
|
||||
Groceries 100
|
||||
Groceries 110
|
||||
Groceries 116
|
||||
Groceries 120
|
||||
Groceries 125
|
||||
Groceries 130
|
||||
Groceries 139
|
||||
Groceries 146
|
||||
Groceries 151
|
||||
Groceries 159
|
||||
Groceries 168
|
||||
Groceries 177
|
||||
Groceries 182
|
||||
Groceries 189
|
||||
Groceries 198
|
||||
Groceries 207
|
||||
Groceries 212
|
||||
Groceries 218
|
||||
Groceries 228
|
||||
Groceries 236
|
||||
Groceries 242
|
||||
Groceries 249
|
||||
Groceries 259
|
||||
Groceries 267
|
||||
Groceries 273
|
||||
Groceries 280
|
||||
Groceries 290
|
||||
Groceries 298
|
||||
Groceries 304
|
||||
Healthcare 2
|
||||
Healthcare 13
|
||||
Healthcare 21
|
||||
Healthcare 31
|
||||
Healthcare 44
|
||||
Healthcare 46
|
||||
Healthcare 52
|
||||
Healthcare 61
|
||||
Healthcare 68
|
||||
Healthcare 82
|
||||
Healthcare 92
|
||||
Healthcare 111
|
||||
Healthcare 122
|
||||
Healthcare 140
|
||||
Healthcare 153
|
||||
Healthcare 160
|
||||
Healthcare 179
|
||||
Healthcare 209
|
||||
Healthcare 230
|
||||
Healthcare 261
|
||||
Healthcare 292
|
||||
Shopping 8
|
||||
Shopping 12
|
||||
Shopping 17
|
||||
Shopping 25
|
||||
Shopping 29
|
||||
Shopping 35
|
||||
Shopping 38
|
||||
Shopping 48
|
||||
Shopping 56
|
||||
Shopping 70
|
||||
Shopping 71
|
||||
Shopping 76
|
||||
Shopping 86
|
||||
Shopping 89
|
||||
Shopping 97
|
||||
Shopping 99
|
||||
Shopping 101
|
||||
Shopping 113
|
||||
Shopping 118
|
||||
Shopping 127
|
||||
Shopping 129
|
||||
Shopping 132
|
||||
Shopping 144
|
||||
Shopping 148
|
||||
Shopping 156
|
||||
Shopping 163
|
||||
Shopping 173
|
||||
Shopping 176
|
||||
Shopping 187
|
||||
Shopping 188
|
||||
Shopping 194
|
||||
Shopping 203
|
||||
Shopping 206
|
||||
Shopping 216
|
||||
Shopping 223
|
||||
Shopping 229
|
||||
Shopping 235
|
||||
Shopping 241
|
||||
Shopping 247
|
||||
Shopping 254
|
||||
Shopping 260
|
||||
Shopping 266
|
||||
Shopping 272
|
||||
Shopping 278
|
||||
Shopping 285
|
||||
Shopping 291
|
||||
Shopping 297
|
||||
Shopping 303
|
||||
Transportation 3
|
||||
Transportation 16
|
||||
Transportation 20
|
||||
Transportation 27
|
||||
Transportation 32
|
||||
Transportation 43
|
||||
Transportation 47
|
||||
Transportation 51
|
||||
Transportation 58
|
||||
Transportation 66
|
||||
Transportation 75
|
||||
Transportation 80
|
||||
Transportation 88
|
||||
Transportation 93
|
||||
Transportation 102
|
||||
Transportation 106
|
||||
Transportation 114
|
||||
Transportation 119
|
||||
Transportation 123
|
||||
Transportation 133
|
||||
Transportation 136
|
||||
Transportation 143
|
||||
Transportation 147
|
||||
Transportation 154
|
||||
Transportation 162
|
||||
Transportation 164
|
||||
Transportation 172
|
||||
Transportation 174
|
||||
Transportation 184
|
||||
Transportation 192
|
||||
Transportation 195
|
||||
Transportation 202
|
||||
Transportation 204
|
||||
Transportation 214
|
||||
Transportation 221
|
||||
Transportation 224
|
||||
Transportation 234
|
||||
Transportation 240
|
||||
Transportation 245
|
||||
Transportation 252
|
||||
Transportation 255
|
||||
Transportation 265
|
||||
Transportation 271
|
||||
Transportation 276
|
||||
Transportation 283
|
||||
Transportation 286
|
||||
Transportation 296
|
||||
Transportation 302
|
||||
Utilities 10
|
||||
Utilities 18
|
||||
Utilities 22
|
||||
Utilities 36
|
||||
Utilities 41
|
||||
Utilities 49
|
||||
Utilities 53
|
||||
Utilities 65
|
||||
Utilities 74
|
||||
Utilities 77
|
||||
Utilities 87
|
||||
Utilities 98
|
||||
Utilities 105
|
||||
Utilities 108
|
||||
Utilities 115
|
||||
Utilities 128
|
||||
Utilities 135
|
||||
Utilities 138
|
||||
Utilities 145
|
||||
Utilities 157
|
||||
Utilities 166
|
||||
Utilities 169
|
||||
Utilities 178
|
||||
Utilities 186
|
||||
Utilities 191
|
||||
Utilities 199
|
||||
Utilities 208
|
||||
Utilities 217
|
||||
Utilities 220
|
||||
Utilities 227
|
||||
Utilities 237
|
||||
Utilities 248
|
||||
Utilities 251
|
||||
Utilities 258
|
||||
Utilities 268
|
||||
Utilities 279
|
||||
Utilities 282
|
||||
Utilities 289
|
||||
Utilities 299
|
||||
|
@@ -0,0 +1,305 @@
|
||||
Date Description Amount ID
|
||||
2023-01-01 Grocery Store 52.3 1
|
||||
2023-01-02 Pharmacy 12.5 2
|
||||
2023-01-03 Gas Station 29.1 3
|
||||
2023-01-04 Cinema Tickets 19 4
|
||||
2023-01-05 Grocery Store 60.25 5
|
||||
2023-01-06 Coffee Shop 4.5 6
|
||||
2023-01-07 Cinema Tickets 20 7
|
||||
2023-01-08 Book Store 30.4 8
|
||||
2023-01-09 Restaurant Dinner 55.8 9
|
||||
2023-01-10 Electric Bill 65.35 10
|
||||
2023-01-11 Grocery Store 45.1 11
|
||||
2023-01-12 Clothing Store 100.2 12
|
||||
2023-01-13 Pharmacy 20.3 13
|
||||
2023-01-14 Coffee Shop 4.5 14
|
||||
2023-01-15 Restaurant Dinner 50 15
|
||||
2023-01-16 Gas Station 32.1 16
|
||||
2023-01-17 Online Shopping 80 17
|
||||
2023-01-18 Water Bill 20.35 18
|
||||
2023-01-19 Grocery Store 55.6 19
|
||||
2023-01-20 Gas Station 28 20
|
||||
2023-01-21 Pharmacy 15.4 21
|
||||
2023-01-22 Phone Bill 40 22
|
||||
2023-01-23 Cinema Tickets 20 23
|
||||
2023-01-24 Coffee Shop 5.5 24
|
||||
2023-01-25 Book Purchase 14 25
|
||||
2023-01-26 Restaurant Lunch 30 26
|
||||
2023-01-27 Public Transport 20 27
|
||||
2023-01-28 Grocery Store 58.25 28
|
||||
2023-01-29 Online Shopping 70 29
|
||||
2023-01-30 Grocery Store 62.1 30
|
||||
2023-01-31 Medical Prescription 10.4 31
|
||||
2023-02-01 Gas Station 33 32
|
||||
2023-02-02 Coffee Shop 6 33
|
||||
2023-02-03 Cinema Tickets 22 34
|
||||
2023-02-04 Book Store 28.4 35
|
||||
2023-02-05 Internet Bill 50 36
|
||||
2023-02-06 Grocery Store 60.1 37
|
||||
2023-02-07 Clothing Store 120 38
|
||||
2023-02-08 Grocery Store 58.25 39
|
||||
2023-02-09 Coffee Shop 4.5 40
|
||||
2023-02-10 Electric Bill 70 41
|
||||
2023-02-11 Grocery Store 50.1 42
|
||||
2023-02-12 Public Transport 18 43
|
||||
2023-02-13 Pharmacy 24 44
|
||||
2023-02-14 Restaurant Dinner 60 45
|
||||
2023-02-15 Medical Prescription 11.4 46
|
||||
2023-02-16 Gas Station 30 47
|
||||
2023-02-17 Online Shopping 85 48
|
||||
2023-02-18 Water Bill 18 49
|
||||
2023-02-19 Grocery Store 53.6 50
|
||||
2023-02-20 Public Transport 22 51
|
||||
2023-02-21 Pharmacy 10 52
|
||||
2023-02-22 Phone Bill 42 53
|
||||
2023-02-23 Cinema Tickets 24 54
|
||||
2023-02-24 Coffee Shop 6 55
|
||||
2023-02-25 Book Purchase 16 56
|
||||
2023-02-26 Restaurant Lunch 28 57
|
||||
2023-02-27 Gas Station 34 58
|
||||
2023-02-28 Grocery Store 56 59
|
||||
2023-03-01 Online Shopping 90 60
|
||||
2023-03-02 Dentist Appointment 130 61
|
||||
2023-03-03 Grocery Store 63.45 62
|
||||
2023-03-04 Cinema Tickets 21 63
|
||||
2023-03-05 Coffee Shop 5.8 64
|
||||
2023-03-06 Electric Bill 67.5 65
|
||||
2023-03-07 Gas Station 31.2 66
|
||||
2023-03-08 Restaurant Dinner 58 67
|
||||
2023-03-09 Pharmacy 18.3 68
|
||||
2023-03-10 Grocery Store 64.7 69
|
||||
2023-03-11 Book Store 25.4 70
|
||||
2023-03-12 Online Shopping 78 71
|
||||
2023-03-13 Coffee Shop 6.5 72
|
||||
2023-03-14 Museum Tickets 15 73
|
||||
2023-03-15 Internet Bill 52 74
|
||||
2023-03-16 Public Transport 19.5 75
|
||||
2023-03-17 Clothing Store 105.6 76
|
||||
2023-03-18 Phone Bill 41 77
|
||||
2023-03-19 Coffee Shop 5 78
|
||||
2023-03-20 Grocery Store 59.2 79
|
||||
2023-03-21 Gas Station 29.8 80
|
||||
2023-03-22 Restaurant Lunch 32 81
|
||||
2023-03-23 Pharmacy 16.5 82
|
||||
2023-03-24 Concert Tickets 50 83
|
||||
2023-03-25 Coffee Shop 5.5 84
|
||||
2023-03-26 Grocery Store 61.8 85
|
||||
2023-03-27 Online Shopping 82 86
|
||||
2023-03-28 Water Bill 19.35 87
|
||||
2023-03-29 Public Transport 21 88
|
||||
2023-03-30 Book Purchase 17 89
|
||||
2023-03-31 Grocery Store 60 90
|
||||
2023-04-01 Cinema Tickets 23 91
|
||||
2023-04-02 Pharmacy 17.4 92
|
||||
2023-04-03 Gas Station 33.5 93
|
||||
2023-04-04 Restaurant Dinner 56.7 94
|
||||
2023-04-05 Grocery Store 65.3 95
|
||||
2023-04-06 Coffee Shop 5.9 96
|
||||
2023-04-07 Online Shopping 87 97
|
||||
2023-04-08 Electric Bill 69 98
|
||||
2023-04-09 Clothing Store 112.5 99
|
||||
2023-04-10 Grocery Store 57.4 100
|
||||
2023-04-11 Book Store 26.3 101
|
||||
2023-04-12 Gas Station 30.9 102
|
||||
2023-04-13 Coffee Shop 6.8 103
|
||||
2023-04-14 Zoo Tickets 24 104
|
||||
2023-04-15 Internet Bill 53 105
|
||||
2023-04-16 Public Transport 20.5 106
|
||||
2023-04-17 Restaurant Lunch 34 107
|
||||
2023-04-18 Phone Bill 43 108
|
||||
2023-04-19 Coffee Shop 5.2 109
|
||||
2023-04-20 Grocery Store 58.9 110
|
||||
2023-04-21 Pharmacy 14.7 111
|
||||
2023-04-22 Cinema Tickets 25 112
|
||||
2023-04-23 Online Shopping 90 113
|
||||
2023-04-24 Gas Station 31.4 114
|
||||
2023-04-25 Water Bill 21 115
|
||||
2023-04-26 Grocery Store 62.5 116
|
||||
2023-04-27 Coffee Shop 5.7 117
|
||||
2023-04-28 Book Purchase 18.5 118
|
||||
2023-04-29 Public Transport 22 119
|
||||
2023-04-30 Grocery Store 63 120
|
||||
2023-05-01 Theater Tickets 45 121
|
||||
2023-05-02 Dentist Appointment 135 122
|
||||
2023-05-03 Gas Station 32.2 123
|
||||
2023-05-04 Restaurant Dinner 59 124
|
||||
2023-05-05 Grocery Store 66.1 125
|
||||
2023-05-06 Coffee Shop 6 126
|
||||
2023-05-07 Online Shopping 89 127
|
||||
2023-05-08 Electric Bill 70.5 128
|
||||
2023-05-09 Clothing Store 110 129
|
||||
2023-05-10 Grocery Store 59.7 130
|
||||
2023-05-11 Coffee Shop 6.1 131
|
||||
2023-05-12 Book Store 29.2 132
|
||||
2023-05-13 Gas Station 29.9 133
|
||||
2023-05-14 Museum Tickets 16 134
|
||||
2023-05-15 Internet Bill 52.5 135
|
||||
2023-05-16 Public Transport 21.3 136
|
||||
2023-05-17 Restaurant Lunch 35.4 137
|
||||
2023-05-18 Phone Bill 43.5 138
|
||||
2023-05-19 Grocery Store 64.8 139
|
||||
2023-05-20 Pharmacy 15.2 140
|
||||
2023-05-21 Cinema Tickets 26 141
|
||||
2023-05-22 Coffee Shop 6.3 142
|
||||
2023-05-23 Gas Station 30.8 143
|
||||
2023-05-24 Online Shopping 92.5 144
|
||||
2023-05-25 Water Bill 20.5 145
|
||||
2023-05-26 Grocery Store 61.9 146
|
||||
2023-05-27 Public Transport 23 147
|
||||
2023-05-28 Book Purchase 19 148
|
||||
2023-05-29 Coffee Shop 5.9 149
|
||||
2023-05-30 Restaurant Dinner 57.8 150
|
||||
2023-05-31 Grocery Store 66.7 151
|
||||
2023-06-01 Theater Tickets 47 152
|
||||
2023-06-02 Dentist Appointment 140 153
|
||||
2023-06-03 Gas Station 31.6 154
|
||||
2023-06-04 Coffee Shop 6.4 155
|
||||
2023-06-05 Online Shopping 94 156
|
||||
2023-06-06 Electric Bill 72 157
|
||||
2023-06-07 Restaurant Lunch 36 158
|
||||
2023-06-08 Grocery Store 65.3 159
|
||||
2023-06-09 Pharmacy 17 160
|
||||
2023-06-10 Cinema Tickets 27.5 161
|
||||
2023-06-11 Public Transport 21.5 162
|
||||
2023-06-12 Book Store 30 163
|
||||
2023-06-13 Gas Station 28.7 164
|
||||
2023-06-14 Coffee Shop 6.6 165
|
||||
2023-06-15 Internet Bill 53.5 166
|
||||
2023-06-16 Zoo Tickets 28 167
|
||||
2023-06-17 Grocery Store 67.4 168
|
||||
2023-06-18 Phone Bill 44 169
|
||||
2023-06-19 Restaurant Dinner 60 170
|
||||
2023-06-20 Coffee Shop 6.7 171
|
||||
2023-06-21 Public Transport 22.5 172
|
||||
2023-06-22 Online Shopping 96 173
|
||||
2023-06-23 Gas Station 32.4 174
|
||||
2023-06-24 Cinema Tickets 29 175
|
||||
2023-06-25 Book Purchase 20 176
|
||||
2023-06-26 Grocery Store 68.3 177
|
||||
2023-06-27 Water Bill 22 178
|
||||
2023-06-28 Pharmacy 18.5 179
|
||||
2023-06-29 Restaurant Lunch 37 180
|
||||
2023-06-30 Coffee Shop 7 181
|
||||
2023-07-01 Grocery Store 69.5 182
|
||||
2023-07-02 Theater Tickets 49 183
|
||||
2023-07-03 Gas Station 33.2 184
|
||||
2023-07-04 Park Picnic 40 185
|
||||
2023-07-05 Electric Bill 73.5 186
|
||||
2023-07-06 Clothing Store 120 187
|
||||
2023-07-07 Online Shopping 98 188
|
||||
2023-07-08 Grocery Store 70.6 189
|
||||
2023-07-09 Coffee Shop 7.1 190
|
||||
2023-07-10 Internet Bill 54 191
|
||||
2023-07-11 Public Transport 23.5 192
|
||||
2023-07-12 Museum Tickets 18 193
|
||||
2023-07-13 Book Store 31 194
|
||||
2023-07-14 Gas Station 29.9 195
|
||||
2023-07-15 Coffee Shop 7.2 196
|
||||
2023-07-16 Restaurant Dinner 62 197
|
||||
2023-07-17 Grocery Store 71.8 198
|
||||
2023-07-18 Phone Bill 45 199
|
||||
2023-07-19 Zoo Tickets 30 200
|
||||
2023-07-20 Coffee Shop 7.3 201
|
||||
2023-07-21 Public Transport 24 202
|
||||
2023-07-22 Online Shopping 99.5 203
|
||||
2023-07-23 Gas Station 34 204
|
||||
2023-07-24 Cinema Tickets 31 205
|
||||
2023-07-25 Book Purchase 21.5 206
|
||||
2023-07-26 Grocery Store 72.9 207
|
||||
2023-07-27 Water Bill 23.5 208
|
||||
2023-07-28 Pharmacy 19.5 209
|
||||
2023-07-29 Restaurant Lunch 38.5 210
|
||||
2023-07-30 Coffee Shop 7.4 211
|
||||
2023-07-31 Grocery Store 73.7 212
|
||||
2023-08-01 Theater Tickets 50 213
|
||||
2023-08-02 Gas Station 34.5 214
|
||||
2023-08-03 Restaurant Dinner 63.5 215
|
||||
2023-08-04 Online Shopping 101 216
|
||||
2023-08-05 Electric Bill 75 217
|
||||
2023-08-06 Grocery Store 74.6 218
|
||||
2023-08-07 Coffee Shop 7.5 219
|
||||
2023-08-08 Phone Bill 46 220
|
||||
2023-08-09 Public Transport 24.5 221
|
||||
2023-08-10 Cinema Tickets 32.5 222
|
||||
2023-08-11 Book Store 32 223
|
||||
2023-08-12 Gas Station 35 224
|
||||
2023-08-13 Coffee Shop 7.6 225
|
||||
2023-08-14 Park Picnic 42 226
|
||||
2023-08-15 Internet Bill 55 227
|
||||
2023-08-16 Grocery Store 76.3 228
|
||||
2023-08-17 Clothing Store 125 229
|
||||
2023-08-18 Pharmacy 20.5 230
|
||||
2023-08-19 Restaurant Lunch 40 231
|
||||
2023-08-20 Coffee Shop 7.7 232
|
||||
2023-08-21 Museum Tickets 19 233
|
||||
2023-08-22 Public Transport 25 234
|
||||
2023-08-23 Online Shopping 103 235
|
||||
2023-08-24 Grocery Store 77.8 236
|
||||
2023-08-25 Water Bill 24.5 237
|
||||
2023-08-26 Zoo Tickets 32 238
|
||||
2023-08-27 Coffee Shop 7.8 239
|
||||
2023-08-28 Gas Station 35.5 240
|
||||
2023-08-29 Book Purchase 23 241
|
||||
2023-08-30 Grocery Store 78.9 242
|
||||
2023-08-31 Cinema Tickets 34 243
|
||||
2023-09-01 Theater Tickets 52 244
|
||||
2023-09-02 Gas Station 36 245
|
||||
2023-09-03 Restaurant Dinner 65 246
|
||||
2023-09-04 Online Shopping 105 247
|
||||
2023-09-05 Electric Bill 76.5 248
|
||||
2023-09-06 Grocery Store 79.6 249
|
||||
2023-09-07 Coffee Shop 8 250
|
||||
2023-09-08 Phone Bill 47 251
|
||||
2023-09-09 Public Transport 26 252
|
||||
2023-09-10 Cinema Tickets 35.5 253
|
||||
2023-09-11 Book Store 33 254
|
||||
2023-09-12 Gas Station 36.5 255
|
||||
2023-09-13 Coffee Shop 8.2 256
|
||||
2023-09-14 Park Picnic 44 257
|
||||
2023-09-15 Internet Bill 56 258
|
||||
2023-09-16 Grocery Store 80.4 259
|
||||
2023-09-17 Clothing Store 130 260
|
||||
2023-09-18 Pharmacy 21.5 261
|
||||
2023-09-19 Restaurant Lunch 41.5 262
|
||||
2023-09-20 Coffee Shop 8.4 263
|
||||
2023-09-21 Museum Tickets 20 264
|
||||
2023-09-22 Public Transport 26.5 265
|
||||
2023-09-23 Online Shopping 107 266
|
||||
2023-09-24 Grocery Store 81.3 267
|
||||
2023-09-25 Water Bill 25.5 268
|
||||
2023-09-26 Zoo Tickets 33.5 269
|
||||
2023-09-27 Coffee Shop 8.6 270
|
||||
2023-09-28 Gas Station 37.5 271
|
||||
2023-09-29 Book Purchase 24.5 272
|
||||
2023-09-30 Grocery Store 82.7 273
|
||||
2023-10-01 Cinema Tickets 36 274
|
||||
2023-10-02 Theater Tickets 54 275
|
||||
2023-10-03 Gas Station 38 276
|
||||
2023-10-04 Restaurant Dinner 66.5 277
|
||||
2023-10-05 Online Shopping 109 278
|
||||
2023-10-06 Electric Bill 78 279
|
||||
2023-10-07 Grocery Store 83.9 280
|
||||
2023-10-08 Coffee Shop 8.8 281
|
||||
2023-10-09 Phone Bill 48 282
|
||||
2023-10-10 Public Transport 27.5 283
|
||||
2023-10-11 Cinema Tickets 37.5 284
|
||||
2023-10-12 Book Store 34.5 285
|
||||
2023-10-13 Gas Station 39.5 286
|
||||
2023-10-14 Coffee Shop 9 287
|
||||
2023-10-15 Park Picnic 46 288
|
||||
2023-10-16 Internet Bill 57.5 289
|
||||
2023-10-17 Grocery Store 85.2 290
|
||||
2023-10-18 Clothing Store 135 291
|
||||
2023-10-19 Pharmacy 22.5 292
|
||||
2023-10-20 Restaurant Lunch 43 293
|
||||
2023-10-21 Coffee Shop 9.2 294
|
||||
2023-10-22 Museum Tickets 21.5 295
|
||||
2023-10-23 Public Transport 28 296
|
||||
2023-10-24 Online Shopping 111 297
|
||||
2023-10-25 Grocery Store 86.5 298
|
||||
2023-10-26 Water Bill 26.5 299
|
||||
2023-10-27 Zoo Tickets 35 300
|
||||
2023-10-28 Coffee Shop 9.4 301
|
||||
2023-10-29 Gas Station 40.5 302
|
||||
2023-10-30 Book Purchase 26 303
|
||||
2023-10-31 Grocery Store 88 304
|
||||
|
@@ -0,0 +1 @@
|
||||
1861.55
|
||||
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"category": [
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 120,
|
||||
"dependencies": [
|
||||
"TestAnswerQuestionCsv",
|
||||
"TestCombineCsv"
|
||||
],
|
||||
"eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589",
|
||||
"ground": {
|
||||
"answer": "The correct amount spent on utilities.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"1861"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can answer a question from a csv",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "AnswerQuestionCombineCsv",
|
||||
"task": "How much was spent on utilities in total ? Write the answer in an output.txt file."
|
||||
}
|
||||
@@ -1,5 +0,0 @@
|
||||
id,name,timestamp
|
||||
3,Alice,2023-09-25 14:10:00
|
||||
1,Bob,2023-09-24 12:05:00
|
||||
2,Charlie,2023-09-24 12:10:00
|
||||
4,David,2023-09-26 16:20:00
|
||||
|
@@ -1,5 +0,0 @@
|
||||
id,name,timestamp
|
||||
1,Bob,2023-09-24 12:05:00
|
||||
2,Charlie,2023-09-24 12:10:00
|
||||
3,Alice,2023-09-25 14:10:00
|
||||
4,David,2023-09-26 16:20:00
|
||||
|
@@ -1,31 +0,0 @@
|
||||
{
|
||||
"category": [
|
||||
"data"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15",
|
||||
"ground": {
|
||||
"answer": "The csv sorted by date",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.csv"
|
||||
],
|
||||
"should_contain": [
|
||||
"id,name,timestamp\n1,Bob,2023-09-24 12:05:00\n2,Charlie,2023-09-24 12:10:00\n3,Alice,2023-09-25 14:10:00\n4,David,2023-09-26 16:20:00"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can sort a csv",
|
||||
"difficulty": "basic",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "SortCsv",
|
||||
"task": "Sort the input.csv by the 'timestamp' column and write the new csv in the output.csv file. The order of the columns should be preserved."
|
||||
}
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43",
|
||||
"eval_id": "0bb23182-b434-402b-a73e-9c226469b959",
|
||||
"ground": {
|
||||
"answer": "This is a Heading\nThis is a paragraph.",
|
||||
"eval": {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"dependencies": [
|
||||
"TestSearch"
|
||||
],
|
||||
"eval_id": "525001ed-8b45-4405-9e56-ce4423314294",
|
||||
"eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae",
|
||||
"ground": {
|
||||
"answer": "\u00a325.89",
|
||||
"eval": {
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
{
|
||||
"category": [
|
||||
"scrape_synthesize"
|
||||
"scrape_synthesize",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestBasicRetrieval"
|
||||
],
|
||||
"eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d",
|
||||
"eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
|
||||
"eval": {
|
||||
@@ -6,7 +6,7 @@
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval"
|
||||
],
|
||||
"eval_id": "a0a27778-aec1-4b37-8fc2-92feedffd3fb",
|
||||
"eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1",
|
||||
"ground": {
|
||||
"answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
|
||||
"eval": {
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
{
|
||||
"category": [
|
||||
"scrape_synthesize"
|
||||
"scrape_synthesize",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval2"
|
||||
],
|
||||
"eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05",
|
||||
"eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
|
||||
"ground": {
|
||||
"answer": "The twitter handles of the two hosts of Latent Space.",
|
||||
"eval": {
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
{
|
||||
"category": [
|
||||
"scrape_synthesize"
|
||||
"scrape_synthesize",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 240,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "76e4c56c-8d57-423e-9cc1-1fff5f58dee6",
|
||||
"eval_id": "895ae28a-4513-44ea-a872-0164771d1597",
|
||||
"ground": {
|
||||
"answer": "A report highlighting elements from the 2 files.",
|
||||
"eval": {
|
||||
|
||||
@@ -12,6 +12,12 @@
|
||||
"id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
|
||||
@@ -78,24 +84,42 @@
|
||||
"id": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]",
|
||||
@@ -117,7 +141,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1",
|
||||
"eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8",
|
||||
"ground": {
|
||||
"answer": "The content of output.txt should be 'Hello World!'",
|
||||
"eval": {
|
||||
@@ -155,7 +179,7 @@
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [],
|
||||
"eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0",
|
||||
"eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
|
||||
"ground": {
|
||||
"answer": "The word 'Washington', printed to a .txt file named anything",
|
||||
"eval": {
|
||||
@@ -187,13 +211,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 150,
|
||||
"dependencies": [
|
||||
"TestUrlShortener"
|
||||
],
|
||||
"eval_id": "54c3d7e9-71d6-476b-b045-cf0aaf118f95",
|
||||
"eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0",
|
||||
"ground": {
|
||||
"answer": "The correct python file for a TicTacToe game is written",
|
||||
"eval": {
|
||||
@@ -227,7 +252,7 @@
|
||||
"dependencies": [
|
||||
"TestThreeSum"
|
||||
],
|
||||
"eval_id": "0823b577-64f2-477b-856d-16726fe464b0",
|
||||
"eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f",
|
||||
"ground": {
|
||||
"answer": "password_generator.py is created and satisfies the requirements.",
|
||||
"eval": {
|
||||
@@ -255,13 +280,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 90,
|
||||
"dependencies": [
|
||||
"TestPasswordGenerator"
|
||||
],
|
||||
"eval_id": "6ace62be-6c18-431a-947f-72fb20984b58",
|
||||
"eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e",
|
||||
"ground": {
|
||||
"answer": "The correct python file is written and organizes the files accordingly",
|
||||
"eval": {
|
||||
@@ -289,13 +315,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "29a10990-2584-4602-8b9d-c217f6edbc4f",
|
||||
"eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4",
|
||||
"ground": {
|
||||
"answer": "The three_sum function coded properly.",
|
||||
"eval": {
|
||||
@@ -327,14 +354,15 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 90,
|
||||
"dependencies": [
|
||||
"TestTicTacToe",
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "5a32418d-1c3a-4af1-8dc4-8d4c29bed21a",
|
||||
"eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1",
|
||||
"ground": {
|
||||
"answer": "The implementation of battleship that passes all the tests.",
|
||||
"eval": {
|
||||
@@ -366,7 +394,7 @@
|
||||
"dependencies": [
|
||||
"TestFileOrganizer"
|
||||
],
|
||||
"eval_id": "41ca1035-ceca-4e0c-91ab-66ed0b350273",
|
||||
"eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c",
|
||||
"ground": {
|
||||
"answer": "The correct python file for a basic url shortener CLI",
|
||||
"eval": {
|
||||
@@ -401,7 +429,7 @@
|
||||
"dependencies": [
|
||||
"TestSearch"
|
||||
],
|
||||
"eval_id": "525001ed-8b45-4405-9e56-ce4423314294",
|
||||
"eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae",
|
||||
"ground": {
|
||||
"answer": "\u00a325.89",
|
||||
"eval": {
|
||||
@@ -431,13 +459,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"scrape_synthesize"
|
||||
"scrape_synthesize",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval2"
|
||||
],
|
||||
"eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05",
|
||||
"eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
|
||||
"ground": {
|
||||
"answer": "The twitter handles of the two hosts of Latent Space.",
|
||||
"eval": {
|
||||
@@ -476,7 +505,7 @@
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval"
|
||||
],
|
||||
"eval_id": "a0a27778-aec1-4b37-8fc2-92feedffd3fb",
|
||||
"eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1",
|
||||
"ground": {
|
||||
"answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
|
||||
"eval": {
|
||||
@@ -518,6 +547,43 @@
|
||||
"label": "RevenueRetrieval2",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"scrape_synthesize",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestBasicRetrieval"
|
||||
],
|
||||
"eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
".txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"81,462"
|
||||
],
|
||||
"should_not_contain": []
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can retrieve Tesla's revenue in 2022.",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": []
|
||||
},
|
||||
"name": "TestRevenueRetrieval",
|
||||
"task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)."
|
||||
},
|
||||
"id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]",
|
||||
"label": "RevenueRetrieval",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
@@ -529,7 +595,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43",
|
||||
"eval_id": "0bb23182-b434-402b-a73e-9c226469b959",
|
||||
"ground": {
|
||||
"answer": "This is a Heading\nThis is a paragraph.",
|
||||
"eval": {
|
||||
@@ -565,49 +631,90 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"scrape_synthesize"
|
||||
"data"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"cutoff": 90,
|
||||
"dependencies": [
|
||||
"TestBasicRetrieval"
|
||||
"TestAnswerQuestionSmallCsv"
|
||||
],
|
||||
"eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d",
|
||||
"eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
|
||||
"answer": "The correct amount spent on utilities.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
".txt"
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"81,462"
|
||||
],
|
||||
"should_not_contain": []
|
||||
"1861"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can retrieve Tesla's revenue in 2022.",
|
||||
"description": "Tests if the agent can answer a question from a csv",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": []
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "TestRevenueRetrieval",
|
||||
"task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)."
|
||||
"name": "TestAnswerQuestionCsv",
|
||||
"task": "How much was spent on utilities in total ? Write the answer in an output.txt file."
|
||||
},
|
||||
"id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]",
|
||||
"label": "RevenueRetrieval",
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]",
|
||||
"label": "AnswerQuestionCsv",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"data"
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 120,
|
||||
"dependencies": [
|
||||
"TestAnswerQuestionCsv",
|
||||
"TestCombineCsv"
|
||||
],
|
||||
"eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589",
|
||||
"ground": {
|
||||
"answer": "The correct amount spent on utilities.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"1861"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can answer a question from a csv",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "TestAnswerQuestionCombineCsv",
|
||||
"task": "How much was spent on utilities in total ? Write the answer in an output.txt file."
|
||||
},
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]",
|
||||
"label": "AnswerQuestionCombineCsv",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15",
|
||||
"eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95",
|
||||
"ground": {
|
||||
"answer": "The csv sorted by date",
|
||||
"eval": {
|
||||
@@ -638,13 +745,52 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"data"
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970",
|
||||
"ground": {
|
||||
"answer": "The correct amount spent on utilities.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"84"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can answer a question from a small csv",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "TestAnswerQuestionSmallCsv",
|
||||
"task": "How much was spent on utilities in total ? Write the answer in an output.txt file."
|
||||
},
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]",
|
||||
"label": "AnswerQuestionSmallCsv",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestLabelCsv"
|
||||
],
|
||||
"eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375",
|
||||
"eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b",
|
||||
"ground": {
|
||||
"answer": "The csv data is combined",
|
||||
"eval": {
|
||||
@@ -681,7 +827,7 @@
|
||||
"dependencies": [
|
||||
"TestSortCsv"
|
||||
],
|
||||
"eval_id": "6c58e229-aa22-4c4f-a053-4a78931ad41e",
|
||||
"eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac",
|
||||
"ground": {
|
||||
"answer": "The csv labelled",
|
||||
"eval": {
|
||||
@@ -712,13 +858,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"scrape_synthesize"
|
||||
"scrape_synthesize",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 240,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "76e4c56c-8d57-423e-9cc1-1fff5f58dee6",
|
||||
"eval_id": "895ae28a-4513-44ea-a872-0164771d1597",
|
||||
"ground": {
|
||||
"answer": "A report highlighting elements from the 2 files.",
|
||||
"eval": {
|
||||
|
||||
145
benchmark/poetry.lock
generated
145
benchmark/poetry.lock
generated
@@ -295,75 +295,63 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "cffi"
|
||||
version = "1.15.1"
|
||||
version = "1.16.0"
|
||||
description = "Foreign Function Interface for Python calling C code."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"},
|
||||
{file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"},
|
||||
{file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"},
|
||||
{file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"},
|
||||
{file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"},
|
||||
{file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"},
|
||||
{file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"},
|
||||
{file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"},
|
||||
{file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"},
|
||||
{file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"},
|
||||
{file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"},
|
||||
{file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"},
|
||||
{file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"},
|
||||
{file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"},
|
||||
{file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"},
|
||||
{file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"},
|
||||
{file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"},
|
||||
{file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"},
|
||||
{file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"},
|
||||
{file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -620,15 +608,19 @@ test-no-images = ["pytest", "pytest-cov", "wurlitzer"]
|
||||
|
||||
[[package]]
|
||||
name = "cycler"
|
||||
version = "0.11.0"
|
||||
version = "0.12.0"
|
||||
description = "Composable style cycles"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"},
|
||||
{file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"},
|
||||
{file = "cycler-0.12.0-py3-none-any.whl", hash = "sha256:7896994252d006771357777d0251f3e34d266f4fa5f2c572247a80ab01440947"},
|
||||
{file = "cycler-0.12.0.tar.gz", hash = "sha256:8cc3a7b4861f91b1095157f9916f748549a617046e67eb7619abed9b34d2c94a"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
docs = ["ipython", "matplotlib", "numpydoc", "sphinx"]
|
||||
tests = ["pytest", "pytest-cov", "pytest-xdist"]
|
||||
|
||||
[[package]]
|
||||
name = "decorator"
|
||||
version = "5.1.1"
|
||||
@@ -890,20 +882,19 @@ test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mypy", "pre-commit"
|
||||
|
||||
[[package]]
|
||||
name = "google-auth"
|
||||
version = "2.23.1"
|
||||
version = "2.23.2"
|
||||
description = "Google Authentication Library"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "google-auth-2.23.1.tar.gz", hash = "sha256:d38bdf4fa1e7c5a35e574861bce55784fd08afadb4e48f99f284f1e487ce702d"},
|
||||
{file = "google_auth-2.23.1-py2.py3-none-any.whl", hash = "sha256:9800802266366a2a87890fb2d04923fc0c0d4368af0b86db18edd94a62386ea1"},
|
||||
{file = "google-auth-2.23.2.tar.gz", hash = "sha256:5a9af4be520ba33651471a0264eead312521566f44631cbb621164bc30c8fd40"},
|
||||
{file = "google_auth-2.23.2-py2.py3-none-any.whl", hash = "sha256:c2e253347579d483004f17c3bd0bf92e611ef6c7ba24d41c5c59f2e7aeeaf088"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cachetools = ">=2.0.0,<6.0"
|
||||
pyasn1-modules = ">=0.2.1"
|
||||
rsa = ">=3.1.4,<5"
|
||||
urllib3 = ">=2.0.5"
|
||||
|
||||
[package.extras]
|
||||
aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "requests (>=2.20.0,<3.0.0.dev0)"]
|
||||
@@ -2765,13 +2756,13 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess
|
||||
|
||||
[[package]]
|
||||
name = "wcwidth"
|
||||
version = "0.2.6"
|
||||
version = "0.2.7"
|
||||
description = "Measures the displayed width of unicode strings in a terminal"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "wcwidth-0.2.6-py2.py3-none-any.whl", hash = "sha256:795b138f6875577cd91bba52baf9e445cd5118fd32723b460e30a0af30ea230e"},
|
||||
{file = "wcwidth-0.2.6.tar.gz", hash = "sha256:a5220780a404dbe3353789870978e472cfe477761f06ee55077256e509b156d0"},
|
||||
{file = "wcwidth-0.2.7-py2.py3-none-any.whl", hash = "sha256:fabf3e32999d9b0dab7d19d845149f326f04fe29bac67709ee071dbd92640a36"},
|
||||
{file = "wcwidth-0.2.7.tar.gz", hash = "sha256:1b6d30a98ddd5ce9bbdb33658191fd2423fc9da203fe3ef1855407dcb7ee4e26"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
@@ -12,14 +12,14 @@ import time
|
||||
"eval_id, input_text, expected_artifact_length, test_name, should_be_successful",
|
||||
[
|
||||
(
|
||||
"81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0",
|
||||
"f219f3d3-a41b-45a9-a3d0-389832086ee8",
|
||||
"Write the word 'Washington' to a .txt file",
|
||||
0,
|
||||
"WriteFile",
|
||||
True,
|
||||
),
|
||||
(
|
||||
"261ccfaa-02a2-4c1a-8a56-c76c66f7dba1",
|
||||
"021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
|
||||
"Read the file called file_to_read.txt and write its content to a file called output.txt",
|
||||
1,
|
||||
"ReadFile",
|
||||
|
||||
@@ -63,7 +63,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1",
|
||||
"eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8",
|
||||
"ground": {
|
||||
"answer": "The content of output.txt should be 'Hello World!'",
|
||||
"eval": {
|
||||
@@ -101,7 +101,7 @@
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [],
|
||||
"eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0",
|
||||
"eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
|
||||
"ground": {
|
||||
"answer": "The word 'Washington', printed to a .txt file named anything",
|
||||
"eval": {
|
||||
@@ -133,13 +133,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 150,
|
||||
"dependencies": [
|
||||
"TestUrlShortener"
|
||||
],
|
||||
"eval_id": "54c3d7e9-71d6-476b-b045-cf0aaf118f95",
|
||||
"eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0",
|
||||
"ground": {
|
||||
"answer": "The correct python file for a TicTacToe game is written",
|
||||
"eval": {
|
||||
@@ -173,7 +174,7 @@
|
||||
"dependencies": [
|
||||
"TestFileOrganizer"
|
||||
],
|
||||
"eval_id": "41ca1035-ceca-4e0c-91ab-66ed0b350273",
|
||||
"eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c",
|
||||
"ground": {
|
||||
"answer": "The correct python file for a basic url shortener CLI",
|
||||
"eval": {
|
||||
@@ -201,13 +202,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 90,
|
||||
"dependencies": [
|
||||
"TestPasswordGenerator"
|
||||
],
|
||||
"eval_id": "6ace62be-6c18-431a-947f-72fb20984b58",
|
||||
"eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e",
|
||||
"ground": {
|
||||
"answer": "The correct python file is written and organizes the files accordingly",
|
||||
"eval": {
|
||||
@@ -241,7 +243,7 @@
|
||||
"dependencies": [
|
||||
"TestThreeSum"
|
||||
],
|
||||
"eval_id": "0823b577-64f2-477b-856d-16726fe464b0",
|
||||
"eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f",
|
||||
"ground": {
|
||||
"answer": "password_generator.py is created and satisfies the requirements.",
|
||||
"eval": {
|
||||
@@ -269,13 +271,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "29a10990-2584-4602-8b9d-c217f6edbc4f",
|
||||
"eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4",
|
||||
"ground": {
|
||||
"answer": "The three_sum function coded properly.",
|
||||
"eval": {
|
||||
@@ -307,14 +310,15 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 90,
|
||||
"dependencies": [
|
||||
"TestTicTacToe",
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "5a32418d-1c3a-4af1-8dc4-8d4c29bed21a",
|
||||
"eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1",
|
||||
"ground": {
|
||||
"answer": "The implementation of battleship that passes all the tests.",
|
||||
"eval": {
|
||||
|
||||
@@ -6,11 +6,29 @@
|
||||
"id": "agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]"
|
||||
"id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
@@ -23,6 +41,12 @@
|
||||
"from": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]"
|
||||
}
|
||||
],
|
||||
"nodes": [
|
||||
@@ -39,7 +63,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1",
|
||||
"eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8",
|
||||
"ground": {
|
||||
"answer": "The content of output.txt should be 'Hello World!'",
|
||||
"eval": {
|
||||
@@ -77,7 +101,7 @@
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [],
|
||||
"eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0",
|
||||
"eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
|
||||
"ground": {
|
||||
"answer": "The word 'Washington', printed to a .txt file named anything",
|
||||
"eval": {
|
||||
@@ -111,48 +135,126 @@
|
||||
"category": [
|
||||
"data"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"cutoff": 90,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
"TestAnswerQuestionSmallCsv"
|
||||
],
|
||||
"eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15",
|
||||
"eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732",
|
||||
"ground": {
|
||||
"answer": "The csv sorted by date",
|
||||
"answer": "The correct amount spent on utilities.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.csv"
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"id,name,timestamp\n1,Bob,2023-09-24 12:05:00\n2,Charlie,2023-09-24 12:10:00\n3,Alice,2023-09-25 14:10:00\n4,David,2023-09-26 16:20:00"
|
||||
"1861"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can sort a csv",
|
||||
"difficulty": "basic",
|
||||
"description": "Tests if the agent can answer a question from a csv",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "TestSortCsv",
|
||||
"task": "Sort the input.csv by the 'timestamp' column and write the new csv in the output.csv file. The order of the columns should be preserved."
|
||||
"name": "TestAnswerQuestionCsv",
|
||||
"task": "How much was spent on utilities in total ? Write the answer in an output.txt file."
|
||||
},
|
||||
"id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]",
|
||||
"label": "SortCsv",
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]",
|
||||
"label": "AnswerQuestionCsv",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"data"
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970",
|
||||
"ground": {
|
||||
"answer": "The correct amount spent on utilities.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"84"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can answer a question from a small csv",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "TestAnswerQuestionSmallCsv",
|
||||
"task": "How much was spent on utilities in total ? Write the answer in an output.txt file."
|
||||
},
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]",
|
||||
"label": "AnswerQuestionSmallCsv",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 120,
|
||||
"dependencies": [
|
||||
"TestAnswerQuestionCsv",
|
||||
"TestCombineCsv"
|
||||
],
|
||||
"eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589",
|
||||
"ground": {
|
||||
"answer": "The correct amount spent on utilities.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"1861"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can answer a question from a csv",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "TestAnswerQuestionCombineCsv",
|
||||
"task": "How much was spent on utilities in total ? Write the answer in an output.txt file."
|
||||
},
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]",
|
||||
"label": "AnswerQuestionCombineCsv",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestLabelCsv"
|
||||
],
|
||||
"eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375",
|
||||
"eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b",
|
||||
"ground": {
|
||||
"answer": "The csv data is combined",
|
||||
"eval": {
|
||||
@@ -189,7 +291,7 @@
|
||||
"dependencies": [
|
||||
"TestSortCsv"
|
||||
],
|
||||
"eval_id": "6c58e229-aa22-4c4f-a053-4a78931ad41e",
|
||||
"eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac",
|
||||
"ground": {
|
||||
"answer": "The csv labelled",
|
||||
"eval": {
|
||||
@@ -215,6 +317,44 @@
|
||||
"id": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]",
|
||||
"label": "LabelCsv",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95",
|
||||
"ground": {
|
||||
"answer": "The csv sorted by date",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.csv"
|
||||
],
|
||||
"should_contain": [
|
||||
"id,name,timestamp\n1,Bob,2023-09-24 12:05:00\n2,Charlie,2023-09-24 12:10:00\n3,Alice,2023-09-25 14:10:00\n4,David,2023-09-26 16:20:00"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can sort a csv",
|
||||
"difficulty": "basic",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "TestSortCsv",
|
||||
"task": "Sort the input.csv by the 'timestamp' column and write the new csv in the output.csv file. The order of the columns should be preserved."
|
||||
},
|
||||
"id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]",
|
||||
"label": "SortCsv",
|
||||
"shape": "dot"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -57,7 +57,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1",
|
||||
"eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8",
|
||||
"ground": {
|
||||
"answer": "The content of output.txt should be 'Hello World!'",
|
||||
"eval": {
|
||||
@@ -95,7 +95,7 @@
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [],
|
||||
"eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0",
|
||||
"eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
|
||||
"ground": {
|
||||
"answer": "The word 'Washington', printed to a .txt file named anything",
|
||||
"eval": {
|
||||
@@ -134,7 +134,7 @@
|
||||
"dependencies": [
|
||||
"TestSearch"
|
||||
],
|
||||
"eval_id": "525001ed-8b45-4405-9e56-ce4423314294",
|
||||
"eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae",
|
||||
"ground": {
|
||||
"answer": "\u00a325.89",
|
||||
"eval": {
|
||||
@@ -171,7 +171,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43",
|
||||
"eval_id": "0bb23182-b434-402b-a73e-9c226469b959",
|
||||
"ground": {
|
||||
"answer": "This is a Heading\nThis is a paragraph.",
|
||||
"eval": {
|
||||
@@ -207,13 +207,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"scrape_synthesize"
|
||||
"scrape_synthesize",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval2"
|
||||
],
|
||||
"eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05",
|
||||
"eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
|
||||
"ground": {
|
||||
"answer": "The twitter handles of the two hosts of Latent Space.",
|
||||
"eval": {
|
||||
@@ -252,7 +253,7 @@
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval"
|
||||
],
|
||||
"eval_id": "a0a27778-aec1-4b37-8fc2-92feedffd3fb",
|
||||
"eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1",
|
||||
"ground": {
|
||||
"answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
|
||||
"eval": {
|
||||
@@ -298,13 +299,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"scrape_synthesize"
|
||||
"scrape_synthesize",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestBasicRetrieval"
|
||||
],
|
||||
"eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d",
|
||||
"eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
|
||||
"eval": {
|
||||
@@ -334,13 +336,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"scrape_synthesize"
|
||||
"scrape_synthesize",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 240,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "76e4c56c-8d57-423e-9cc1-1fff5f58dee6",
|
||||
"eval_id": "895ae28a-4513-44ea-a872-0164771d1597",
|
||||
"ground": {
|
||||
"answer": "A report highlighting elements from the 2 files.",
|
||||
"eval": {
|
||||
|
||||
@@ -12,6 +12,12 @@
|
||||
"id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
|
||||
@@ -78,24 +84,42 @@
|
||||
"id": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestTestGetInformation::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestRevenueRetrieval2::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestSortCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]",
|
||||
"id": "agbenchmark/generate_test.py::TestCombineCsv::test_method[challenge_data0]_to_agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]",
|
||||
"to": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]"
|
||||
},
|
||||
{
|
||||
"arrows": "to",
|
||||
"from": "agbenchmark/generate_test.py::TestLabelCsv::test_method[challenge_data0]",
|
||||
@@ -117,7 +141,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "261ccfaa-02a2-4c1a-8a56-c76c66f7dba1",
|
||||
"eval_id": "f219f3d3-a41b-45a9-a3d0-389832086ee8",
|
||||
"ground": {
|
||||
"answer": "The content of output.txt should be 'Hello World!'",
|
||||
"eval": {
|
||||
@@ -155,7 +179,7 @@
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [],
|
||||
"eval_id": "81b64bf9-2b6a-4ac8-bcd2-8bfe36244ac0",
|
||||
"eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
|
||||
"ground": {
|
||||
"answer": "The word 'Washington', printed to a .txt file named anything",
|
||||
"eval": {
|
||||
@@ -187,13 +211,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 150,
|
||||
"dependencies": [
|
||||
"TestUrlShortener"
|
||||
],
|
||||
"eval_id": "54c3d7e9-71d6-476b-b045-cf0aaf118f95",
|
||||
"eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0",
|
||||
"ground": {
|
||||
"answer": "The correct python file for a TicTacToe game is written",
|
||||
"eval": {
|
||||
@@ -227,7 +252,7 @@
|
||||
"dependencies": [
|
||||
"TestThreeSum"
|
||||
],
|
||||
"eval_id": "0823b577-64f2-477b-856d-16726fe464b0",
|
||||
"eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f",
|
||||
"ground": {
|
||||
"answer": "password_generator.py is created and satisfies the requirements.",
|
||||
"eval": {
|
||||
@@ -255,13 +280,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 90,
|
||||
"dependencies": [
|
||||
"TestPasswordGenerator"
|
||||
],
|
||||
"eval_id": "6ace62be-6c18-431a-947f-72fb20984b58",
|
||||
"eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e",
|
||||
"ground": {
|
||||
"answer": "The correct python file is written and organizes the files accordingly",
|
||||
"eval": {
|
||||
@@ -289,13 +315,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "29a10990-2584-4602-8b9d-c217f6edbc4f",
|
||||
"eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4",
|
||||
"ground": {
|
||||
"answer": "The three_sum function coded properly.",
|
||||
"eval": {
|
||||
@@ -327,14 +354,15 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"coding"
|
||||
"coding",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 90,
|
||||
"dependencies": [
|
||||
"TestTicTacToe",
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "5a32418d-1c3a-4af1-8dc4-8d4c29bed21a",
|
||||
"eval_id": "4d613d05-475f-4f72-bf12-f6d3714340c1",
|
||||
"ground": {
|
||||
"answer": "The implementation of battleship that passes all the tests.",
|
||||
"eval": {
|
||||
@@ -366,7 +394,7 @@
|
||||
"dependencies": [
|
||||
"TestFileOrganizer"
|
||||
],
|
||||
"eval_id": "41ca1035-ceca-4e0c-91ab-66ed0b350273",
|
||||
"eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c",
|
||||
"ground": {
|
||||
"answer": "The correct python file for a basic url shortener CLI",
|
||||
"eval": {
|
||||
@@ -401,7 +429,7 @@
|
||||
"dependencies": [
|
||||
"TestSearch"
|
||||
],
|
||||
"eval_id": "525001ed-8b45-4405-9e56-ce4423314294",
|
||||
"eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae",
|
||||
"ground": {
|
||||
"answer": "\u00a325.89",
|
||||
"eval": {
|
||||
@@ -431,13 +459,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"scrape_synthesize"
|
||||
"scrape_synthesize",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval2"
|
||||
],
|
||||
"eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05",
|
||||
"eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
|
||||
"ground": {
|
||||
"answer": "The twitter handles of the two hosts of Latent Space.",
|
||||
"eval": {
|
||||
@@ -476,7 +505,7 @@
|
||||
"dependencies": [
|
||||
"TestRevenueRetrieval"
|
||||
],
|
||||
"eval_id": "a0a27778-aec1-4b37-8fc2-92feedffd3fb",
|
||||
"eval_id": "552bdf23-db40-4bd1-b123-4ed820886cc1",
|
||||
"ground": {
|
||||
"answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
|
||||
"eval": {
|
||||
@@ -518,6 +547,43 @@
|
||||
"label": "RevenueRetrieval2",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"scrape_synthesize",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestBasicRetrieval"
|
||||
],
|
||||
"eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
".txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"81,462"
|
||||
],
|
||||
"should_not_contain": []
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can retrieve Tesla's revenue in 2022.",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": []
|
||||
},
|
||||
"name": "TestRevenueRetrieval",
|
||||
"task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)."
|
||||
},
|
||||
"id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]",
|
||||
"label": "RevenueRetrieval",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
@@ -529,7 +595,7 @@
|
||||
"dependencies": [
|
||||
"TestWriteFile"
|
||||
],
|
||||
"eval_id": "6390114a-531d-4743-a51b-50ba6ff8da43",
|
||||
"eval_id": "0bb23182-b434-402b-a73e-9c226469b959",
|
||||
"ground": {
|
||||
"answer": "This is a Heading\nThis is a paragraph.",
|
||||
"eval": {
|
||||
@@ -565,49 +631,90 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"scrape_synthesize"
|
||||
"data"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"cutoff": 90,
|
||||
"dependencies": [
|
||||
"TestBasicRetrieval"
|
||||
"TestAnswerQuestionSmallCsv"
|
||||
],
|
||||
"eval_id": "af95fa96-a5cb-42b0-98f3-715e6e7d0b5d",
|
||||
"eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732",
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
|
||||
"answer": "The correct amount spent on utilities.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
".txt"
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"81,462"
|
||||
],
|
||||
"should_not_contain": []
|
||||
"1861"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can retrieve Tesla's revenue in 2022.",
|
||||
"description": "Tests if the agent can answer a question from a csv",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": []
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "TestRevenueRetrieval",
|
||||
"task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)."
|
||||
"name": "TestAnswerQuestionCsv",
|
||||
"task": "How much was spent on utilities in total ? Write the answer in an output.txt file."
|
||||
},
|
||||
"id": "agbenchmark/generate_test.py::TestRevenueRetrieval::test_method[challenge_data0]",
|
||||
"label": "RevenueRetrieval",
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionCsv::test_method[challenge_data0]",
|
||||
"label": "AnswerQuestionCsv",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"data"
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 120,
|
||||
"dependencies": [
|
||||
"TestAnswerQuestionCsv",
|
||||
"TestCombineCsv"
|
||||
],
|
||||
"eval_id": "b1bb61cd-3d09-4a69-bb2a-9dbb3c477589",
|
||||
"ground": {
|
||||
"answer": "The correct amount spent on utilities.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"1861"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can answer a question from a csv",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "TestAnswerQuestionCombineCsv",
|
||||
"task": "How much was spent on utilities in total ? Write the answer in an output.txt file."
|
||||
},
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionCombineCsv::test_method[challenge_data0]",
|
||||
"label": "AnswerQuestionCombineCsv",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15",
|
||||
"eval_id": "d59ec964-6f67-4b3d-a4de-c4436fc76f95",
|
||||
"ground": {
|
||||
"answer": "The csv sorted by date",
|
||||
"eval": {
|
||||
@@ -638,13 +745,52 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"data"
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970",
|
||||
"ground": {
|
||||
"answer": "The correct amount spent on utilities.",
|
||||
"eval": {
|
||||
"type": "file"
|
||||
},
|
||||
"files": [
|
||||
"output.txt"
|
||||
],
|
||||
"should_contain": [
|
||||
"84"
|
||||
]
|
||||
},
|
||||
"info": {
|
||||
"description": "Tests if the agent can answer a question from a small csv",
|
||||
"difficulty": "intermediate",
|
||||
"side_effects": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"name": "TestAnswerQuestionSmallCsv",
|
||||
"task": "How much was spent on utilities in total ? Write the answer in an output.txt file."
|
||||
},
|
||||
"id": "agbenchmark/generate_test.py::TestAnswerQuestionSmallCsv::test_method[challenge_data0]",
|
||||
"label": "AnswerQuestionSmallCsv",
|
||||
"shape": "dot"
|
||||
},
|
||||
{
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"data",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 60,
|
||||
"dependencies": [
|
||||
"TestLabelCsv"
|
||||
],
|
||||
"eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375",
|
||||
"eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b",
|
||||
"ground": {
|
||||
"answer": "The csv data is combined",
|
||||
"eval": {
|
||||
@@ -681,7 +827,7 @@
|
||||
"dependencies": [
|
||||
"TestSortCsv"
|
||||
],
|
||||
"eval_id": "6c58e229-aa22-4c4f-a053-4a78931ad41e",
|
||||
"eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac",
|
||||
"ground": {
|
||||
"answer": "The csv labelled",
|
||||
"eval": {
|
||||
@@ -712,13 +858,14 @@
|
||||
"color": "grey",
|
||||
"data": {
|
||||
"category": [
|
||||
"scrape_synthesize"
|
||||
"scrape_synthesize",
|
||||
"general"
|
||||
],
|
||||
"cutoff": 240,
|
||||
"dependencies": [
|
||||
"TestReadFile"
|
||||
],
|
||||
"eval_id": "76e4c56c-8d57-423e-9cc1-1fff5f58dee6",
|
||||
"eval_id": "895ae28a-4513-44ea-a872-0164771d1597",
|
||||
"ground": {
|
||||
"answer": "A report highlighting elements from the 2 files.",
|
||||
"eval": {
|
||||
|
||||
Reference in New Issue
Block a user