From 12c5d545837b5256f34695820601f1797b489703 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 17 Jul 2023 22:41:58 -0400 Subject: [PATCH] Fixing memory challenges, naming, testing mini-agi, smooth retrieval scaling (#166) --- agbenchmark/challenge.py | 11 ++- .../{d1 => d1_debug}/artifacts_in/__init__.py | 0 .../{d1 => d1_debug}/artifacts_in/code.py | 0 .../{d1 => d1_debug}/artifacts_in/test.py | 0 .../artifacts_out/__init__.py | 0 .../{d1 => d1_debug}/artifacts_out/code.py | 0 .../{d1 => d1_debug}/artifacts_out/test.py | 0 .../code/{d1 => d1_debug}/data.json | 2 +- .../{d2 => d2_vague}/artifacts_in/__init__.py | 0 .../{d2 => d2_vague}/artifacts_in/code.py | 0 .../{d2 => d2_vague}/artifacts_in/test.py | 0 .../artifacts_out/__init__.py | 0 .../{d2 => d2_vague}/artifacts_out/code.py | 0 .../{d2 => d2_vague}/artifacts_out/test.py | 0 .../code/{d2 => d2_vague}/data.json | 2 +- .../artifacts_out/__init__.py | 0 .../{d4 => d3_two_sum}/artifacts_out/code.py | 0 .../{d4 => d3_two_sum}/custom_python/test.py | 0 .../code/{d4 => d3_two_sum}/data.json | 4 +- .../custom_python/api_tests.py | 0 .../code/{d3 => d4_web_server}/data.json | 2 +- .../artifacts_out/__init__.py | 0 .../artifacts_out/code.py | 0 .../custom_python/test.py | 0 .../code/{d5 => d5_three_sum}/data.json | 2 +- .../memory/m1/artifacts_in/instructions_5.txt | 1 - .../artifacts_in/instructions_1.txt | 0 .../artifacts_in/instructions_2.txt | 0 .../artifacts_in/instructions_3.txt | 0 .../artifacts_in/instructions_4.txt | 0 .../m1_id/artifacts_in/instructions_5.txt | 1 + .../artifacts_out/result.txt} | 0 .../challenges/memory/{m1 => m1_id}/data.json | 2 +- .../memory/m2/artifacts_in/instructions_5.txt | 1 - .../artifacts_in/instructions_1.txt | 0 .../artifacts_in/instructions_2.txt | 0 .../artifacts_in/instructions_3.txt | 0 .../artifacts_in/instructions_4.txt | 0 .../artifacts_in/instructions_5.txt | 1 + .../artifacts_out/result.txt} | 0 .../memory/{m2 => m2_multiple}/data.json | 2 +- 
.../artifacts_in/instructions_1.txt | 0 .../artifacts_in/instructions_2.txt | 0 .../artifacts_in/instructions_3.txt | 0 .../artifacts_in/instructions_4.txt | 0 .../artifacts_in/instructions_5.txt | 2 +- .../artifacts_out/result.txt} | 0 .../memory/{m3 => m3_noise}/data.json | 6 +- .../artifacts_in/instructions_1.txt | 0 .../artifacts_in/instructions_2.txt | 0 .../artifacts_in/instructions_3.txt | 2 +- .../artifacts_in/instructions_4.txt | 0 .../artifacts_in/instructions_5.txt | 2 +- .../artifacts_out/result.txt} | 2 +- .../memory/{m4 => m4_phrases}/data.json | 8 +- .../artifacts_out/random_file.txt | 0 .../retrieval/{r1 => r1_book_price}/data.json | 2 +- .../artifacts_out/random_file.txt | 0 .../retrieval/r2.1_specific/data.json | 19 ++++ .../artifacts_out/random_file.txt | 1 + .../retrieval/r2.2_formatting/data.json | 19 ++++ agbenchmark/challenges/retrieval/r2/data.json | 19 ---- .../artifacts_out/random_file.txt | 1 + .../retrieval/r2_tesla_revenue/data.json | 19 ++++ agbenchmark/challenges/retrieval/r3/data.json | 2 +- agbenchmark/conftest.py | 9 +- agbenchmark/reports/internal_info.json | 93 +++++++++++-------- .../reports/mini-agi/1.1_TestWriteFile.json | 57 +++++------- .../10.1_TestRememberMultipleWithNoise.json | 30 ++++++ .../10_TestRememberMultipleWithNoise.json | 31 +++++++ ..._TestRememberMultiplePhrasesWithNoise.json | 31 +++++++ ..._TestRememberMultiplePhrasesWithNoise.json | 31 +++++++ ..._TestRememberMultiplePhrasesWithNoise.json | 31 +++++++ ..._TestRememberMultiplePhrasesWithNoise.json | 31 +++++++ ..._TestRememberMultiplePhrasesWithNoise.json | 31 +++++++ ..._TestRememberMultiplePhrasesWithNoise.json | 31 +++++++ .../12.1_TestDebugSimpleTypoWithGuidance.json | 28 ++++++ .../12.2_TestDebugSimpleTypoWithGuidance.json | 28 ++++++ .../12.3_TestDebugSimpleTypoWithGuidance.json | 28 ++++++ .../12_TestDebugSimpleTypoWithGuidance.json | 31 +++++++ .../reports/mini-agi/1_TestWriteFIle.json | 4 +- .../reports/mini-agi/2.1_TestReadFile.json | 4 +- 
.../reports/mini-agi/2_TestReadFile.json | 4 +- .../reports/mini-agi/3.1_TestSearch.json | 27 ++++++ .../reports/mini-agi/3_TestSearch.json | 4 +- .../mini-agi/4.1_TestBasicRetrieval.json | 27 ++++++ .../4.1_TestDebugSimpleTypoWithGuidance.json | 28 ------ .../mini-agi/4_TestBasicRetrieval.json | 27 ++++++ .../4_TestDebugSimpleTypoWithGuidance.json | 28 ------ .../mini-agi/5.1_TestRetrieval2.0.json | 30 ++++++ .../reports/mini-agi/5_TestRetrieval2.0.json | 29 ++++++ .../mini-agi/6.1_TestRetrieval2.1.json | 30 ++++++ .../mini-agi/6.2_TestRetrieval2.1.json | 30 ++++++ .../mini-agi/6.3_TestRetrieval2.1.json | 30 ++++++ .../mini-agi/6.4_TestRetrieval2.1.json | 31 +++++++ .../reports/mini-agi/6_TestRetrieval2.1.json | 30 ++++++ .../mini-agi/7.1_TestRetrieval2.2.json | 31 +++++++ .../reports/mini-agi/7_TestRetrieval2.2.json | 30 ++++++ .../reports/mini-agi/8.1_TestBasicMemory.json | 30 ++++++ .../reports/mini-agi/8_TestBasicMemory.json | 31 +++++++ .../mini-agi/9.1_TestRememberMultipleIds.json | 30 ++++++ .../mini-agi/9_TestRememberMultipleIds.json | 31 +++++++ agbenchmark/utils.py | 5 +- agent/mini-agi | 2 +- 104 files changed, 1022 insertions(+), 187 deletions(-) rename agbenchmark/challenges/code/{d1 => d1_debug}/artifacts_in/__init__.py (100%) rename agbenchmark/challenges/code/{d1 => d1_debug}/artifacts_in/code.py (100%) rename agbenchmark/challenges/code/{d1 => d1_debug}/artifacts_in/test.py (100%) rename agbenchmark/challenges/code/{d1 => d1_debug}/artifacts_out/__init__.py (100%) rename agbenchmark/challenges/code/{d1 => d1_debug}/artifacts_out/code.py (100%) rename agbenchmark/challenges/code/{d1 => d1_debug}/artifacts_out/test.py (100%) rename agbenchmark/challenges/code/{d1 => d1_debug}/data.json (97%) rename agbenchmark/challenges/code/{d2 => d2_vague}/artifacts_in/__init__.py (100%) rename agbenchmark/challenges/code/{d2 => d2_vague}/artifacts_in/code.py (100%) rename agbenchmark/challenges/code/{d2 => d2_vague}/artifacts_in/test.py (100%) rename 
agbenchmark/challenges/code/{d2 => d2_vague}/artifacts_out/__init__.py (100%) rename agbenchmark/challenges/code/{d2 => d2_vague}/artifacts_out/code.py (100%) rename agbenchmark/challenges/code/{d2 => d2_vague}/artifacts_out/test.py (100%) rename agbenchmark/challenges/code/{d2 => d2_vague}/data.json (97%) rename agbenchmark/challenges/code/{d4 => d3_two_sum}/artifacts_out/__init__.py (100%) rename agbenchmark/challenges/code/{d4 => d3_two_sum}/artifacts_out/code.py (100%) rename agbenchmark/challenges/code/{d4 => d3_two_sum}/custom_python/test.py (100%) rename agbenchmark/challenges/code/{d4 => d3_two_sum}/data.json (94%) rename agbenchmark/challenges/code/{d3 => d4_web_server}/custom_python/api_tests.py (100%) rename agbenchmark/challenges/code/{d3 => d4_web_server}/data.json (97%) rename agbenchmark/challenges/code/{d5 => d5_three_sum}/artifacts_out/__init__.py (100%) rename agbenchmark/challenges/code/{d5 => d5_three_sum}/artifacts_out/code.py (100%) rename agbenchmark/challenges/code/{d5 => d5_three_sum}/custom_python/test.py (100%) rename agbenchmark/challenges/code/{d5 => d5_three_sum}/data.json (96%) delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt rename agbenchmark/challenges/memory/{m1 => m1_id}/artifacts_in/instructions_1.txt (100%) rename agbenchmark/challenges/memory/{m1 => m1_id}/artifacts_in/instructions_2.txt (100%) rename agbenchmark/challenges/memory/{m1 => m1_id}/artifacts_in/instructions_3.txt (100%) rename agbenchmark/challenges/memory/{m1 => m1_id}/artifacts_in/instructions_4.txt (100%) create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt rename agbenchmark/challenges/memory/{m1/artifacts_out/random_file.txt => m1_id/artifacts_out/result.txt} (100%) rename agbenchmark/challenges/memory/{m1 => m1_id}/data.json (95%) delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt rename agbenchmark/challenges/memory/{m2 => 
m2_multiple}/artifacts_in/instructions_1.txt (100%) rename agbenchmark/challenges/memory/{m2 => m2_multiple}/artifacts_in/instructions_2.txt (100%) rename agbenchmark/challenges/memory/{m2 => m2_multiple}/artifacts_in/instructions_3.txt (100%) rename agbenchmark/challenges/memory/{m2 => m2_multiple}/artifacts_in/instructions_4.txt (100%) create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt rename agbenchmark/challenges/memory/{m2/artifacts_out/random_file.txt => m2_multiple/artifacts_out/result.txt} (100%) rename agbenchmark/challenges/memory/{m2 => m2_multiple}/data.json (95%) rename agbenchmark/challenges/memory/{m3 => m3_noise}/artifacts_in/instructions_1.txt (100%) rename agbenchmark/challenges/memory/{m3 => m3_noise}/artifacts_in/instructions_2.txt (100%) rename agbenchmark/challenges/memory/{m3 => m3_noise}/artifacts_in/instructions_3.txt (100%) rename agbenchmark/challenges/memory/{m3 => m3_noise}/artifacts_in/instructions_4.txt (100%) rename agbenchmark/challenges/memory/{m3 => m3_noise}/artifacts_in/instructions_5.txt (94%) rename agbenchmark/challenges/memory/{m3/artifacts_out/random_file.txt => m3_noise/artifacts_out/result.txt} (100%) rename agbenchmark/challenges/memory/{m3 => m3_noise}/data.json (88%) rename agbenchmark/challenges/memory/{m4 => m4_phrases}/artifacts_in/instructions_1.txt (100%) rename agbenchmark/challenges/memory/{m4 => m4_phrases}/artifacts_in/instructions_2.txt (100%) rename agbenchmark/challenges/memory/{m4 => m4_phrases}/artifacts_in/instructions_3.txt (89%) rename agbenchmark/challenges/memory/{m4 => m4_phrases}/artifacts_in/instructions_4.txt (100%) rename agbenchmark/challenges/memory/{m4 => m4_phrases}/artifacts_in/instructions_5.txt (93%) rename agbenchmark/challenges/memory/{m4/artifacts_out/random_file.txt => m4_phrases/artifacts_out/result.txt} (77%) rename agbenchmark/challenges/memory/{m4 => m4_phrases}/data.json (87%) rename agbenchmark/challenges/retrieval/{r1 => 
r1_book_price}/artifacts_out/random_file.txt (100%) rename agbenchmark/challenges/retrieval/{r1 => r1_book_price}/data.json (86%) rename agbenchmark/challenges/retrieval/{r2 => r2.1_specific}/artifacts_out/random_file.txt (100%) create mode 100644 agbenchmark/challenges/retrieval/r2.1_specific/data.json create mode 100644 agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r2.2_formatting/data.json delete mode 100644 agbenchmark/challenges/retrieval/r2/data.json create mode 100644 agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json create mode 100644 agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/3.1_TestSearch.json create mode 100644 agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json delete mode 100644 
agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json delete mode 100644 agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json create mode 100644 agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json create mode 100644 agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json create mode 100644 agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json create mode 100644 agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json create mode 100644 agbenchmark/reports/mini-agi/8_TestBasicMemory.json create mode 100644 agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json create mode 100644 agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index 4f24bb60..cdaebed4 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -103,22 +103,25 @@ class Challenge(ABC): ] def scoring(self, content: str, ground: Ground) -> float: + print("Scoring content: ", content) if ground.should_contain: for should_contain_word in ground.should_contain: if should_contain_word not in content: + print(f"Word that should exist - {should_contain_word}: False") return 0.0 else: - print( - f"Word that should exist: {should_contain_word} exists in the content" - ) + print(f"Word that should exist - {should_contain_word}: True") if ground.should_not_contain: for should_not_contain_word in ground.should_not_contain: if should_not_contain_word in content: + print( + f"Word that 
should not exist - {should_not_contain_word}: False" + ) return 0.0 else: print( - f"Word that should not exist: {should_not_contain_word} does not exist in the content" + f"Word that should not exist - {should_not_contain_word}: True" ) return 1.0 diff --git a/agbenchmark/challenges/code/d1/artifacts_in/__init__.py b/agbenchmark/challenges/code/d1_debug/artifacts_in/__init__.py similarity index 100% rename from agbenchmark/challenges/code/d1/artifacts_in/__init__.py rename to agbenchmark/challenges/code/d1_debug/artifacts_in/__init__.py diff --git a/agbenchmark/challenges/code/d1/artifacts_in/code.py b/agbenchmark/challenges/code/d1_debug/artifacts_in/code.py similarity index 100% rename from agbenchmark/challenges/code/d1/artifacts_in/code.py rename to agbenchmark/challenges/code/d1_debug/artifacts_in/code.py diff --git a/agbenchmark/challenges/code/d1/artifacts_in/test.py b/agbenchmark/challenges/code/d1_debug/artifacts_in/test.py similarity index 100% rename from agbenchmark/challenges/code/d1/artifacts_in/test.py rename to agbenchmark/challenges/code/d1_debug/artifacts_in/test.py diff --git a/agbenchmark/challenges/code/d1/artifacts_out/__init__.py b/agbenchmark/challenges/code/d1_debug/artifacts_out/__init__.py similarity index 100% rename from agbenchmark/challenges/code/d1/artifacts_out/__init__.py rename to agbenchmark/challenges/code/d1_debug/artifacts_out/__init__.py diff --git a/agbenchmark/challenges/code/d1/artifacts_out/code.py b/agbenchmark/challenges/code/d1_debug/artifacts_out/code.py similarity index 100% rename from agbenchmark/challenges/code/d1/artifacts_out/code.py rename to agbenchmark/challenges/code/d1_debug/artifacts_out/code.py diff --git a/agbenchmark/challenges/code/d1/artifacts_out/test.py b/agbenchmark/challenges/code/d1_debug/artifacts_out/test.py similarity index 100% rename from agbenchmark/challenges/code/d1/artifacts_out/test.py rename to agbenchmark/challenges/code/d1_debug/artifacts_out/test.py diff --git 
a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1_debug/data.json similarity index 97% rename from agbenchmark/challenges/code/d1/data.json rename to agbenchmark/challenges/code/d1_debug/data.json index d8e0280a..4e2798a3 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1_debug/data.json @@ -3,7 +3,7 @@ "category": ["code", "iterate"], "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", "dependencies": ["TestReadFile", "TestWriteFile"], - "cutoff": 60, + "cutoff": 75, "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], diff --git a/agbenchmark/challenges/code/d2/artifacts_in/__init__.py b/agbenchmark/challenges/code/d2_vague/artifacts_in/__init__.py similarity index 100% rename from agbenchmark/challenges/code/d2/artifacts_in/__init__.py rename to agbenchmark/challenges/code/d2_vague/artifacts_in/__init__.py diff --git a/agbenchmark/challenges/code/d2/artifacts_in/code.py b/agbenchmark/challenges/code/d2_vague/artifacts_in/code.py similarity index 100% rename from agbenchmark/challenges/code/d2/artifacts_in/code.py rename to agbenchmark/challenges/code/d2_vague/artifacts_in/code.py diff --git a/agbenchmark/challenges/code/d2/artifacts_in/test.py b/agbenchmark/challenges/code/d2_vague/artifacts_in/test.py similarity index 100% rename from agbenchmark/challenges/code/d2/artifacts_in/test.py rename to agbenchmark/challenges/code/d2_vague/artifacts_in/test.py diff --git a/agbenchmark/challenges/code/d2/artifacts_out/__init__.py b/agbenchmark/challenges/code/d2_vague/artifacts_out/__init__.py similarity index 100% rename from agbenchmark/challenges/code/d2/artifacts_out/__init__.py rename to agbenchmark/challenges/code/d2_vague/artifacts_out/__init__.py diff --git a/agbenchmark/challenges/code/d2/artifacts_out/code.py b/agbenchmark/challenges/code/d2_vague/artifacts_out/code.py similarity 
index 100% rename from agbenchmark/challenges/code/d2/artifacts_out/code.py rename to agbenchmark/challenges/code/d2_vague/artifacts_out/code.py diff --git a/agbenchmark/challenges/code/d2/artifacts_out/test.py b/agbenchmark/challenges/code/d2_vague/artifacts_out/test.py similarity index 100% rename from agbenchmark/challenges/code/d2/artifacts_out/test.py rename to agbenchmark/challenges/code/d2_vague/artifacts_out/test.py diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2_vague/data.json similarity index 97% rename from agbenchmark/challenges/code/d2/data.json rename to agbenchmark/challenges/code/d2_vague/data.json index de32ef9a..2b6c3526 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2_vague/data.json @@ -3,7 +3,7 @@ "category": ["code", "iterate"], "task": "Make test.py run without errors.", "dependencies": ["TestDebugSimpleTypoWithGuidance"], - "cutoff": 60, + "cutoff": 75, "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], diff --git a/agbenchmark/challenges/code/d4/artifacts_out/__init__.py b/agbenchmark/challenges/code/d3_two_sum/artifacts_out/__init__.py similarity index 100% rename from agbenchmark/challenges/code/d4/artifacts_out/__init__.py rename to agbenchmark/challenges/code/d3_two_sum/artifacts_out/__init__.py diff --git a/agbenchmark/challenges/code/d4/artifacts_out/code.py b/agbenchmark/challenges/code/d3_two_sum/artifacts_out/code.py similarity index 100% rename from agbenchmark/challenges/code/d4/artifacts_out/code.py rename to agbenchmark/challenges/code/d3_two_sum/artifacts_out/code.py diff --git a/agbenchmark/challenges/code/d4/custom_python/test.py b/agbenchmark/challenges/code/d3_two_sum/custom_python/test.py similarity index 100% rename from agbenchmark/challenges/code/d4/custom_python/test.py rename to agbenchmark/challenges/code/d3_two_sum/custom_python/test.py diff --git a/agbenchmark/challenges/code/d4/data.json 
b/agbenchmark/challenges/code/d3_two_sum/data.json similarity index 94% rename from agbenchmark/challenges/code/d4/data.json rename to agbenchmark/challenges/code/d3_two_sum/data.json index e8db918d..6df083d4 100644 --- a/agbenchmark/challenges/code/d4/data.json +++ b/agbenchmark/challenges/code/d3_two_sum/data.json @@ -3,7 +3,7 @@ "category": ["code"], "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", "dependencies": ["TestWriteFile"], - "cutoff": 60, + "cutoff": 90, "ground": { "answer": "The two_sum function coded properly.", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], @@ -12,7 +12,7 @@ "type": "execute_python_code" }, "info": { - "difficulty": "novice", + "difficulty": "advanced", "description": "Tests ability for the agent to create the two_sum function.", "side_effects": [] } diff --git a/agbenchmark/challenges/code/d3/custom_python/api_tests.py b/agbenchmark/challenges/code/d4_web_server/custom_python/api_tests.py similarity index 100% rename from agbenchmark/challenges/code/d3/custom_python/api_tests.py rename to agbenchmark/challenges/code/d4_web_server/custom_python/api_tests.py diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d4_web_server/data.json similarity index 97% rename from agbenchmark/challenges/code/d3/data.json rename to agbenchmark/challenges/code/d4_web_server/data.json index c5d111a4..5c936e88 100644 --- a/agbenchmark/challenges/code/d3/data.json +++ b/agbenchmark/challenges/code/d4_web_server/data.json @@ -3,7 +3,7 @@ "category": ["code"], "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. 
Deploy this web server locally at the port 8079. ", "dependencies": ["TestDebugSimpleTypoWithGuidance"], - "cutoff": 60, + "cutoff": 90, "ground": { "answer": "GET localhost:8079/health responds with a 200 OK", "should_contain": [], diff --git a/agbenchmark/challenges/code/d5/artifacts_out/__init__.py b/agbenchmark/challenges/code/d5_three_sum/artifacts_out/__init__.py similarity index 100% rename from agbenchmark/challenges/code/d5/artifacts_out/__init__.py rename to agbenchmark/challenges/code/d5_three_sum/artifacts_out/__init__.py diff --git a/agbenchmark/challenges/code/d5/artifacts_out/code.py b/agbenchmark/challenges/code/d5_three_sum/artifacts_out/code.py similarity index 100% rename from agbenchmark/challenges/code/d5/artifacts_out/code.py rename to agbenchmark/challenges/code/d5_three_sum/artifacts_out/code.py diff --git a/agbenchmark/challenges/code/d5/custom_python/test.py b/agbenchmark/challenges/code/d5_three_sum/custom_python/test.py similarity index 100% rename from agbenchmark/challenges/code/d5/custom_python/test.py rename to agbenchmark/challenges/code/d5_three_sum/custom_python/test.py diff --git a/agbenchmark/challenges/code/d5/data.json b/agbenchmark/challenges/code/d5_three_sum/data.json similarity index 96% rename from agbenchmark/challenges/code/d5/data.json rename to agbenchmark/challenges/code/d5_three_sum/data.json index 434b1312..4df03b45 100644 --- a/agbenchmark/challenges/code/d5/data.json +++ b/agbenchmark/challenges/code/d5_three_sum/data.json @@ -12,7 +12,7 @@ "type": "execute_python_code" }, "info": { - "difficulty": "intermediate", + "difficulty": "advanced", "description": "Tests ability for the agent to create the three_sum function.", "side_effects": [] } diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt deleted file mode 100644 index 8a2d3550..00000000 --- a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt +++ 
/dev/null @@ -1 +0,0 @@ -Write the id previously mentioned to a .txt file. diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_1.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt rename to agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_1.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_2.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt rename to agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_3.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt rename to agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_4.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt rename to agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt new file mode 100644 index 00000000..73246e04 --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt @@ -0,0 +1 @@ +Write the id previously mentioned to a file called result.txt. 
diff --git a/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m1_id/artifacts_out/result.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt rename to agbenchmark/challenges/memory/m1_id/artifacts_out/result.txt diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1_id/data.json similarity index 95% rename from agbenchmark/challenges/memory/m1/data.json rename to agbenchmark/challenges/memory/m1_id/data.json index 33c52324..186d2580 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1_id/data.json @@ -8,7 +8,7 @@ "answer": "2314", "should_contain": ["2314"], "should_not_contain": [], - "files": [".txt"], + "files": ["result.txt"], "type": "file" }, "info": { diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt deleted file mode 100644 index 2f60afa1..00000000 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt +++ /dev/null @@ -1 +0,0 @@ -Write the ids previously mentioned to a .txt file. 
diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_1.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt rename to agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_1.txt diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_2.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt rename to agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_2.txt diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_3.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt rename to agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_3.txt diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_4.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt rename to agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_4.txt diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt new file mode 100644 index 00000000..30ac4102 --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt @@ -0,0 +1 @@ +Write the ids previously mentioned to a file called result.txt. 
diff --git a/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_out/result.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt rename to agbenchmark/challenges/memory/m2_multiple/artifacts_out/result.txt diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2_multiple/data.json similarity index 95% rename from agbenchmark/challenges/memory/m2/data.json rename to agbenchmark/challenges/memory/m2_multiple/data.json index 840e8dc8..aac25b21 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2_multiple/data.json @@ -8,7 +8,7 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": [".txt"], + "files": ["result.txt"], "type": "file" }, "info": { diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_1.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt rename to agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_1.txt diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_2.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt rename to agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_2.txt diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_3.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt rename to agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_3.txt diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt 
b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_4.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt rename to agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_4.txt diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt similarity index 94% rename from agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt rename to agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt index 99c9efa3..aa379ea5 100644 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt @@ -1,5 +1,5 @@ sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 -Write the ids previously mentioned to a .txt file. +Write the ids previously mentioned to a file called result.txt. 
gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_out/result.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt rename to agbenchmark/challenges/memory/m3_noise/artifacts_out/result.txt diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3_noise/data.json similarity index 88% rename from agbenchmark/challenges/memory/m3/data.json rename to agbenchmark/challenges/memory/m3_noise/data.json index 3af2fb3a..8234e25c 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3_noise/data.json @@ -1,14 +1,14 @@ { - "name": "TestRememberMultipleIdsWithNoise", + "name": "TestRememberMultipleWithNoise", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIds"], - "cutoff": 60, + "cutoff": 75, "ground": { "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": [".txt"], + "files": ["result.txt"], "type": "file" }, "info": { diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_1.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt rename to 
agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_1.txt diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_2.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt rename to agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_2.txt diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt similarity index 89% rename from agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt rename to agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt index d7f6f08f..dc444c2a 100644 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt @@ -1,5 +1,5 @@ ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx -The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. Read the file instructions_4.txt. +The phrase to remember is 'My pet rock sings better than Beyonce on Tuesdays'. Read the file instructions_4.txt. 
8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_4.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt rename to agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_4.txt diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt similarity index 93% rename from agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt rename to agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt index b4ef60ba..2092b42a 100644 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt @@ -1,5 +1,5 @@ sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 -Write the phrases previously mentioned to a .txt file. +Write the phrases previously mentioned to a file called result.txt. 
gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt similarity index 77% rename from agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt rename to agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt index 9b8405bf..bdab23d9 100644 --- a/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt @@ -1,4 +1,4 @@ The purple elephant danced on a rainbow while eating a taco The sneaky toaster stole my socks and ran away to Hawaii -My pet rock sings better than Beyoncé on Tuesdays +My pet rock sings better than Beyonce on Tuesdays The giant hamster rode a unicycle through the crowded mall diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4_phrases/data.json similarity index 87% rename from agbenchmark/challenges/memory/m4/data.json rename to agbenchmark/challenges/memory/m4_phrases/data.json index 17a078e1..da17fb02 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4_phrases/data.json @@ -2,18 +2,18 @@ "name": "TestRememberMultiplePhrasesWithNoise", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": ["TestRememberMultipleIdsWithNoise"], - "cutoff": 60, + "dependencies": ["TestRememberMultipleWithNoise"], + "cutoff": 90, "ground": { "answer": "The purple 
elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", "should_contain": [ "The purple elephant danced on a rainbow while eating a taco", "The sneaky toaster stole my socks and ran away to Hawaii", - "My pet rock sings better than Beyoncé on Tuesdays", + "My pet rock sings better than Beyonce on Tuesdays", "The giant hamster rode a unicycle through the crowded mall" ], "should_not_contain": [], - "files": [".txt"], + "files": ["result.txt"], "type": "file" }, "info": { diff --git a/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r1_book_price/artifacts_out/random_file.txt similarity index 100% rename from agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt rename to agbenchmark/challenges/retrieval/r1_book_price/artifacts_out/random_file.txt diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1_book_price/data.json similarity index 86% rename from agbenchmark/challenges/retrieval/r1/data.json rename to agbenchmark/challenges/retrieval/r1_book_price/data.json index c3af4862..9a6924da 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1_book_price/data.json @@ -13,7 +13,7 @@ }, "info": { "difficulty": "basic", - "description": "Tests ability to retrieve information from a website.", + "description": "Specifies a specific website to retrieve information from.", "side_effects": [] } } diff --git a/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2.1_specific/artifacts_out/random_file.txt similarity index 100% rename from agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt rename to agbenchmark/challenges/retrieval/r2.1_specific/artifacts_out/random_file.txt diff --git 
a/agbenchmark/challenges/retrieval/r2.1_specific/data.json b/agbenchmark/challenges/retrieval/r2.1_specific/data.json new file mode 100644 index 00000000..3fcd5080 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2.1_specific/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRetrieval2.1", + "category": ["retrieval"], + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "dependencies": ["TestRetrieval2.0"], + "cutoff": 60, + "ground": { + "answer": "It was $81.462 billion in 2022.", + "should_contain": ["81", "462"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "novice", + "description": "Checks the accuracy of the retrieved information, improving on r2.0", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt new file mode 100644 index 00000000..8a0eae04 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt @@ -0,0 +1 @@ +81,462 Millions diff --git a/agbenchmark/challenges/retrieval/r2.2_formatting/data.json b/agbenchmark/challenges/retrieval/r2.2_formatting/data.json new file mode 100644 index 00000000..294effef --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2.2_formatting/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRetrieval2.2", + "category": ["retrieval"], + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "dependencies": ["TestRetrieval2.1"], + "cutoff": 60, + "ground": { + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "should_contain": ["81,462"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "intermediate", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json deleted file mode 100644 index f558b858..00000000 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestRetrieval2", - "category": ["retrieval"], - "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": ["TestBasicRetrieval"], - "cutoff": 60, - "ground": { - "answer": "81,462", - "should_contain": ["81,462"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "novice", - "description": "Tests ability to retrieve information.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt new file mode 100644 index 00000000..8a0eae04 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt @@ -0,0 +1 @@ +81,462 Millions diff --git a/agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json b/agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json new file mode 100644 index 00000000..63665ed6 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRetrieval2.0", + "category": ["retrieval"], + "task": "Write tesla's revenue in 2022 into a .txt file.", + "dependencies": ["TestBasicRetrieval"], + "cutoff": 60, + "ground": { + "answer": "It was $81.462 
billion in 2022.", + "should_contain": ["81"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "novice", + "description": "A no guardrails search for info", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index eb998ffb..d3e4bd0f 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -2,7 +2,7 @@ "name": "TestRetrieval3", "category": ["retrieval"], "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": ["TestRetrieval2"], + "dependencies": ["TestRetrieval2.1"], "cutoff": 60, "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 4a62af0b..b544d2c6 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -135,8 +135,8 @@ internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json")) def pytest_runtest_makereport(item: Any, call: Any) -> None: + challenge_data = item.funcargs.get("challenge_data", None) if call.when == "call": - challenge_data = item.funcargs.get("challenge_data", None) difficulty = ( challenge_data["info"]["difficulty"] if challenge_data else "unknown" ) @@ -157,6 +157,9 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: info_details: Any = { "data_path": challenge_location, "is_regression": False, + "task": challenge_data["task"], + "answer": challenge_data["ground"]["answer"], + "description": challenge_data["info"]["description"], "metrics": { "difficulty": difficulty, "success": False, @@ 
-218,6 +221,10 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: "run_time" ] = f"{str(round(run_time, 3))} seconds" + info_details["reached_cutoff"] = ( + float(run_time) > challenge_data["cutoff"] + ) + info_manager.add_test(test_name, info_details) diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json index 0bfad744..d20e8c77 100644 --- a/agbenchmark/reports/internal_info.json +++ b/agbenchmark/reports/internal_info.json @@ -3,28 +3,20 @@ "TestBasicMemory": [ true, true, + true, + true, + true, + false, + false, true ], "TestBasicRetrieval": [ + true, + true, true, true, true ], - "TestCreateSimpleWebServer": [ - false, - false, - false - ], - "TestDebugSimpleTypoWithGuidance": [ - false, - false, - false - ], - "TestDebugSimpleTypoWithoutGuidance": [ - false, - false, - false - ], "TestReadFile": [ true, true, @@ -32,41 +24,62 @@ true, true ], - "TestRememberMultipleIds": [ - true, - true, - true - ], - "TestRememberMultipleIdsWithNoise": [ - true, - true, - true - ], - "TestRememberMultiplePhrasesWithNoise": [ - true, - true, - true - ], - "TestRetrieval2": [ - true, - true, - true - ], - "TestRetrieval3": [ - true, - true, - true - ], "TestSearch": [ true, true, true, + true, true ], "TestWriteFile": [ + true, true, true, true + ], + "TestRetrieval2.2": [ + false, + false, + false, + false + ], + "TestRetrieval2.1": [ + false, + false, + false, + false, + false, + false + ], + "TestRetrieval2.0": [ + true, + false + ], + "TestRememberMultipleIds": [ + false, + false, + true + ], + "TestRememberMultipleIdsWithNoise": [ + false + ], + "TestRememberMultipleWithNoise": [ + false, + true + ], + "TestRememberMultiplePhrasesWithNoise": [ + false, + false, + false, + false, + false, + false + ], + "TestDebugSimpleTypoWithGuidance": [ + false, + false, + false, + false ] } } \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json 
b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json index 637c2d5c..41905231 100644 --- a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json +++ b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json @@ -1,36 +1,27 @@ { + "command": "agbenchmark start --test TestWriteFile", + "completion_time": "2023-07-17-13:34", + "metrics": { + "run_time": "23.83 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": true, - "metrics": { - "difficulty": "interface", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.009 seconds" - } - }, - "additional": { - "model": "gpt-3.5-turbo" - }, - "command": "agbenchmark start --test TestWriteFile", - "completion_time": "2023-07-17-09:54", - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}" - }, - "metrics": { - "run_time": "22.36 seconds", - "highest_difficulty": "interface: 1" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "success_%": 40.0, - "run_time": "22.169 seconds" - } - } + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": true, + "reached_cutoff": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "23.627 seconds" + } } -} \ No newline at end of file + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json b/agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json new file mode 100644 index 00000000..811fd3e8 --- /dev/null +++ b/agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRememberMultipleWithNoise", + 
"completion_time": "2023-07-17-21:24", + "metrics": { + "run_time": "77.71 seconds", + "highest_difficulty": "intermediate: 4" + }, + "tests": { + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": true, + "success_%": 50.0, + "run_time": "77.397 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json b/agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json new file mode 100644 index 00000000..08c2b707 --- /dev/null +++ b/agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultipleWithNoise", + "completion_time": "2023-07-17-21:19", + "metrics": { + "run_time": "74.3 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "74.059 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 00000000..0de6f003 --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:28", + "metrics": { + "run_time": "60.86 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "60.631 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 00000000..1d2abb8e --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:32", + "metrics": { + "run_time": "73.04 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "72.736 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 00000000..1d256b8c --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:34", + "metrics": { + "run_time": "81.59 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "81.374 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 00000000..e67a6ac3 --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:36", + "metrics": { + "run_time": "98.32 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "98.021 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 00000000..9e76704d --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:42", + "metrics": { + "run_time": "303.13 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "302.919 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 00000000..e98ca330 --- /dev/null +++ b/agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:27", + "metrics": { + "run_time": "77.72 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "77.491 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 00000000..9c9f3dc2 --- /dev/null +++ b/agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-17-21:46", + "metrics": { + "run_time": "87.21 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "86.967 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 00000000..4765201f --- /dev/null +++ b/agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": 
"agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-17-21:47", + "metrics": { + "run_time": "48.52 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "48.208 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 00000000..ac2592f3 --- /dev/null +++ b/agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-17-21:55", + "metrics": { + "run_time": "54.95 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "54.741 seconds" + }, + "reached_cutoff": false + } + }, + "config": { 
+ "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 00000000..e84c6e9a --- /dev/null +++ b/agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-17-21:44", + "metrics": { + "run_time": "63.37 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "63.125 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json index e6478319..6ac7d104 100644 --- a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json +++ b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json @@ -9,6 +9,7 @@ "TestWriteFile": { "data_path": "agbenchmark/challenges/interface/write_file", "is_regression": false, + "reached_cutoff": false, "metrics": { "difficulty": "interface", "success": true, @@ -18,8 +19,7 @@ } }, "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" + "workspace": "${os.path.join(Path.home(), 'miniagi')}" }, "additional": { "model": "gpt-4" diff 
--git a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json index b5d73af9..4758addf 100644 --- a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json +++ b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json @@ -9,6 +9,7 @@ "TestReadFile": { "data_path": "agbenchmark/challenges/interface/read_file", "is_regression": true, + "reached_cutoff": true, "metrics": { "difficulty": "interface", "success": true, @@ -21,7 +22,6 @@ "workspace": "${os.path.join(Path.home(), 'miniagi')}" }, "additional": { - "model": "gpt-4", - "reached_termination_time": true + "model": "gpt-3.5-turbo" } } diff --git a/agbenchmark/reports/mini-agi/2_TestReadFile.json b/agbenchmark/reports/mini-agi/2_TestReadFile.json index 869eaaac..87c7956d 100644 --- a/agbenchmark/reports/mini-agi/2_TestReadFile.json +++ b/agbenchmark/reports/mini-agi/2_TestReadFile.json @@ -9,6 +9,7 @@ "TestReadFile": { "data_path": "agbenchmark/challenges/interface/read_file", "is_regression": true, + "reached_cutoff": false, "metrics": { "difficulty": "interface", "success": true, @@ -18,8 +19,7 @@ } }, "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" + "workspace": "${os.path.join(Path.home(), 'miniagi')}" }, "additional": { "model": "gpt-4" diff --git a/agbenchmark/reports/mini-agi/3.1_TestSearch.json b/agbenchmark/reports/mini-agi/3.1_TestSearch.json new file mode 100644 index 00000000..6a2744e7 --- /dev/null +++ b/agbenchmark/reports/mini-agi/3.1_TestSearch.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestSearch", + "completion_time": "2023-07-17-13:35", + "metrics": { + "run_time": "20.58 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "reached_cutoff": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "20.367 
seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/3_TestSearch.json b/agbenchmark/reports/mini-agi/3_TestSearch.json index d9d05db4..c7d6c430 100644 --- a/agbenchmark/reports/mini-agi/3_TestSearch.json +++ b/agbenchmark/reports/mini-agi/3_TestSearch.json @@ -9,6 +9,7 @@ "TestSearch": { "data_path": "agbenchmark/challenges/interface/search", "is_regression": true, + "reached_cutoff": false, "metrics": { "difficulty": "interface", "success": true, @@ -18,8 +19,7 @@ } }, "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" + "workspace": "${os.path.join(Path.home(), 'miniagi')}" }, "additional": { "model": "gpt-4" diff --git a/agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json b/agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json new file mode 100644 index 00000000..6ff0fa63 --- /dev/null +++ b/agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestBasicRetrieval", + "completion_time": "2023-07-17-13:31", + "metrics": { + "run_time": "26.05 seconds", + "highest_difficulty": "basic: 2" + }, + "tests": { + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": true, + "reached_cutoff": false, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "25.818 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json deleted file mode 100644 index d72d599d..00000000 --- a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "command": 
"agbenchmark start --test TestDebugSimpleTypoWithGuidance", - "completion_time": "2023-07-15-22:16", - "metrics": { - "run_time": "45.92 seconds", - "highest_difficulty": ": 0" - }, - "tests": { - "TestDebugSimpleTypoWithGuidance": { - "data_path": "agbenchmark/challenges/code/d1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "success_%": 0.0, - "run_time": "45.599 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" - }, - "additional": { - "model": "gpt-4" - } -} diff --git a/agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json b/agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json new file mode 100644 index 00000000..54c4fdcc --- /dev/null +++ b/agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestBasicRetrieval", + "completion_time": "2023-07-17-13:22", + "metrics": { + "run_time": "61.24 seconds", + "highest_difficulty": "basic: 2" + }, + "tests": { + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": true, + "reached_cutoff": true, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "60.872 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json deleted file mode 100644 index 7985a784..00000000 --- a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", - "completion_time": "2023-07-15-22:15", - "metrics": { - "run_time": "32.99 seconds", - "highest_difficulty": ": 0" - }, - "tests": { 
- "TestDebugSimpleTypoWithGuidance": { - "data_path": "agbenchmark/challenges/code/d1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "success_%": 0.0, - "run_time": "32.582 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" - }, - "additional": { - "model": "gpt-4" - } -} diff --git a/agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json b/agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json new file mode 100644 index 00000000..4149ebe7 --- /dev/null +++ b/agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.0", + "completion_time": "2023-07-17-17:10", + "metrics": { + "run_time": "66.81 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "reached_cutoff": true, + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "66.547 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json b/agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json new file mode 100644 index 00000000..28d091d2 --- /dev/null +++ b/agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json @@ -0,0 +1,29 @@ +{ + "command": "agbenchmark start --test TestRetrieval2", + "completion_time": "2023-07-17-13:54", + "metrics": { + "run_time": "36 seconds", + "highest_difficulty": "TestRetrieval2: 3" + }, + "tests": { + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", 
+ "is_regression": false, + "reached_cutoff": false, + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 50.0, + "run_time": "35.59 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json new file mode 100644 index 00000000..ed3ede1d --- /dev/null +++ b/agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-17:27", + "metrics": { + "run_time": "64.44 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "reached_cutoff": true, + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0, 0.0]", + "success_%": 0.0, + "run_time": "64.216 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json new file mode 100644 index 00000000..04f97232 --- /dev/null +++ b/agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-17:53", + "metrics": { + "run_time": "30.08 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + 
"is_regression": false, + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "29.711 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json new file mode 100644 index 00000000..38377434 --- /dev/null +++ b/agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-17:54", + "metrics": { + "run_time": "27.49 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "27.266 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json new file mode 100644 index 00000000..71cd9e00 --- /dev/null +++ b/agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-17:56", + "metrics": { + "run_time": "23.64 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + 
"data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "23.42 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json new file mode 100644 index 00000000..1dceec03 --- /dev/null +++ b/agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-14:03", + "metrics": { + "run_time": "68.39 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "reached_cutoff": true, + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "68.15 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json b/agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json new file mode 100644 index 00000000..99373f7f --- /dev/null +++ b/agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.2", + "completion_time": "2023-07-17-17:57", + "metrics": { + 
"run_time": "31.1 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "30.888 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json b/agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json new file mode 100644 index 00000000..ccdca26b --- /dev/null +++ b/agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.2", + "completion_time": "2023-07-17-14:04", + "metrics": { + "run_time": "28.08 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "reached_cutoff": false, + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "27.857 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json b/agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json new file mode 100644 index 00000000..66cc2f9a --- /dev/null +++ b/agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestBasicMemory", + "completion_time": "2023-07-17-18:22", + "metrics": { + "run_time": "53.48 seconds", + "highest_difficulty": "basic: 2" + }, + "tests": { + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 75.0, + "run_time": "53.252 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/8_TestBasicMemory.json b/agbenchmark/reports/mini-agi/8_TestBasicMemory.json new file mode 100644 index 00000000..7ce53550 --- /dev/null +++ b/agbenchmark/reports/mini-agi/8_TestBasicMemory.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestBasicMemory", + "completion_time": "2023-07-17-18:18", + "metrics": { + "run_time": "62.11 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\results.txt'", + "success_%": 83.33, + "run_time": "61.879 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json b/agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json new file mode 100644 index 00000000..462e7390 --- /dev/null +++ b/agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRememberMultipleIds", + "completion_time": "2023-07-17-20:22", + "metrics": { + "run_time": "57.6 seconds", + "highest_difficulty": "novice: 3" + }, + "tests": { + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 33.33, + "run_time": "57.355 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json b/agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json new file mode 100644 index 00000000..aa726196 --- /dev/null +++ b/agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultipleIds", + "completion_time": "2023-07-17-18:33", + "metrics": { + "run_time": "61.32 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "61.089 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index f1ed4363..63f4d836 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -55,7 +55,6 @@ def calculate_info_test_path(reports_path: Path) -> str: all_prefix_numbers.append(math.floor(number)) max_prefix = max(all_prefix_numbers, default=0) - print("HEY WE ARE HERE BIG DAWG", max_prefix) run_name = f"{max_prefix + 1}_{test_arg}.json" else: # Take the number from before the _ and add the .{number} @@ -118,7 +117,9 @@ def get_highest_success_difficulty(data: dict) -> str: else: highest_difficulty_str = "" - return f"{highest_difficulty_str}: {highest_difficulty_level}" + if highest_difficulty_level: + return f"{highest_difficulty_str}: {highest_difficulty_level}" + return "No successful tests" def assign_paths(folder_path: Path) -> tuple[str, str, str]: diff --git a/agent/mini-agi b/agent/mini-agi index 0a9fcd8c..4a346ab7 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 0a9fcd8c3d6352ef42d436cff7b64683a7a7ca2d +Subproject commit 4a346ab7cb8dbcfd3bf2cee49448d26e01406ba3