From 9ede17891bb4a322d51ec2bf1cc9e60e93db0acd Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 13:50:53 -0700 Subject: [PATCH] Add 'Debug simple typo with guidance' challenge (#65) Signed-off-by: Merwane Hamadi --- agbenchmark/agent_interface.py | 31 ++++++++-- agbenchmark/challenge.py | 62 +++++++++++-------- agbenchmark/challenges/README.md | 3 +- agbenchmark/challenges/code/code.py | 8 +++ .../artifacts_in/__init__.py} | 0 .../challenges/code/d1/artifacts_in/code.py | 13 ++++ .../challenges/code/d1/artifacts_in/test.py | 31 ++++++++++ .../code/d1/artifacts_out/__init__.py | 0 .../challenges/code/d1/artifacts_out/code.py | 12 ++++ .../challenges/code/d1/artifacts_out/test.py | 31 ++++++++++ .../debug_simple_typo_with_guidance_data.json | 22 +++++++ .../debug_simple_typo_with_guidance_test.py | 31 ++++++++++ agbenchmark/challenges/define_task_types.py | 3 +- .../instructions_1.txt | 0 .../instructions_2.txt | 0 .../instructions_3.txt | 0 .../instructions_4.txt | 0 .../instructions_5.txt | 0 agbenchmark/challenges/memory/m1/m1_data.json | 3 +- agbenchmark/challenges/memory/m1/m1_test.py | 4 +- .../instructions_1.txt | 0 .../instructions_2.txt | 0 .../instructions_3.txt | 0 .../instructions_4.txt | 0 .../instructions_5.txt | 0 .../memory/m2/remember_multiple_ids_data.json | 3 +- .../memory/m2/remember_multiple_ids_test.py | 4 +- .../instructions_1.txt | 0 .../instructions_2.txt | 0 .../instructions_3.txt | 0 .../instructions_4.txt | 0 .../instructions_5.txt | 0 ...remember_multiple_ids_with_noise_data.json | 3 +- .../remember_multiple_ids_with_noise_test.py | 4 +- .../instructions_1.txt | 0 .../instructions_2.txt | 0 .../instructions_3.txt | 0 .../instructions_4.txt | 0 .../instructions_5.txt | 0 ...mber_multiple_phrases_with_noise_data.json | 3 +- ...member_multiple_phrases_with_noise_test.py | 4 +- .../challenges/retrieval/r1/r1_data.json | 3 +- .../challenges/retrieval/r1/r1_test.py | 4 +- .../challenges/retrieval/r2/r2_data.json | 3 +- 
.../challenges/retrieval/r2/r2_test.py | 4 +- .../challenges/retrieval/r3/r3_data.json | 3 +- .../challenges/retrieval/r3/r3_test.py | 4 +- agbenchmark/mocks/tests/basic_mocks.py | 12 ---- .../file_to_check.txt | 0 .../read_file/artifacts_out/file_to_check.txt | 1 + .../read_file/r_file_data.json | 7 ++- .../read_file/read_file_test.py | 4 +- .../write_file/w_file_data.json | 3 +- .../write_file/write_file_test.py | 4 +- pyproject.toml | 1 + regression_tests.json | 45 ++++++++------ 56 files changed, 288 insertions(+), 85 deletions(-) create mode 100644 agbenchmark/challenges/code/code.py rename agbenchmark/challenges/code/{c1_test.py => d1/artifacts_in/__init__.py} (100%) create mode 100644 agbenchmark/challenges/code/d1/artifacts_in/code.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_in/test.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_out/test.py create mode 100644 agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json create mode 100644 agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py rename agbenchmark/challenges/memory/m1/{artifacts => artifacts_in}/instructions_1.txt (100%) rename agbenchmark/challenges/memory/m1/{artifacts => artifacts_in}/instructions_2.txt (100%) rename agbenchmark/challenges/memory/m1/{artifacts => artifacts_in}/instructions_3.txt (100%) rename agbenchmark/challenges/memory/m1/{artifacts => artifacts_in}/instructions_4.txt (100%) rename agbenchmark/challenges/memory/m1/{artifacts => artifacts_in}/instructions_5.txt (100%) rename agbenchmark/challenges/memory/m2/{artifacts => artifacts_in}/instructions_1.txt (100%) rename agbenchmark/challenges/memory/m2/{artifacts => artifacts_in}/instructions_2.txt (100%) rename agbenchmark/challenges/memory/m2/{artifacts => artifacts_in}/instructions_3.txt (100%) rename 
agbenchmark/challenges/memory/m2/{artifacts => artifacts_in}/instructions_4.txt (100%) rename agbenchmark/challenges/memory/m2/{artifacts => artifacts_in}/instructions_5.txt (100%) rename agbenchmark/challenges/memory/m3/{artifacts => artifacts_in}/instructions_1.txt (100%) rename agbenchmark/challenges/memory/m3/{artifacts => artifacts_in}/instructions_2.txt (100%) rename agbenchmark/challenges/memory/m3/{artifacts => artifacts_in}/instructions_3.txt (100%) rename agbenchmark/challenges/memory/m3/{artifacts => artifacts_in}/instructions_4.txt (100%) rename agbenchmark/challenges/memory/m3/{artifacts => artifacts_in}/instructions_5.txt (100%) rename agbenchmark/challenges/memory/m4/{artifacts => artifacts_in}/instructions_1.txt (100%) rename agbenchmark/challenges/memory/m4/{artifacts => artifacts_in}/instructions_2.txt (100%) rename agbenchmark/challenges/memory/m4/{artifacts => artifacts_in}/instructions_3.txt (100%) rename agbenchmark/challenges/memory/m4/{artifacts => artifacts_in}/instructions_4.txt (100%) rename agbenchmark/challenges/memory/m4/{artifacts => artifacts_in}/instructions_5.txt (100%) rename agbenchmark/tests/basic_abilities/read_file/{artifacts => artifacts_in}/file_to_check.txt (100%) create mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 8e9e5a14..05540f6d 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -1,4 +1,5 @@ import os +import shutil import subprocess import sys import time @@ -14,13 +15,20 @@ MOCK_FLAG = os.getenv("MOCK_TEST") def run_agent( - task: Optional[str], mock_func: Optional[str], config: Dict[str, Any] + task: Optional[str], + mock_func: Optional[str], + config: Dict[str, Any], + challenge_location: str, ) -> None: """Calling to get a response""" - if mock_func == None and MOCK_FLAG == "True": - print("No mock provided") - elif MOCK_FLAG == "True": + if MOCK_FLAG == 
"True": + copy_artifacts_into_workspace( + config["workspace"], "artifacts_out", challenge_location + ) + if mock_func is None: + print("No mock provided") + return + mock_manager = MockManager( task, config ) # workspace doesn't need to be passed in, stays the same @@ -77,4 +85,19 @@ def run_agent( process.wait() +def copy_artifacts_into_workspace( + workspace: str, artifact_folder_name: str, challenge_dir_path: str +) -> None: + source_dir = os.path.join(challenge_dir_path, artifact_folder_name) + + # Check if source_dir exists, if not then return immediately. + if not os.path.exists(source_dir): + return + + for file_name in os.listdir(source_dir): + full_file_name = os.path.join(source_dir, file_name) + if os.path.isfile(full_file_name): + shutil.copy(full_file_name, workspace) + + ENVIRONMENT = os.getenv("ENVIRONMENT") or "production" diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index dee2b435..4c8e6984 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -1,9 +1,10 @@ import glob import inspect import os -import shutil -from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional +import subprocess +import types +from abc import ABC, ABCMeta, abstractmethod +from typing import Any, Dict, List, Optional, Tuple, Type, cast import pytest from dotenv import load_dotenv @@ -16,7 +17,20 @@ mock_test_str = os.getenv("MOCK_TEST") MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False -class Challenge(ABC): +class ChallengeMeta(ABCMeta): + def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None: + + super().__init__(name, bases, dct) + try: + frame = cast(types.FrameType, inspect.currentframe()) + assert frame.f_back is not None + self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(frame.f_back)) + except Exception as e: + print(f"Unable to get the file from the caller's frame due to: {str(e)}") + raise e + + +class Challenge(ABC, metaclass=ChallengeMeta): + """The
parent class to all specific challenges classes. Defines helper methods for running a challenge""" @@ -52,11 +66,13 @@ class Challenge(ABC): return self.data.dependencies def setup_challenge(self, config: Dict[str, Any]) -> None: - from agbenchmark.agent_interface import run_agent + from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent - self.copy_artifacts_into_workspace(config["workspace"]) + copy_artifacts_into_workspace( + config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION + ) - run_agent(self.task, self.mock, config) + run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION) @property def name(self) -> str: @@ -77,8 +93,7 @@ class Challenge(ABC): with open(workspace_dir, "r") as f: return f.read() - @staticmethod - def open_files(workspace: str, file_patterns: list) -> List[str]: + def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]: script_dir = os.path.abspath(workspace) files_contents = [] @@ -92,8 +107,17 @@ class Challenge(ABC): matching_files = [os.path.join(script_dir, file_pattern)] for file_path in matching_files: - with open(file_path, "r") as f: - files_contents.append(f.read()) + if self.data.ground.type == "execute_python_code": + result = subprocess.run( + ["python3", file_path], + cwd=os.path.abspath(workspace), + capture_output=True, + text=True, + ) + files_contents.append(result.stdout) + else: + with open(file_path, "r") as f: + files_contents.append(f.read()) return files_contents @@ -135,19 +159,3 @@ class Challenge(ABC): ) return 1.0 - - def copy_artifacts_into_workspace(self, workspace: str) -> None: - curr_frame = inspect.currentframe() - outer_frame = inspect.getouterframes(curr_frame)[2] - caller_file_path = outer_frame.filename - caller_dir_path = os.path.dirname(os.path.abspath(caller_file_path)) - source_dir = os.path.join(caller_dir_path, "artifacts") - - # Check if source_dir exists, if not then return immediately. 
- if not os.path.exists(source_dir): - return - - for file_name in os.listdir(source_dir): - full_file_name = os.path.join(source_dir, file_name) - if os.path.isfile(full_file_name): - shutil.copy(full_file_name, workspace) diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 9e74d19c..2d782d1f 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -33,7 +33,8 @@ Example: "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_write_file_mock", diff --git a/agbenchmark/challenges/code/code.py b/agbenchmark/challenges/code/code.py new file mode 100644 index 00000000..508d24a9 --- /dev/null +++ b/agbenchmark/challenges/code/code.py @@ -0,0 +1,8 @@ +import pytest + +from agbenchmark.challenge import Challenge + + +@pytest.mark.code +class CodeChallenge(Challenge): + """Challenge for code""" diff --git a/agbenchmark/challenges/code/c1_test.py b/agbenchmark/challenges/code/d1/artifacts_in/__init__.py similarity index 100% rename from agbenchmark/challenges/code/c1_test.py rename to agbenchmark/challenges/code/d1/artifacts_in/__init__.py diff --git a/agbenchmark/challenges/code/d1/artifacts_in/code.py b/agbenchmark/challenges/code/d1/artifacts_in/code.py new file mode 100644 index 00000000..df8120bf --- /dev/null +++ b/agbenchmark/challenges/code/d1/artifacts_in/code.py @@ -0,0 +1,13 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + typo + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d1/artifacts_in/test.py b/agbenchmark/challenges/code/d1/artifacts_in/test.py new file mode 100644 index 00000000..d85d1353 ---
/dev/null +++ b/agbenchmark/challenges/code/d1/artifacts_in/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d1/artifacts_out/__init__.py b/agbenchmark/challenges/code/d1/artifacts_out/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/agbenchmark/challenges/code/d1/artifacts_out/code.py b/agbenchmark/challenges/code/d1/artifacts_out/code.py new file mode 100644 index 00000000..de3d8c62 --- /dev/null +++ b/agbenchmark/challenges/code/d1/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d1/artifacts_out/test.py b/agbenchmark/challenges/code/d1/artifacts_out/test.py new file mode 100644 index 00000000..d85d1353 --- /dev/null +++ b/agbenchmark/challenges/code/d1/artifacts_out/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def 
test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json new file mode 100644 index 00000000..ce9d9298 --- /dev/null +++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json @@ -0,0 +1,22 @@ +{ + "name": "debug_simple_typo_with_guidance", + "category": ["code"], + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "dependencies": [], + "ground": { + "answer": "2314", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "mock": { + "mock_func": null, + "mock_task": null + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py new file mode 100644 index 00000000..e5f50c70 --- /dev/null +++ 
b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py @@ -0,0 +1,31 @@ +import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.code.code import CodeChallenge + + +class TestDebugSimpleTypoWithGuidance(CodeChallenge): + """The first code challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join( + os.path.dirname(__file__), "debug_simple_typo_with_guidance_data.json" + ) + + @pytest.mark.depends(name="test_debug_simple_typo_with_guidance") + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 52df3017..f84df126 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -5,7 +5,7 @@ from pydantic import BaseModel class Mock(BaseModel): - mock_func: str + mock_func: Optional[str] = None mock_task: Optional[str] = None @@ -20,6 +20,7 @@ class Ground(BaseModel): should_contain: Optional[List[str]] = None should_not_contain: Optional[List[str]] = None files: List[str] + type: str class ChallengeData(BaseModel): diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt rename to agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt similarity index 100% rename from
agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt rename to agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt rename to agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt rename to agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt rename to agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1/m1_data.json b/agbenchmark/challenges/memory/m1/m1_data.json index 7023f85f..3e410ac5 100644 --- a/agbenchmark/challenges/memory/m1/m1_data.json +++ b/agbenchmark/challenges/memory/m1/m1_data.json @@ -7,7 +7,8 @@ "answer": "2314", "should_contain": ["2314"], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "basic_memory_mock", diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py index 28e600cc..c1f37024 100644 --- a/agbenchmark/challenges/memory/m1/m1_test.py +++ b/agbenchmark/challenges/memory/m1/m1_test.py @@ -16,7 +16,9 @@ class TestBasicMemory(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + 
files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt rename to agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt rename to agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt rename to agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt rename to agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt rename to agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json index 374df616..29d7339b 100644 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json +++ 
b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json @@ -7,7 +7,8 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "remember_multiple_ids_mock", diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py index d5f0cf1a..f0f2b397 100644 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py @@ -20,7 +20,9 @@ class TestRememberMultipleIds(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt rename to agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt rename to agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt rename to agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt diff --git 
a/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt rename to agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt rename to agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json index 95c93ccb..6b53c457 100644 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json +++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json @@ -7,7 +7,8 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "remember_multiple_ids_mock", diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py index 4d2d6495..493ea357 100644 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py +++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py @@ -21,7 +21,9 @@ class TestRememberMultipleIdsWithNoise(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git 
a/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt rename to agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt rename to agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt rename to agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt rename to agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt rename to agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json index e1ecb16f..316ef947 100644 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json @@ -12,7 +12,8 @@ "The giant hamster rode a unicycle through the 
crowded mall" ], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "remember_multiple_phrases_with_noise_mock", diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py index fd33da1c..e37e9a38 100644 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py @@ -21,7 +21,9 @@ class TestRememberMultiplePhrasesWithNoise(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 44fce24e..8fca01b7 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -7,7 +7,8 @@ "answer": "£25.89", "should_contain": ["25.89"], "should_not_contain": [], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_retrieval_mock", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index d107d964..285b8aff 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -16,7 +16,9 @@ class TestRetrieval(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content 
in files_contents: diff --git a/agbenchmark/challenges/retrieval/r2/r2_data.json b/agbenchmark/challenges/retrieval/r2/r2_data.json index 925e6db8..3c388f19 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_data.json +++ b/agbenchmark/challenges/retrieval/r2/r2_data.json @@ -7,7 +7,8 @@ "answer": "81,462", "should_contain": ["81,462"], "should_not_contain": [], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_retrieval_2_mock", diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index a60296ec..ba727b8e 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -16,7 +16,9 @@ class TestRetrieval2(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r3/r3_data.json b/agbenchmark/challenges/retrieval/r3/r3_data.json index 183529c4..41545615 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_data.json +++ b/agbenchmark/challenges/retrieval/r3/r3_data.json @@ -7,7 +7,8 @@ "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"], "should_not_contain": [], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_retrieval_3_mock", diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py 
b/agbenchmark/challenges/retrieval/r3/r3_test.py index bcd48d33..b58f4267 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -16,7 +16,9 @@ class TestRetrieval3(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 37ded0ae..32149eb8 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -1,18 +1,6 @@ from agbenchmark.challenge import Challenge -def basic_read_file_mock(task: str, workspace: str) -> None: - """ - This mock reads a file and returns its content. - """ - - file_contents = Challenge.open_file(workspace, "file_to_check.txt") - - Challenge.write_to_file( - workspace, "file_to_check.txt", f"random string: {file_contents}" - ) - - def basic_write_file_mock(task: str, workspace: str) -> None: """ This mock writes to a file (creates one if it doesn't exist) diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt similarity index 100% rename from agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt rename to agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt new file mode 100644 index 00000000..c1a7879a --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt @@ -0,0 +1 @@ +random string Hello World! 
diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index a74b875a..7463d22f 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -4,9 +4,10 @@ "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["basic_write_file"], "ground": { - "answer": "random string: Hello World!", - "should_contain": ["random string: Hello World!"], - "files": ["file_to_check.txt"] + "answer": "random string Hello World!", + "should_contain": ["random string", "Hello World!"], + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "basic_read_file_mock" diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index e7f2af9e..7c38d283 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -15,7 +15,9 @@ class TestReadFile(BasicChallenge): @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 358ebb53..9232a45a 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -7,7 +7,8 @@ "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": 
[".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_write_file_mock", diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 81f72cc9..474d6712 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,7 +16,9 @@ class TestWriteFile(BasicChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/pyproject.toml b/pyproject.toml index e0d579ca..33a8671c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ markers = [ "retrieval", "regression", "basic", + "code", "memory" ] diff --git a/regression_tests.json b/regression_tests.json index 1195efbc..3c8988a1 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,9 +1,34 @@ { + "TestDebugSimpleTypoWithGuidance": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py" + }, "TestBasicMemory": { "difficulty": "basic", "dependencies": [], "test": "agbenchmark/challenges/memory/m1/m1_test.py" }, + "TestRememberMultipleIds": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" + }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" + }, + "TestRetrieval": { + 
"difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" + }, "TestWriteFile": { "difficulty": "basic", "dependencies": [], @@ -19,31 +44,11 @@ "dependencies": [], "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" }, - "TestRetrieval": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" - }, "TestReadFile": { "difficulty": "basic", "dependencies": [ "basic_write_file" ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" - }, - "TestRememberMultipleIds": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" - }, - "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" - }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" } } \ No newline at end of file