diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json index ce9d9298..c29c3d83 100644 --- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json +++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json @@ -4,7 +4,7 @@ "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", "dependencies": [], "ground": { - "answer": "2314", + "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], "should_not_contain": [], "files": ["test.py"], diff --git a/agbenchmark/challenges/code/d2/artifacts_in/__init__.py b/agbenchmark/challenges/code/d2/artifacts_in/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/agbenchmark/challenges/code/d2/artifacts_in/code.py b/agbenchmark/challenges/code/d2/artifacts_in/code.py new file mode 100644 index 00000000..df8120bf --- /dev/null +++ b/agbenchmark/challenges/code/d2/artifacts_in/code.py @@ -0,0 +1,13 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + typo + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d2/artifacts_in/test.py b/agbenchmark/challenges/code/d2/artifacts_in/test.py new file mode 100644 index 00000000..d85d1353 --- /dev/null +++ b/agbenchmark/challenges/code/d2/artifacts_in/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d2/artifacts_out/__init__.py b/agbenchmark/challenges/code/d2/artifacts_out/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/agbenchmark/challenges/code/d2/artifacts_out/code.py b/agbenchmark/challenges/code/d2/artifacts_out/code.py new file mode 100644 index 00000000..de3d8c62 --- /dev/null +++ b/agbenchmark/challenges/code/d2/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d2/artifacts_out/test.py b/agbenchmark/challenges/code/d2/artifacts_out/test.py new file mode 100644 index 00000000..d85d1353 --- /dev/null +++ b/agbenchmark/challenges/code/d2/artifacts_out/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d2/d2_data.json b/agbenchmark/challenges/code/d2/d2_data.json new file mode 100644 index 00000000..6003055a --- /dev/null +++ b/agbenchmark/challenges/code/d2/d2_data.json @@ -0,0 +1,22 @@ +{ + "name": "debug_simple_typo_without_guidance", + "category": ["code"], + "task": "Make test.py run without errors.", + "dependencies": [], + "ground": { + "answer": "[0, 1] [2, 5] [0, 3]", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "mock": { + "mock_func": null, + "mock_task": null + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/code/d2/d2_test.py b/agbenchmark/challenges/code/d2/d2_test.py new file mode 100644 index 00000000..d49f9dfe --- /dev/null +++ b/agbenchmark/challenges/code/d2/d2_test.py @@ -0,0 +1,32 @@ +import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.code.code import CodeChallenge + + +class TestDebugSimpleTypoWithoutGuidance(CodeChallenge): + """The first memory challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "d2_data.json") + + @pytest.mark.depends( + name="test_debug_simple_typo_without_guidance", + depends=["test_debug_simple_typo_with_guidance"], + ) + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py index 5b84965c..57c03405 100644 --- a/agbenchmark/mocks/mock_manager.py +++ b/agbenchmark/mocks/mock_manager.py @@ -1,11 +1,11 @@ -from typing import Any, Dict +from typing import Any, Dict, Optional import agbenchmark.mocks.tests.basic_mocks as basic_mocks import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks class MockManager: - def __init__(self, task: str, config: Dict[str, Any]) -> None: + def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None: self.task = task self.workspace = config["workspace"] self.modules = [basic_mocks, retrieval_mocks] diff --git a/mypy.ini b/mypy.ini index ceb13fcd..764c239f 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,4 +1,5 @@ [mypy] +namespace_packages = True follow_imports = skip check_untyped_defs = True disallow_untyped_defs = True diff --git a/regression_tests.json b/regression_tests.json index 3c8988a1..59a9694b 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -50,5 +50,10 @@ "basic_write_file" ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" + }, + "TestDebugSimpleTypoWithoutGuidance": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/code/d2/d2_test.py" } } \ No newline at end of file