Add 'Debug simple typo with guidance' challenge (#65)

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
2026-01-08 16:54:31 +01:00 · 2023-07-07 13:50:53 -07:00
parent bfd0d5c826
commit 9ede17891b
56 changed files with 288 additions and 85 deletions
--- a/agbenchmark/challenges/code/code.py
+++ b/agbenchmark/challenges/code/code.py
@@ -0,0 +1,8 @@
+import pytest
+
+from agbenchmark.challenge import Challenge
+
+
+@pytest.mark.code
+class CodeChallenge(Challenge):
+    """Base challenge class for the code category (pytest marker: code)"""
--- a/agbenchmark/challenges/code/d1/artifacts_in/init.py
+++ b/agbenchmark/challenges/code/d1/artifacts_in/init.py
--- a/agbenchmark/challenges/code/d1/artifacts_in/code.py
+++ b/agbenchmark/challenges/code/d1/artifacts_in/code.py
@@ -0,0 +1,13 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def two_sum(nums: List, target: int) -> Optional[List[int]]:
+    seen = {}  # value -> index of the entries visited so far
+    for i, num in enumerate(nums):
+        typo
+        complement = target - num  # value that pairs with num to reach target
+        if complement in seen:
+            return [seen[complement], i]  # earlier index first, then current
+        seen[num] = i
+    return None  # no pair of entries sums to target
--- a/agbenchmark/challenges/code/d1/artifacts_in/test.py
+++ b/agbenchmark/challenges/code/d1/artifacts_in/test.py
@@ -0,0 +1,31 @@
+# mypy: ignore-errors
+from code import two_sum
+from typing import List
+
+
+def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
+    result = two_sum(nums, target)  # run the function under test on one case
+    print(result)  # presumably the output the benchmark checks; confirm
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+if __name__ == "__main__":  # cases below run only when executed as a script
+    # trivial case: the first two numbers (indices 0 and 1) sum to the target
+    nums = [2, 7, 11, 15]
+    target = 9
+    expected_result = [0, 1]
+    test_two_sum(nums, target, expected_result)
+
+    # zero target reached by the same value twice: the zeros at indices 2 and 5
+    nums = [2, 7, 0, 15, 12, 0]
+    target = 0
+    expected_result = [2, 5]
+    test_two_sum(nums, target, expected_result)
+
+    # first and last index used, with a negative number and negative target
+    nums = [-6, 7, 11, 4]
+    target = -2
+    expected_result = [0, 3]
+    test_two_sum(nums, target, expected_result)
--- a/agbenchmark/challenges/code/d1/artifacts_out/init.py
+++ b/agbenchmark/challenges/code/d1/artifacts_out/init.py
--- a/agbenchmark/challenges/code/d1/artifacts_out/code.py
+++ b/agbenchmark/challenges/code/d1/artifacts_out/code.py
@@ -0,0 +1,12 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def two_sum(nums: List, target: int) -> Optional[List[int]]:
+    seen = {}  # value -> index of the entries visited so far
+    for i, num in enumerate(nums):
+        complement = target - num  # value that pairs with num to reach target
+        if complement in seen:
+            return [seen[complement], i]  # earlier index first, then current
+        seen[num] = i
+    return None  # no pair of entries sums to target
--- a/agbenchmark/challenges/code/d1/artifacts_out/test.py
+++ b/agbenchmark/challenges/code/d1/artifacts_out/test.py
@@ -0,0 +1,31 @@
+# mypy: ignore-errors
+from code import two_sum
+from typing import List
+
+
+def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
+    result = two_sum(nums, target)  # run the function under test on one case
+    print(result)  # presumably the output the benchmark checks; confirm
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+if __name__ == "__main__":  # cases below run only when executed as a script
+    # trivial case: the first two numbers (indices 0 and 1) sum to the target
+    nums = [2, 7, 11, 15]
+    target = 9
+    expected_result = [0, 1]
+    test_two_sum(nums, target, expected_result)
+
+    # zero target reached by the same value twice: the zeros at indices 2 and 5
+    nums = [2, 7, 0, 15, 12, 0]
+    target = 0
+    expected_result = [2, 5]
+    test_two_sum(nums, target, expected_result)
+
+    # first and last index used, with a negative number and negative target
+    nums = [-6, 7, 11, 4]
+    target = -2
+    expected_result = [0, 3]
+    test_two_sum(nums, target, expected_result)
--- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json
+++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json
@@ -0,0 +1,22 @@
+{
+  "name": "debug_simple_typo_with_guidance",
+  "category": ["code"],
+  "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
+  "dependencies": [],
+  "ground": {
+    "answer": "2314",
+    "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
+    "should_not_contain": [],
+    "files": ["test.py"],
+    "type": "execute_python_code"
+  },
+  "mock": {
+    "mock_func": null,
+    "mock_task": null
+  },
+  "info": {
+    "difficulty": "basic",
+    "description": "Tests the agent's ability to debug Python code that contains a simple typo.",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
+}
--- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py
+++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py
@@ -0,0 +1,31 @@
+import os
+from typing import Any, Dict
+
+import pytest
+
+from agbenchmark.challenges.code.code import CodeChallenge
+
+
+class TestDebugSimpleTypoWithGuidance(CodeChallenge):
+    """Code challenge: debug a simple typo in code.py with guided steps"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(
+            os.path.dirname(__file__), "debug_simple_typo_with_guidance_data.json"
+        )
+
+    @pytest.mark.depends(name="test_debug_simple_typo_with_guidance")
+    def test_method(self, config: Dict[str, Any]) -> None:
+        self.setup_challenge(config)
+
+        files_contents = self.get_artifacts_out(
+            config["workspace"], self.data.ground.files
+        )
+
+        scores = []  # one score per entry in files_contents
+        for file_content in files_contents:
+            score = self.scoring(file_content, self.data.ground)
+            print("Your score is:", score)
+            scores.append(score)
+
+        assert 1 in scores  # passes when at least one entry scored 1