diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json
index ce9d9298..c29c3d83 100644
--- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json
+++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json
@@ -4,7 +4,7 @@
   "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
   "dependencies": [],
   "ground": {
-    "answer": "2314",
+    "answer": "[0, 1] [2, 5] [0, 3]",
     "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
     "should_not_contain": [],
     "files": ["test.py"],
diff --git a/agbenchmark/challenges/code/d2/artifacts_in/__init__.py b/agbenchmark/challenges/code/d2/artifacts_in/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/agbenchmark/challenges/code/d2/artifacts_in/code.py b/agbenchmark/challenges/code/d2/artifacts_in/code.py
new file mode 100644
index 00000000..df8120bf
--- /dev/null
+++ b/agbenchmark/challenges/code/d2/artifacts_in/code.py
@@ -0,0 +1,13 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def two_sum(nums: List, target: int) -> Optional[List[int]]:
+    seen = {}
+    for i, num in enumerate(nums):
+        typo
+        complement = target - num
+        if complement in seen:
+            return [seen[complement], i]
+        seen[num] = i
+    return None
diff --git a/agbenchmark/challenges/code/d2/artifacts_in/test.py b/agbenchmark/challenges/code/d2/artifacts_in/test.py
new file mode 100644
index 00000000..d85d1353
--- /dev/null
+++ b/agbenchmark/challenges/code/d2/artifacts_in/test.py
@@ -0,0 +1,31 @@
+# mypy: ignore-errors
+from code import two_sum
+from typing import List
+
+
+def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
+    result = two_sum(nums, target)
+    print(result)
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+if __name__ == "__main__":
+    # test the trivial case with the first two numbers
+    nums = [2, 7, 11, 15]
+    target = 9
+    expected_result = [0, 1]
+    test_two_sum(nums, target, expected_result)
+
+    # test for ability to use zero and the same number twice
+    nums = [2, 7, 0, 15, 12, 0]
+    target = 0
+    expected_result = [2, 5]
+    test_two_sum(nums, target, expected_result)
+
+    # test for first and last index usage and negative numbers
+    nums = [-6, 7, 11, 4]
+    target = -2
+    expected_result = [0, 3]
+    test_two_sum(nums, target, expected_result)
diff --git a/agbenchmark/challenges/code/d2/artifacts_out/__init__.py b/agbenchmark/challenges/code/d2/artifacts_out/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/agbenchmark/challenges/code/d2/artifacts_out/code.py b/agbenchmark/challenges/code/d2/artifacts_out/code.py
new file mode 100644
index 00000000..de3d8c62
--- /dev/null
+++ b/agbenchmark/challenges/code/d2/artifacts_out/code.py
@@ -0,0 +1,12 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def two_sum(nums: List, target: int) -> Optional[List[int]]:
+    seen = {}
+    for i, num in enumerate(nums):
+        complement = target - num
+        if complement in seen:
+            return [seen[complement], i]
+        seen[num] = i
+    return None
diff --git a/agbenchmark/challenges/code/d2/artifacts_out/test.py b/agbenchmark/challenges/code/d2/artifacts_out/test.py
new file mode 100644
index 00000000..d85d1353
--- /dev/null
+++ b/agbenchmark/challenges/code/d2/artifacts_out/test.py
@@ -0,0 +1,31 @@
+# mypy: ignore-errors
+from code import two_sum
+from typing import List
+
+
+def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
+    result = two_sum(nums, target)
+    print(result)
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+if __name__ == "__main__":
+    # test the trivial case with the first two numbers
+    nums = [2, 7, 11, 15]
+    target = 9
+    expected_result = [0, 1]
+    test_two_sum(nums, target, expected_result)
+
+    # test for ability to use zero and the same number twice
+    nums = [2, 7, 0, 15, 12, 0]
+    target = 0
+    expected_result = [2, 5]
+    test_two_sum(nums, target, expected_result)
+
+    # test for first and last index usage and negative numbers
+    nums = [-6, 7, 11, 4]
+    target = -2
+    expected_result = [0, 3]
+    test_two_sum(nums, target, expected_result)
diff --git a/agbenchmark/challenges/code/d2/d2_data.json b/agbenchmark/challenges/code/d2/d2_data.json
new file mode 100644
index 00000000..6003055a
--- /dev/null
+++ b/agbenchmark/challenges/code/d2/d2_data.json
@@ -0,0 +1,22 @@
+{
+  "name": "debug_simple_typo_without_guidance",
+  "category": ["code"],
+  "task": "Make test.py run without errors.",
+  "dependencies": [],
+  "ground": {
+    "answer": "[0, 1] [2, 5] [0, 3]",
+    "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
+    "should_not_contain": [],
+    "files": ["test.py"],
+    "type": "execute_python_code"
+  },
+  "mock": {
+    "mock_func": null,
+    "mock_task": null
+  },
+  "info": {
+    "difficulty": "basic",
+    "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
+}
diff --git a/agbenchmark/challenges/code/d2/d2_test.py b/agbenchmark/challenges/code/d2/d2_test.py
new file mode 100644
index 00000000..d49f9dfe
--- /dev/null
+++ b/agbenchmark/challenges/code/d2/d2_test.py
@@ -0,0 +1,32 @@
+import os
+from typing import Any, Dict
+
+import pytest
+
+from agbenchmark.challenges.code.code import CodeChallenge
+
+
+class TestDebugSimpleTypoWithoutGuidance(CodeChallenge):
+    """Code challenge: debug a simple typo given only a broad, unguided prompt"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(os.path.dirname(__file__), "d2_data.json")
+
+    @pytest.mark.depends(
+        name="test_debug_simple_typo_without_guidance",
+        depends=["test_debug_simple_typo_with_guidance"],
+    )
+    def test_method(self, config: Dict[str, Any]) -> None:
+        self.setup_challenge(config)
+
+        files_contents = self.get_artifacts_out(
+            config["workspace"], self.data.ground.files
+        )
+
+        scores = []
+        for file_content in files_contents:
+            score = self.scoring(file_content, self.data.ground)
+            print("Your score is:", score)
+            scores.append(score)
+
+        assert 1 in scores
diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py
index 5b84965c..57c03405 100644
--- a/agbenchmark/mocks/mock_manager.py
+++ b/agbenchmark/mocks/mock_manager.py
@@ -1,11 +1,11 @@
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import agbenchmark.mocks.tests.basic_mocks as basic_mocks
 import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks
 
 
 class MockManager:
-    def __init__(self, task: str, config: Dict[str, Any]) -> None:
+    def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None:
         self.task = task
         self.workspace = config["workspace"]
         self.modules = [basic_mocks, retrieval_mocks]
diff --git a/mypy.ini b/mypy.ini
index ceb13fcd..764c239f 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,4 +1,5 @@
 [mypy]
+namespace_packages = True
 follow_imports = skip
 check_untyped_defs = True
 disallow_untyped_defs = True
diff --git a/regression_tests.json b/regression_tests.json
index 3c8988a1..59a9694b 100644
--- a/regression_tests.json
+++ b/regression_tests.json
@@ -50,5 +50,10 @@
             "basic_write_file"
         ],
         "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py"
+    },
+    "TestDebugSimpleTypoWithoutGuidance": {
+        "difficulty": "basic",
+        "dependencies": [],
+        "test": "agbenchmark/challenges/code/d2/d2_test.py"
     }
 }
\ No newline at end of file