From 9ede17891bb4a322d51ec2bf1cc9e60e93db0acd Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 13:50:53 -0700 Subject: [PATCH] Add 'Debug simple typo with guidance' challenge (#65) Signed-off-by: Merwane Hamadi --- agbenchmark/agent_interface.py | 31 ++++++++-- agbenchmark/challenge.py | 62 +++++++++++-------- agbenchmark/challenges/README.md | 3 +- agbenchmark/challenges/code/code.py | 8 +++ .../artifacts_in/__init__.py} | 0 .../challenges/code/d1/artifacts_in/code.py | 13 ++++ .../challenges/code/d1/artifacts_in/test.py | 31 ++++++++++ .../code/d1/artifacts_out/__init__.py | 0 .../challenges/code/d1/artifacts_out/code.py | 12 ++++ .../challenges/code/d1/artifacts_out/test.py | 31 ++++++++++ .../debug_simple_typo_with_guidance_data.json | 22 +++++++ .../debug_simple_typo_with_guidance_test.py | 31 ++++++++++ agbenchmark/challenges/define_task_types.py | 3 +- .../instructions_1.txt | 0 .../instructions_2.txt | 0 .../instructions_3.txt | 0 .../instructions_4.txt | 0 .../instructions_5.txt | 0 agbenchmark/challenges/memory/m1/m1_data.json | 3 +- agbenchmark/challenges/memory/m1/m1_test.py | 4 +- .../instructions_1.txt | 0 .../instructions_2.txt | 0 .../instructions_3.txt | 0 .../instructions_4.txt | 0 .../instructions_5.txt | 0 .../memory/m2/remember_multiple_ids_data.json | 3 +- .../memory/m2/remember_multiple_ids_test.py | 4 +- .../instructions_1.txt | 0 .../instructions_2.txt | 0 .../instructions_3.txt | 0 .../instructions_4.txt | 0 .../instructions_5.txt | 0 ...remember_multiple_ids_with_noise_data.json | 3 +- .../remember_multiple_ids_with_noise_test.py | 4 +- .../instructions_1.txt | 0 .../instructions_2.txt | 0 .../instructions_3.txt | 0 .../instructions_4.txt | 0 .../instructions_5.txt | 0 ...mber_multiple_phrases_with_noise_data.json | 3 +- ...member_multiple_phrases_with_noise_test.py | 4 +- .../challenges/retrieval/r1/r1_data.json | 3 +- .../challenges/retrieval/r1/r1_test.py | 4 +- .../challenges/retrieval/r2/r2_data.json | 3 +- 
.../challenges/retrieval/r2/r2_test.py | 4 +- .../challenges/retrieval/r3/r3_data.json | 3 +- .../challenges/retrieval/r3/r3_test.py | 4 +- agbenchmark/mocks/tests/basic_mocks.py | 12 ---- .../file_to_check.txt | 0 .../read_file/artifacts_out/file_to_check.txt | 1 + .../read_file/r_file_data.json | 7 ++- .../read_file/read_file_test.py | 4 +- .../write_file/w_file_data.json | 3 +- .../write_file/write_file_test.py | 4 +- pyproject.toml | 1 + regression_tests.json | 45 ++++++++------ 56 files changed, 288 insertions(+), 85 deletions(-) create mode 100644 agbenchmark/challenges/code/code.py rename agbenchmark/challenges/code/{c1_test.py => d1/artifacts_in/__init__.py} (100%) create mode 100644 agbenchmark/challenges/code/d1/artifacts_in/code.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_in/test.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_out/test.py create mode 100644 agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json create mode 100644 agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py rename agbenchmark/challenges/memory/m1/{artifacts => artifacts_in}/instructions_1.txt (100%) rename agbenchmark/challenges/memory/m1/{artifacts => artifacts_in}/instructions_2.txt (100%) rename agbenchmark/challenges/memory/m1/{artifacts => artifacts_in}/instructions_3.txt (100%) rename agbenchmark/challenges/memory/m1/{artifacts => artifacts_in}/instructions_4.txt (100%) rename agbenchmark/challenges/memory/m1/{artifacts => artifacts_in}/instructions_5.txt (100%) rename agbenchmark/challenges/memory/m2/{artifacts => artifacts_in}/instructions_1.txt (100%) rename agbenchmark/challenges/memory/m2/{artifacts => artifacts_in}/instructions_2.txt (100%) rename agbenchmark/challenges/memory/m2/{artifacts => artifacts_in}/instructions_3.txt (100%) rename 
agbenchmark/challenges/memory/m2/{artifacts => artifacts_in}/instructions_4.txt (100%) rename agbenchmark/challenges/memory/m2/{artifacts => artifacts_in}/instructions_5.txt (100%) rename agbenchmark/challenges/memory/m3/{artifacts => artifacts_in}/instructions_1.txt (100%) rename agbenchmark/challenges/memory/m3/{artifacts => artifacts_in}/instructions_2.txt (100%) rename agbenchmark/challenges/memory/m3/{artifacts => artifacts_in}/instructions_3.txt (100%) rename agbenchmark/challenges/memory/m3/{artifacts => artifacts_in}/instructions_4.txt (100%) rename agbenchmark/challenges/memory/m3/{artifacts => artifacts_in}/instructions_5.txt (100%) rename agbenchmark/challenges/memory/m4/{artifacts => artifacts_in}/instructions_1.txt (100%) rename agbenchmark/challenges/memory/m4/{artifacts => artifacts_in}/instructions_2.txt (100%) rename agbenchmark/challenges/memory/m4/{artifacts => artifacts_in}/instructions_3.txt (100%) rename agbenchmark/challenges/memory/m4/{artifacts => artifacts_in}/instructions_4.txt (100%) rename agbenchmark/challenges/memory/m4/{artifacts => artifacts_in}/instructions_5.txt (100%) rename agbenchmark/tests/basic_abilities/read_file/{artifacts => artifacts_in}/file_to_check.txt (100%) create mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 8e9e5a14..05540f6d 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -1,4 +1,5 @@ import os +import shutil import subprocess import sys import time @@ -14,13 +15,20 @@ MOCK_FLAG = os.getenv("MOCK_TEST") def run_agent( - task: Optional[str], mock_func: Optional[str], config: Dict[str, Any] + task: Optional[str], + mock_func: Optional[str], + config: Dict[str, Any], + challenge_location: str, ) -> None: """Calling to get a response""" - if mock_func == None and MOCK_FLAG == "True": - print("No mock provided") - elif MOCK_FLAG == "True": + if MOCK_FLAG == 
"True": + copy_artifacts_into_workspace( + config["workspace"], "artifacts_out", challenge_location + ) + if mock_func is None: + print("No mock provided") + return + mock_manager = MockManager( task, config ) # workspace doesn't need to be passed in, stays the same @@ -77,4 +85,19 @@ def run_agent( process.wait() +def copy_artifacts_into_workspace( + workspace: str, artifact_folder_name: str, challenge_dir_path: str +) -> None: + source_dir = os.path.join(challenge_dir_path, artifact_folder_name) + + # Check if source_dir exists, if not then return immediately. + if not os.path.exists(source_dir): + return + + for file_name in os.listdir(source_dir): + full_file_name = os.path.join(source_dir, file_name) + if os.path.isfile(full_file_name): + shutil.copy(full_file_name, workspace) + + ENVIRONMENT = os.getenv("ENVIRONMENT") or "production" diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index dee2b435..4c8e6984 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -1,9 +1,10 @@ import glob import inspect import os -import shutil -from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional +import subprocess +import types +from abc import ABC, ABCMeta, abstractmethod +from typing import Any, Dict, List, Optional, Tuple, Type, cast import pytest from dotenv import load_dotenv @@ -16,7 +17,20 @@ mock_test_str = os.getenv("MOCK_TEST") MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False -class Challenge(ABC): +class ChallengeMeta(ABCMeta): + def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None: + + super().__init__(name, bases, dct) + try: + frame = cast(types.FrameType, inspect.currentframe()) + assert frame.f_back is not None + self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(frame.f_back)) + except Exception as e: + print(f"Unable to get the file from the caller's frame due to: {str(e)}") + raise e + + +class Challenge(ABC, metaclass=ChallengeMeta): + """The
parent class to all specific challenges classes. Defines helper methods for running a challenge""" @@ -52,11 +66,13 @@ class Challenge(ABC): return self.data.dependencies def setup_challenge(self, config: Dict[str, Any]) -> None: - from agbenchmark.agent_interface import run_agent + from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent - self.copy_artifacts_into_workspace(config["workspace"]) + copy_artifacts_into_workspace( + config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION + ) - run_agent(self.task, self.mock, config) + run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION) @property def name(self) -> str: @@ -77,8 +93,7 @@ class Challenge(ABC): with open(workspace_dir, "r") as f: return f.read() - @staticmethod - def open_files(workspace: str, file_patterns: list) -> List[str]: + def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]: script_dir = os.path.abspath(workspace) files_contents = [] @@ -92,8 +107,17 @@ class Challenge(ABC): matching_files = [os.path.join(script_dir, file_pattern)] for file_path in matching_files: - with open(file_path, "r") as f: - files_contents.append(f.read()) + if self.data.ground.type == "execute_python_code": + result = subprocess.run( + ["python3", file_path], + cwd=os.path.abspath(workspace), + capture_output=True, + text=True, + ) + files_contents.append(result.stdout) + else: + with open(file_path, "r") as f: + files_contents.append(f.read()) return files_contents @@ -135,19 +159,3 @@ class Challenge(ABC): ) return 1.0 - - def copy_artifacts_into_workspace(self, workspace: str) -> None: - curr_frame = inspect.currentframe() - outer_frame = inspect.getouterframes(curr_frame)[2] - caller_file_path = outer_frame.filename - caller_dir_path = os.path.dirname(os.path.abspath(caller_file_path)) - source_dir = os.path.join(caller_dir_path, "artifacts") - - # Check if source_dir exists, if not then return immediately. 
- if not os.path.exists(source_dir): - return - - for file_name in os.listdir(source_dir): - full_file_name = os.path.join(source_dir, file_name) - if os.path.isfile(full_file_name): - shutil.copy(full_file_name, workspace) diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 9e74d19c..2d782d1f 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -33,7 +33,8 @@ Example: "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_write_file_mock", diff --git a/agbenchmark/challenges/code/code.py b/agbenchmark/challenges/code/code.py new file mode 100644 index 00000000..508d24a9 --- /dev/null +++ b/agbenchmark/challenges/code/code.py @@ -0,0 +1,8 @@ +import pytest + +from agbenchmark.challenge import Challenge + + +@pytest.mark.code +class CodeChallenge(Challenge): + """Challenge for code""" diff --git a/agbenchmark/challenges/code/c1_test.py b/agbenchmark/challenges/code/d1/artifacts_in/__init__.py similarity index 100% rename from agbenchmark/challenges/code/c1_test.py rename to agbenchmark/challenges/code/d1/artifacts_in/__init__.py diff --git a/agbenchmark/challenges/code/d1/artifacts_in/code.py b/agbenchmark/challenges/code/d1/artifacts_in/code.py new file mode 100644 index 00000000..df8120bf --- /dev/null +++ b/agbenchmark/challenges/code/d1/artifacts_in/code.py @@ -0,0 +1,13 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + typo + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d1/artifacts_in/test.py b/agbenchmark/challenges/code/d1/artifacts_in/test.py new file mode 100644 index 00000000..d85d1353 ---
/dev/null +++ b/agbenchmark/challenges/code/d1/artifacts_in/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d1/artifacts_out/__init__.py b/agbenchmark/challenges/code/d1/artifacts_out/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/agbenchmark/challenges/code/d1/artifacts_out/code.py b/agbenchmark/challenges/code/d1/artifacts_out/code.py new file mode 100644 index 00000000..de3d8c62 --- /dev/null +++ b/agbenchmark/challenges/code/d1/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d1/artifacts_out/test.py b/agbenchmark/challenges/code/d1/artifacts_out/test.py new file mode 100644 index 00000000..d85d1353 --- /dev/null +++ b/agbenchmark/challenges/code/d1/artifacts_out/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def 
test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json new file mode 100644 index 00000000..ce9d9298 --- /dev/null +++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json @@ -0,0 +1,22 @@ +{ + "name": "debug_simple_typo_with_guidance", + "category": ["code"], + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "dependencies": [], + "ground": { + "answer": "2314", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "mock": { + "mock_func": null, + "mock_task": null + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py new file mode 100644 index 00000000..e5f50c70 --- /dev/null +++ 
b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py @@ -0,0 +1,31 @@ +import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.code.code import CodeChallenge + + +class TestDebugSimpleTypoWithGuidance(CodeChallenge): + """The first code challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join( + os.path.dirname(__file__), "debug_simple_typo_with_guidance_data.json" + ) + + @pytest.mark.depends(name="test_debug_simple_typo_with_guidance") + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 52df3017..f84df126 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -5,7 +5,7 @@ from pydantic import BaseModel class Mock(BaseModel): - mock_func: str + mock_func: Optional[str] = None mock_task: Optional[str] = None @@ -20,6 +20,7 @@ class Ground(BaseModel): should_contain: Optional[List[str]] = None should_not_contain: Optional[List[str]] = None files: List[str] + type: str class ChallengeData(BaseModel): diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt rename to agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt similarity index 100% rename from
agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt rename to agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt rename to agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt rename to agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt similarity index 100% rename from agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt rename to agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1/m1_data.json b/agbenchmark/challenges/memory/m1/m1_data.json index 7023f85f..3e410ac5 100644 --- a/agbenchmark/challenges/memory/m1/m1_data.json +++ b/agbenchmark/challenges/memory/m1/m1_data.json @@ -7,7 +7,8 @@ "answer": "2314", "should_contain": ["2314"], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "basic_memory_mock", diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py index 28e600cc..c1f37024 100644 --- a/agbenchmark/challenges/memory/m1/m1_test.py +++ b/agbenchmark/challenges/memory/m1/m1_test.py @@ -16,7 +16,9 @@ class TestBasicMemory(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + 
files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt rename to agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt rename to agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt rename to agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt rename to agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt similarity index 100% rename from agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt rename to agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json index 374df616..29d7339b 100644 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json +++ 
b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json @@ -7,7 +7,8 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "remember_multiple_ids_mock", diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py index d5f0cf1a..f0f2b397 100644 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py @@ -20,7 +20,9 @@ class TestRememberMultipleIds(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt rename to agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt rename to agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt rename to agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt diff --git 
a/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt rename to agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt similarity index 100% rename from agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt rename to agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json index 95c93ccb..6b53c457 100644 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json +++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json @@ -7,7 +7,8 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "remember_multiple_ids_mock", diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py index 4d2d6495..493ea357 100644 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py +++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py @@ -21,7 +21,9 @@ class TestRememberMultipleIdsWithNoise(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git 
a/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt rename to agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt rename to agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt rename to agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt rename to agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt similarity index 100% rename from agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt rename to agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json index e1ecb16f..316ef947 100644 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json @@ -12,7 +12,8 @@ "The giant hamster rode a unicycle through the 
crowded mall" ], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "remember_multiple_phrases_with_noise_mock", diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py index fd33da1c..e37e9a38 100644 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py @@ -21,7 +21,9 @@ class TestRememberMultiplePhrasesWithNoise(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 44fce24e..8fca01b7 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -7,7 +7,8 @@ "answer": "£25.89", "should_contain": ["25.89"], "should_not_contain": [], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_retrieval_mock", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index d107d964..285b8aff 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -16,7 +16,9 @@ class TestRetrieval(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content 
in files_contents: diff --git a/agbenchmark/challenges/retrieval/r2/r2_data.json b/agbenchmark/challenges/retrieval/r2/r2_data.json index 925e6db8..3c388f19 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_data.json +++ b/agbenchmark/challenges/retrieval/r2/r2_data.json @@ -7,7 +7,8 @@ "answer": "81,462", "should_contain": ["81,462"], "should_not_contain": [], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_retrieval_2_mock", diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index a60296ec..ba727b8e 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -16,7 +16,9 @@ class TestRetrieval2(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r3/r3_data.json b/agbenchmark/challenges/retrieval/r3/r3_data.json index 183529c4..41545615 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_data.json +++ b/agbenchmark/challenges/retrieval/r3/r3_data.json @@ -7,7 +7,8 @@ "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"], "should_not_contain": [], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_retrieval_3_mock", diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py 
b/agbenchmark/challenges/retrieval/r3/r3_test.py index bcd48d33..b58f4267 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -16,7 +16,9 @@ class TestRetrieval3(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 37ded0ae..32149eb8 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -1,18 +1,6 @@ from agbenchmark.challenge import Challenge -def basic_read_file_mock(task: str, workspace: str) -> None: - """ - This mock reads a file and returns its content. - """ - - file_contents = Challenge.open_file(workspace, "file_to_check.txt") - - Challenge.write_to_file( - workspace, "file_to_check.txt", f"random string: {file_contents}" - ) - - def basic_write_file_mock(task: str, workspace: str) -> None: """ This mock writes to a file (creates one if it doesn't exist) diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt similarity index 100% rename from agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt rename to agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt new file mode 100644 index 00000000..c1a7879a --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt @@ -0,0 +1 @@ +random string Hello World! 
diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index a74b875a..7463d22f 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -4,9 +4,10 @@ "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["basic_write_file"], "ground": { - "answer": "random string: Hello World!", - "should_contain": ["random string: Hello World!"], - "files": ["file_to_check.txt"] + "answer": "random string Hello World!", + "should_contain": ["random string", "Hello World!"], + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "basic_read_file_mock" diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index e7f2af9e..7c38d283 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -15,7 +15,9 @@ class TestReadFile(BasicChallenge): @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 358ebb53..9232a45a 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -7,7 +7,8 @@ "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": 
[".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_write_file_mock", diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 81f72cc9..474d6712 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,7 +16,9 @@ class TestWriteFile(BasicChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/pyproject.toml b/pyproject.toml index e0d579ca..33a8671c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ markers = [ "retrieval", "regression", "basic", + "code", "memory" ] diff --git a/regression_tests.json b/regression_tests.json index 1195efbc..3c8988a1 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,9 +1,34 @@ { + "TestDebugSimpleTypoWithGuidance": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py" + }, "TestBasicMemory": { "difficulty": "basic", "dependencies": [], "test": "agbenchmark/challenges/memory/m1/m1_test.py" }, + "TestRememberMultipleIds": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" + }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" + }, + "TestRetrieval": { + 
"difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" + }, "TestWriteFile": { "difficulty": "basic", "dependencies": [], @@ -19,31 +44,11 @@ "dependencies": [], "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" }, - "TestRetrieval": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" - }, "TestReadFile": { "difficulty": "basic", "dependencies": [ "basic_write_file" ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" - }, - "TestRememberMultipleIds": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" - }, - "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" - }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" } } \ No newline at end of file