diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0e926386..b7864db6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -141,6 +141,10 @@ jobs:
             agbenchmark start --improve --mock
             agbenchmark start --mock
             agbenchmark start --mock --category=retrieval
+            agbenchmark start --mock --category=interface
+            agbenchmark start --mock --category=code
+            agbenchmark start --mock --category=memory
+            agbenchmark start --mock --category=iterate
           else
             curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start
             agbenchmark start --maintain
diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py
index aeebd7ad..874fd45b 100644
--- a/agbenchmark/challenge.py
+++ b/agbenchmark/challenge.py
@@ -46,6 +46,13 @@ class Challenge(ABC):
 
         run_agent(self.task, config, self.CHALLENGE_LOCATION)
 
+        # Hidden files are copied in only after the agent has run. They can be Python test files.
+        # Placing them in the workspace makes it easy for them to import the code produced by the agent.
+
+        copy_artifacts_into_workspace(
+            config["workspace"], "hidden_files", self.CHALLENGE_LOCATION
+        )
+
     def test_method(self, config: Dict[str, Any]) -> None:
         raise NotImplementedError
 
diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md
index 305cd28f..a890c9d3 100644
--- a/agbenchmark/challenges/README.md
+++ b/agbenchmark/challenges/README.md
@@ -46,3 +46,19 @@ Example:
 Current Output:
 
 - **score** (float): scores range from [0, 1]
+
+## Add files to challenges
+
+### artifacts_in
+
+This folder contains all the files you want the agent to have in its workspace BEFORE the challenge starts.
+
+### artifacts_out
+This folder contains all the files you would like the agent to generate. It is also used to mock the agent.
+This lets us run `agbenchmark start --test=TestExample --mock` to make sure the challenge actually works.
+
+### hidden_files
+This folder contains files that are hidden from the agent but useful for assessing whether a challenge succeeded.
+For example, it can hold a test.py, which is added to the workspace at the end of a challenge.
+This allows us to run test.py and easily import the code generated by the agent.
+For an example, see the TestBasicCodeGeneration challenge.
diff --git a/agbenchmark/challenges/code/d4/artifacts_out/__init__.py b/agbenchmark/challenges/code/d4/artifacts_out/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/agbenchmark/challenges/code/d4/artifacts_out/code.py b/agbenchmark/challenges/code/d4/artifacts_out/code.py
new file mode 100644
index 00000000..de3d8c62
--- /dev/null
+++ b/agbenchmark/challenges/code/d4/artifacts_out/code.py
@@ -0,0 +1,12 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def two_sum(nums: List, target: int) -> Optional[List[int]]:  # indices of two entries of nums that sum to target, else None
+    seen = {}  # maps each value visited so far to the index where it was last seen
+    for i, num in enumerate(nums):
+        complement = target - num  # the value that must already have appeared for num to complete a pair
+        if complement in seen:
+            return [seen[complement], i]  # earlier index first, current index second
+        seen[num] = i  # recorded only after the lookup, so an element is never paired with itself
+    return None  # no two entries add up to target
diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json
new file mode 100644
index 00000000..51f6f270
--- /dev/null
+++ b/agbenchmark/challenges/code/d4/data.json
@@ -0,0 +1,18 @@
+{
+  "name": "TestBasicCodeGeneration",
+  "category": ["code", "iterate"],
+  "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
+  "dependencies": ["TestWriteFile"],
+  "ground": {
+    "answer": "The two_sum function coded properly.",
+    "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
+    "should_not_contain": [],
+    "files": ["test.py"],
+    "type": "execute_python_code"
+  },
+  "info": {
+    "difficulty": "novice",
+    "description": "Tests ability for the agent to create the two_sum function.",
+    "side_effects": []
+  }
+}
diff --git a/agbenchmark/challenges/code/d4/hidden_files/test.py b/agbenchmark/challenges/code/d4/hidden_files/test.py
new file mode 100644
index 00000000..d85d1353
--- /dev/null
+++ b/agbenchmark/challenges/code/d4/hidden_files/test.py
@@ -0,0 +1,31 @@
+# mypy: ignore-errors
+from code import two_sum
+from typing import List
+
+
+def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:  # run two_sum once and check its result
+    result = two_sum(nums, target)
+    print(result)  # presumably stdout is what data.json's "should_contain" strings are matched against; confirm
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+if __name__ == "__main__":
+    # trivial case: the answer is the first two numbers (2 + 7 = 9)
+    nums = [2, 7, 11, 15]
+    target = 9
+    expected_result = [0, 1]
+    test_two_sum(nums, target, expected_result)
+
+    # zero target reached by two equal values (0 + 0) located at different indices
+    nums = [2, 7, 0, 15, 12, 0]
+    target = 0
+    expected_result = [2, 5]
+    test_two_sum(nums, target, expected_result)
+
+    # negative numbers, with the answer at the first and last indices (-6 + 4 = -2)
+    nums = [-6, 7, 11, 4]
+    target = -2
+    expected_result = [0, 3]
+    test_two_sum(nums, target, expected_result)
diff --git a/agent/gpt-engineer b/agent/gpt-engineer
index 521d626c..bca191cd 160000
--- a/agent/gpt-engineer
+++ b/agent/gpt-engineer
@@ -1 +1 @@
-Subproject commit 521d626c0075ed6545f01b771757c856f8addbd6
+Subproject commit bca191cd76cdea0335da91d004c64d9bb8520fea
diff --git a/pyproject.toml b/pyproject.toml
index b0526ab5..48be9cf5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,10 +38,10 @@ testpaths = [
 ]
 markers = [
     "retrieval",
-    "regression",
     "interface",
     "code",
-    "memory"
+    "memory",
+    "iterate"
 ]
 
 [tool.poetry.scripts]