From 69bd41f7414c1028e61affd3a340054355d9249a Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Sat, 8 Jul 2023 21:43:38 -0400
Subject: [PATCH] Quality of life improvements & fixes (#75)

---
 .gitignore                                    |  2 +-
 agbenchmark/README.md                         |  5 +-
 .../regression => }/RegressionManager.py      |  0
 agbenchmark/agent_interface.py                | 21 ++----
 agbenchmark/challenge.py                      | 39 +++++------
 agbenchmark/challenges/README.md              |  7 +-
 agbenchmark/challenges/code/code.py           |  8 ---
 agbenchmark/challenges/code/d1/data.json      |  7 +-
 .../debug_simple_typo_with_guidance_test.py   | 18 +----
 agbenchmark/challenges/code/d2/d2_test.py     | 20 +-----
 agbenchmark/challenges/code/d2/data.json      |  9 +--
 agbenchmark/challenges/define_task_types.py   |  7 --
 .../interface}/browse_test.py                 |  0
 .../read_file/artifacts_in/file_to_check.txt  |  0
 .../read_file/artifacts_out/file_to_check.txt |  0
 .../interface}/read_file/data.json            |  9 +--
 .../interface/read_file/read_file_test.py     | 12 ++++
 .../write_file/artifacts_out/random_file.txt  |  0
 .../interface}/write_file/data.json           |  8 +--
 .../interface/write_file/write_file_test.py   | 13 ++++
 agbenchmark/challenges/memory/m1/data.json    |  3 +-
 agbenchmark/challenges/memory/m1/m1_test.py   | 18 +----
 agbenchmark/challenges/memory/m2/data.json    |  7 +-
 .../memory/m2/remember_multiple_ids_test.py   | 20 +-----
 agbenchmark/challenges/memory/m3/data.json    |  7 +-
 .../remember_multiple_ids_with_noise_test.py  | 21 +-----
 agbenchmark/challenges/memory/m4/data.json    |  7 +-
 ...member_multiple_phrases_with_noise_test.py | 21 +-----
 agbenchmark/challenges/memory/memory.py       |  8 ---
 agbenchmark/challenges/retrieval/r1/data.json |  7 +-
 .../challenges/retrieval/r1/r1_test.py        | 18 +----
 agbenchmark/challenges/retrieval/r2/data.json |  7 +-
 .../challenges/retrieval/r2/r2_test.py        | 18 +----
 agbenchmark/challenges/retrieval/r3/data.json | 25 +++++--
 .../challenges/retrieval/r3/r3_test.py        | 17 +----
 agbenchmark/challenges/retrieval/retrieval.py |  8 ---
 agbenchmark/conftest.py                       | 66 ++++++++++++++-----
 agbenchmark/mocks/mock_manager.py             | 28 --------
 agbenchmark/mocks/tests/basic_mocks.py        | 12 ----
 agbenchmark/start_benchmark.py                | 48 +++++++-------
 .../tests/basic_abilities/basic_challenge.py  |  8 ---
 .../read_file/read_file_test.py               | 24 -------
 .../basic_abilities/remember_context_test.py  |  0
 .../write_file/write_file_test.py             | 25 -------
 agent/Auto-GPT                                |  2 +-
 agent/gpt-engineer                            |  2 +-
 agent/smol-developer                          |  2 +-
 config.json                                   |  6 +-
 pyproject.toml                                |  2 +-
 regression_tests.json                         | 60 ++++++++---------
 50 files changed, 220 insertions(+), 462 deletions(-)
 rename agbenchmark/{tests/regression => }/RegressionManager.py (100%)
 delete mode 100644 agbenchmark/challenges/code/code.py
 rename agbenchmark/{tests/basic_abilities => challenges/interface}/browse_test.py (100%)
 rename agbenchmark/{tests/basic_abilities => challenges/interface}/read_file/artifacts_in/file_to_check.txt (100%)
 rename agbenchmark/{tests/basic_abilities => challenges/interface}/read_file/artifacts_out/file_to_check.txt (100%)
 rename agbenchmark/{tests/basic_abilities => challenges/interface}/read_file/data.json (72%)
 create mode 100644 agbenchmark/challenges/interface/read_file/read_file_test.py
 rename agbenchmark/{tests/basic_abilities => challenges/interface}/write_file/artifacts_out/random_file.txt (100%)
 rename agbenchmark/{tests/basic_abilities => challenges/interface}/write_file/data.json (72%)
 create mode 100644 agbenchmark/challenges/interface/write_file/write_file_test.py
 delete mode 100644 agbenchmark/challenges/memory/memory.py
 delete mode 100644 agbenchmark/challenges/retrieval/retrieval.py
 delete mode 100644 agbenchmark/mocks/mock_manager.py
 delete mode 100644 agbenchmark/mocks/tests/basic_mocks.py
 delete mode 100644 agbenchmark/tests/basic_abilities/basic_challenge.py
 delete mode 100644 agbenchmark/tests/basic_abilities/read_file/read_file_test.py
 delete mode 100644 agbenchmark/tests/basic_abilities/remember_context_test.py
 delete mode 100644 agbenchmark/tests/basic_abilities/write_file/write_file_test.py

diff --git a/.gitignore b/.gitignore
index 3581dc93..7d0419ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-agbenchmark/mocks/workspace/
+agbenchmark/workspace/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/agbenchmark/README.md b/agbenchmark/README.md
index 01f602dc..42e2bd4d 100644
--- a/agbenchmark/README.md
+++ b/agbenchmark/README.md
@@ -53,8 +53,7 @@ import os
 class TestWriteFile(BasicChallenge):
     """Testing if LLM can write to a file"""
 
-    @pytest.mark.depends(on=[], name="basic_write_file")
-    def test_method(self, workspace):
+    def test_method(self, config):
         # implement scoring logic by looking at workspace
 ```
@@ -82,7 +81,7 @@ Add the below to create a file in the workspace prior to running a challenge. On
 
 ## Workspace
 
-If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automitcally set on config
+If `--mock` flag is used it is at `agbenchmark/workspace`. Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automitcally set on config
 
 #### Dataset
diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/RegressionManager.py
similarity index 100%
rename from agbenchmark/tests/regression/RegressionManager.py
rename to agbenchmark/RegressionManager.py
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index 4244fa08..1d43577c 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -3,37 +3,27 @@ import shutil
 import subprocess
 import sys
 import time
-from typing import Any, Dict, Optional
+from typing import Any, Dict
 
 from dotenv import load_dotenv
 
-from agbenchmark.mocks.mock_manager import MockManager
-
 load_dotenv()
 
-MOCK_FLAG = os.getenv("MOCK_TEST")
+mock_test_str = os.getenv("MOCK_TEST")
+MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False
 
 
 def run_agent(
     task: str,
-    mock_func: Optional[str],
     config: Dict[str, Any],
     challenge_location: str,
 ) -> None:
     """Calling to get a response"""
 
-    if MOCK_FLAG == "True":
+    if MOCK_FLAG:
         copy_artifacts_into_workspace(
             config["workspace"], "artifacts_out", challenge_location
         )
-        if mock_func is None:
-            print("No mock provided")
-            return
-        mock_manager = MockManager(
-            task, config
-        )  # workspace doesn't need to be passed in, stays the same
-        print("Server unavailable, using mock", mock_func)
-        mock_manager.delegate(mock_func)
     else:
         timeout = config["cutoff"]
         print(
@@ -99,6 +89,3 @@ def copy_artifacts_into_workspace(
         full_file_name = os.path.join(source_dir, file_name)
         if os.path.isfile(full_file_name):
             shutil.copy(full_file_name, workspace)
-
-
-ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py
index d7e1c896..ddf69f42 100644
--- a/agbenchmark/challenge.py
+++ b/agbenchmark/challenge.py
@@ -4,9 +4,8 @@ import os
 import subprocess
 import types
 from abc import ABC, ABCMeta
-from typing import Any, Dict, List, Optional, Tuple, Type, cast
+from typing import Any, Dict, List, Tuple, Type, cast
 
-import pytest
 from dotenv import load_dotenv
 
 from agbenchmark.challenges.define_task_types import ChallengeData, Ground
@@ -19,7 +18,6 @@ MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
 
 class ChallengeMeta(ABCMeta):
     def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:
-
         super().__init__(name, bases, dct)
         try:
             frame = cast(types.FrameType, inspect.currentframe())
@@ -40,18 +38,13 @@ class Challenge(ABC, metaclass=ChallengeMeta):
     @property
     def data(self) -> ChallengeData:
         file_path = f"{self.CHALLENGE_LOCATION}/data.json"
-        Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path)
+        if file_path not in Challenge._data_cache:
+            Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path)
         return Challenge._data_cache[file_path]
 
-    @property
-    def mock(self) -> Optional[str]:
-        return self.data.mock.mock_func if self.data.mock else None
-
     @property
     def task(self) -> str:
-        return str(
-            self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task
-        )
+        return self.data.task
 
     @property
     def dependencies(self) -> list:
@@ -64,17 +57,8 @@ class Challenge(ABC, metaclass=ChallengeMeta):
             config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION
         )
 
-        run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION)
+        run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION)
 
-    @property
-    def name(self) -> str:
-        return self.data.name
-
-    @pytest.mark.parametrize(
-        "challenge_data",
-        [data],
-        indirect=True,
-    )
     def test_method(self, config: Dict[str, Any]) -> None:
         raise NotImplementedError
 
@@ -151,3 +135,16 @@ class Challenge(ABC, metaclass=ChallengeMeta):
         )
 
         return 1.0
+
+    def get_scores(self, config: Dict[str, Any]) -> List[float]:
+        files_contents = self.get_artifacts_out(
+            config["workspace"], self.data.ground.files
+        )
+
+        scores = []
+        for file_content in files_contents:
+            score = self.scoring(file_content, self.data.ground)
+            print("Your score is:", score)
+            scores.append(score)
+
+        return scores
diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md
index 2d782d1f..305cd28f 100644
--- a/agbenchmark/challenges/README.md
+++ b/agbenchmark/challenges/README.md
@@ -25,10 +25,9 @@ Example:
 ```python
 {
-  "name": "basic_write_file",
   "category": ["basic"],
   "task": "Print the the capital of America to a .txt file",
-  "dependencies": [],
+  "dependencies": ["TestWriteFile"], # the class name of the test
   "ground": {
     "answer": "Washington",
     "should_contain": ["Washington"],
@@ -36,10 +35,6 @@ Example:
     "files": [".txt"],
     "type": "file"
   },
-  "mock": {
-    "mock_func": "basic_write_file_mock",
-    "mock_task": "What is the capital of America?"
-  },
   "info": {
     "difficulty": "basic",
     "description": "Tests the writing to file",
diff --git a/agbenchmark/challenges/code/code.py b/agbenchmark/challenges/code/code.py
deleted file mode 100644
index 508d24a9..00000000
--- a/agbenchmark/challenges/code/code.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import pytest
-
-from agbenchmark.challenge import Challenge
-
-
-@pytest.mark.code
-class CodeChallenge(Challenge):
-    """Challenge for memory"""
diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json
index c29c3d83..6ac284b8 100644
--- a/agbenchmark/challenges/code/d1/data.json
+++ b/agbenchmark/challenges/code/d1/data.json
@@ -1,8 +1,7 @@
 {
-  "name": "debug_simple_typo_with_guidance",
   "category": ["code"],
   "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
-  "dependencies": [],
+  "dependencies": ["TestReadFile", "TestWriteFile"],
   "ground": {
     "answer": "[0, 1] [2, 5] [0, 3]",
     "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
@@ -10,10 +9,6 @@
     "files": ["test.py"],
     "type": "execute_python_code"
   },
-  "mock": {
-    "mock_func": null,
-    "mock_task": null
-  },
   "info": {
     "difficulty": "basic",
     "description": "Tests ability for the agent to debug python code with a simple typo in it.",
diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py
index 16a12ae4..d104b337 100644
--- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py
+++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py
@@ -1,25 +1,13 @@
 from typing import Any, Dict
 
-import pytest
-
-from agbenchmark.challenges.code.code import CodeChallenge
+from agbenchmark.challenge import Challenge
 
 
-class TestDebugSimpleTypoWithGuidance(CodeChallenge):
+class TestDebugSimpleTypoWithGuidance(Challenge):
     """The first memory challenge"""
 
-    @pytest.mark.depends(name="test_debug_simple_typo_with_guidance")
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)
 
-        files_contents = self.get_artifacts_out(
-            config["workspace"], self.data.ground.files
-        )
-
-        scores = []
-        for file_content in files_contents:
-            score = self.scoring(file_content, self.data.ground)
-            print("Your score is:", score)
-            scores.append(score)
-
+        scores = self.get_scores(config)
         assert 1 in scores
diff --git a/agbenchmark/challenges/code/d2/d2_test.py b/agbenchmark/challenges/code/d2/d2_test.py
index 7a5988b9..b02114a7 100644
--- a/agbenchmark/challenges/code/d2/d2_test.py
+++ b/agbenchmark/challenges/code/d2/d2_test.py
@@ -1,28 +1,14 @@
 from typing import Any, Dict
 
-import pytest
-
-from agbenchmark.challenges.code.code import CodeChallenge
+from agbenchmark.challenge import Challenge
 
 
-class TestDebugSimpleTypoWithoutGuidance(CodeChallenge):
+class TestDebugSimpleTypoWithoutGuidance(Challenge):
     """The first memory challenge"""
 
-    @pytest.mark.depends(
-        name="test_debug_simple_typo_without_guidance",
-        depends=["test_debug_simple_typo_with_guidance"],
-    )
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)
 
-        files_contents = self.get_artifacts_out(
-            config["workspace"], self.data.ground.files
-        )
-
-        scores = []
-        for file_content in files_contents:
-            score = self.scoring(file_content, self.data.ground)
-            print("Your score is:", score)
-            scores.append(score)
+        scores = self.get_scores(config)
 
         assert 1 in scores
diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json
index 6003055a..3de5111f 100644
--- a/agbenchmark/challenges/code/d2/data.json
+++ b/agbenchmark/challenges/code/d2/data.json
@@ -1,8 +1,7 @@
 {
-  "name": "debug_simple_typo_without_guidance",
   "category": ["code"],
   "task": "Make test.py run without errors.",
-  "dependencies": [],
+  "dependencies": ["TestDebugSimpleTypoWithGuidance"],
   "ground": {
     "answer": "[0, 1] [2, 5] [0, 3]",
     "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
@@ -10,12 +9,8 @@
     "files": ["test.py"],
     "type": "execute_python_code"
   },
-  "mock": {
-    "mock_func": null,
-    "mock_task": null
-  },
   "info": {
-    "difficulty": "basic",
+    "difficulty": "medium",
     "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
     "side_effects": ["tests if there is in fact an LLM attached"]
   }
diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py
index f84df126..308cb5ea 100644
--- a/agbenchmark/challenges/define_task_types.py
+++ b/agbenchmark/challenges/define_task_types.py
@@ -4,11 +4,6 @@ from typing import List, Optional
 from pydantic import BaseModel
 
 
-class Mock(BaseModel):
-    mock_func: Optional[str] = None
-    mock_task: Optional[str] = None
-
-
 class Info(BaseModel):
     difficulty: str
     description: str
@@ -24,12 +19,10 @@ class Ground(BaseModel):
 
 
 class ChallengeData(BaseModel):
-    name: str
     category: List[str]
     task: str
     dependencies: List[str]
     ground: Ground
-    mock: Optional[Mock] = None
     info: Info
 
     def serialize(self, path: str) -> None:
diff --git a/agbenchmark/tests/basic_abilities/browse_test.py b/agbenchmark/challenges/interface/browse_test.py
similarity index 100%
rename from agbenchmark/tests/basic_abilities/browse_test.py
rename to agbenchmark/challenges/interface/browse_test.py
diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt b/agbenchmark/challenges/interface/read_file/artifacts_in/file_to_check.txt
similarity index 100%
rename from agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt
rename to agbenchmark/challenges/interface/read_file/artifacts_in/file_to_check.txt
diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt b/agbenchmark/challenges/interface/read_file/artifacts_out/file_to_check.txt
similarity index 100%
rename from agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt
rename to agbenchmark/challenges/interface/read_file/artifacts_out/file_to_check.txt
diff --git a/agbenchmark/tests/basic_abilities/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json
similarity index 72%
rename from agbenchmark/tests/basic_abilities/read_file/data.json
rename to agbenchmark/challenges/interface/read_file/data.json
index 7463d22f..dd399fab 100644
--- a/agbenchmark/tests/basic_abilities/read_file/data.json
+++ b/agbenchmark/challenges/interface/read_file/data.json
@@ -1,17 +1,14 @@
 {
-  "name": "basic_read_file",
-  "category": ["basic"],
+  "name": "ReadFile",
+  "category": ["interface"],
   "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
-  "dependencies": ["basic_write_file"],
+  "dependencies": ["TestWriteFile"],
   "ground": {
     "answer": "random string Hello World!",
     "should_contain": ["random string", "Hello World!"],
     "files": ["file_to_check.txt"],
     "type": "file"
   },
-  "mock": {
-    "mock_func": "basic_read_file_mock"
-  },
   "info": {
     "description": "This reads the file quickly",
     "difficulty": "basic",
diff --git a/agbenchmark/challenges/interface/read_file/read_file_test.py b/agbenchmark/challenges/interface/read_file/read_file_test.py
new file mode 100644
index 00000000..591d0a74
--- /dev/null
+++ b/agbenchmark/challenges/interface/read_file/read_file_test.py
@@ -0,0 +1,12 @@
+from typing import Any, Dict
+
+from agbenchmark.challenge import Challenge
+
+
+class TestReadFile(Challenge):
+    """Testing if LLM can read a file"""
+
+    def test_method(self, config: Dict[str, Any]) -> None:
+        self.setup_challenge(config)
+        scores = self.get_scores(config)
+        assert 1 in scores
diff --git a/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt b/agbenchmark/challenges/interface/write_file/artifacts_out/random_file.txt
similarity index 100%
rename from agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt
rename to agbenchmark/challenges/interface/write_file/artifacts_out/random_file.txt
diff --git a/agbenchmark/tests/basic_abilities/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json
similarity index 72%
rename from agbenchmark/tests/basic_abilities/write_file/data.json
rename to agbenchmark/challenges/interface/write_file/data.json
index 9232a45a..b3e4b6f0 100644
--- a/agbenchmark/tests/basic_abilities/write_file/data.json
+++ b/agbenchmark/challenges/interface/write_file/data.json
@@ -1,6 +1,6 @@
 {
-  "name": "basic_write_file",
-  "category": ["basic"],
+  "name": "WriteFile",
+  "category": ["interface"],
   "task": "Print the the capital of America to a .txt file",
   "dependencies": [],
   "ground": {
@@ -10,10 +10,6 @@
     "files": [".txt"],
     "type": "file"
   },
-  "mock": {
-    "mock_func": "basic_write_file_mock",
-    "mock_task": "What is the capital of America?"
-  },
   "info": {
     "difficulty": "basic",
     "description": "Tests the writing to file",
diff --git a/agbenchmark/challenges/interface/write_file/write_file_test.py b/agbenchmark/challenges/interface/write_file/write_file_test.py
new file mode 100644
index 00000000..4a52b097
--- /dev/null
+++ b/agbenchmark/challenges/interface/write_file/write_file_test.py
@@ -0,0 +1,13 @@
+from typing import Any, Dict
+
+from agbenchmark.challenge import Challenge
+
+
+class TestWriteFile(Challenge):
+    """Testing if LLM can write to a file"""
+
+    def test_method(self, config: Dict[str, Any]) -> None:
+        self.setup_challenge(config)
+
+        scores = self.get_scores(config)
+        assert 1 in scores
diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json
index c7d44190..f771a266 100644
--- a/agbenchmark/challenges/memory/m1/data.json
+++ b/agbenchmark/challenges/memory/m1/data.json
@@ -1,8 +1,7 @@
 {
-  "name": "basic_memory",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
-  "dependencies": [],
+  "dependencies": ["TestReadFile", "TestWriteFile"],
   "ground": {
     "answer": "2314",
     "should_contain": ["2314"],
diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py
index 9e5e0a77..0fc537ee 100644
--- a/agbenchmark/challenges/memory/m1/m1_test.py
+++ b/agbenchmark/challenges/memory/m1/m1_test.py
@@ -1,25 +1,13 @@
 from typing import Any, Dict
 
-import pytest
-
-from agbenchmark.challenges.memory.memory import MemoryChallenge
+from agbenchmark.challenge import Challenge
 
 
-class TestBasicMemory(MemoryChallenge):
+class TestBasicMemory(Challenge):
     """The first memory challenge"""
 
-    @pytest.mark.depends(name="test_basic_memory")
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)
 
-        files_contents = self.get_artifacts_out(
-            config["workspace"], self.data.ground.files
-        )
-
-        scores = []
-        for file_content in files_contents:
-            score = self.scoring(file_content, self.data.ground)
-            print("Your score is:", score)
-            scores.append(score)
-
+        scores = self.get_scores(config)
         assert 1 in scores
diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json
index 6e898298..998e894b 100644
--- a/agbenchmark/challenges/memory/m2/data.json
+++ b/agbenchmark/challenges/memory/m2/data.json
@@ -1,8 +1,7 @@
 {
-  "name": "remember_multiple_ids",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
-  "dependencies": [],
+  "dependencies": ["TestBasicMemory"],
   "ground": {
     "answer": "3145\n3791\n9317\n9471",
     "should_contain": ["3145", "3791", "9317", "9471"],
@@ -10,10 +9,6 @@
     "files": [".txt"],
     "type": "file"
   },
-  "mock": {
-    "mock_func": "remember_multiple_ids_mock",
-    "mock_task": "Follow the instructions in the instructions_1.txt file"
-  },
   "info": {
     "difficulty": "basic",
     "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.",
diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py
index 6ba38dad..c88f2883 100644
--- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py
+++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py
@@ -1,27 +1,13 @@
 from typing import Any, Dict
 
-import pytest
-
-from agbenchmark.challenges.memory.memory import MemoryChallenge
+from agbenchmark.challenge import Challenge
 
 
-class TestRememberMultipleIds(MemoryChallenge):
+class TestRememberMultipleIds(Challenge):
     """The first memory challenge"""
 
-    @pytest.mark.depends(
-        name="test_remember_multiple_ids", depends=["test_basic_memory"]
-    )
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)
 
-        files_contents = self.get_artifacts_out(
-            config["workspace"], self.data.ground.files
-        )
-
-        scores = []
-        for file_content in files_contents:
-            score = self.scoring(file_content, self.data.ground)
-            print("Your score is:", score)
-            scores.append(score)
-
+        scores = self.get_scores(config)
         assert 1 in scores
diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json
index 2a4f06ff..d5d95b1d 100644
--- a/agbenchmark/challenges/memory/m3/data.json
+++ b/agbenchmark/challenges/memory/m3/data.json
@@ -1,8 +1,7 @@
 {
-  "name": "remember_multiple_ids_with_noise_mock",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
-  "dependencies": [],
+  "dependencies": ["TestRememberMultipleIds"],
   "ground": {
     "answer": "3145\n3791\n9317\n9471",
     "should_contain": ["3145", "3791", "9317", "9471"],
@@ -10,10 +9,6 @@
     "files": [".txt"],
     "type": "file"
   },
-  "mock": {
-    "mock_func": "remember_multiple_ids_mock",
-    "mock_task": "Follow the instructions in the instructions_1.txt file"
-  },
   "info": {
     "difficulty": "medium",
     "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py
index 037a6929..0e35dd2f 100644
--- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py
+++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py
@@ -1,28 +1,13 @@
 from typing import Any, Dict
 
-import pytest
-
-from agbenchmark.challenges.memory.memory import MemoryChallenge
+from agbenchmark.challenge import Challenge
 
 
-class TestRememberMultipleIdsWithNoise(MemoryChallenge):
+class TestRememberMultipleIdsWithNoise(Challenge):
     """The first memory challenge"""
 
-    @pytest.mark.depends(
-        name="test_remember_multiple_ids_with_noise",
-        depends=["test_remember_multiple_ids"],
-    )
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)
 
-        files_contents = self.get_artifacts_out(
-            config["workspace"], self.data.ground.files
-        )
-
-        scores = []
-        for file_content in files_contents:
-            score = self.scoring(file_content, self.data.ground)
-            print("Your score is:", score)
-            scores.append(score)
-
+        scores = self.get_scores(config)
         assert 1 in scores
diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json
index adfd8e33..49831537 100644
--- a/agbenchmark/challenges/memory/m4/data.json
+++ b/agbenchmark/challenges/memory/m4/data.json
@@ -1,8 +1,7 @@
 {
-  "name": "remember_multiple_phrases_with_noise_mock",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
-  "dependencies": [],
+  "dependencies": ["TestRememberMultipleIdsWithNoise"],
   "ground": {
     "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
     "should_contain": [
@@ -15,10 +14,6 @@
     "files": [".txt"],
     "type": "file"
   },
-  "mock": {
-    "mock_func": "remember_multiple_phrases_with_noise_mock",
-    "mock_task": "Follow the instructions in the instructions_1.txt file"
-  },
   "info": {
     "difficulty": "medium",
     "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py
index 2c931af8..4c4bdce5 100644
--- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py
+++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py
@@ -1,28 +1,13 @@
 from typing import Any, Dict
 
-import pytest
-
-from agbenchmark.challenges.memory.memory import MemoryChallenge
+from agbenchmark.challenge import Challenge
 
 
-class TestRememberMultiplePhrasesWithNoise(MemoryChallenge):
+class TestRememberMultiplePhrasesWithNoise(Challenge):
     """The first memory challenge"""
 
-    @pytest.mark.depends(
-        name="test_remember_multiple_phrases_with_noise",
-        depends=["test_remember_multiple_ids_with_noise"],
-    )
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)
 
-        files_contents = self.get_artifacts_out(
-            config["workspace"], self.data.ground.files
-        )
-
-        scores = []
-        for file_content in files_contents:
-            score = self.scoring(file_content, self.data.ground)
-            print("Your score is:", score)
-            scores.append(score)
-
+        scores = self.get_scores(config)
         assert 1 in scores
diff --git a/agbenchmark/challenges/memory/memory.py b/agbenchmark/challenges/memory/memory.py
deleted file mode 100644
index 429bef23..00000000
--- a/agbenchmark/challenges/memory/memory.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import pytest
-
-from agbenchmark.challenge import Challenge
-
-
-@pytest.mark.memory
-class MemoryChallenge(Challenge):
-    """Challenge for memory"""
diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json
index 8fca01b7..6e1344b8 100644
--- a/agbenchmark/challenges/retrieval/r1/data.json
+++ b/agbenchmark/challenges/retrieval/r1/data.json
@@ -1,8 +1,7 @@
 {
-  "name": "basic_information_retrieval",
   "category": ["retrieval"],
   "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.",
-  "dependencies": [],
+  "dependencies": ["TestWriteFile"],
   "ground": {
     "answer": "£25.89",
     "should_contain": ["25.89"],
@@ -10,10 +9,6 @@
     "files": [".txt"],
     "type": "file"
   },
-  "mock": {
-    "mock_func": "basic_retrieval_mock",
-    "mock_task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file."
-  },
   "info": {
     "difficulty": "basic",
     "description": "Tests ability to retrieve information from a website.",
diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py
index 68d3de4e..9845a7b2 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_test.py
+++ b/agbenchmark/challenges/retrieval/r1/r1_test.py
@@ -1,25 +1,13 @@
 from typing import Any, Dict
 
-import pytest
-
-from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge
+from agbenchmark.challenge import Challenge
 
 
-class TestRetrieval(RetrievalChallenge):
+class TestRetrieval(Challenge):
     """The first information-retrieval challenge"""
 
-    @pytest.mark.depends(name="test_retrieval")
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)
 
-        files_contents = self.get_artifacts_out(
-            config["workspace"], self.data.ground.files
-        )
-
-        scores = []
-        for file_content in files_contents:
-            score = self.scoring(file_content, self.data.ground)
-            print("Your score is:", score)
-            scores.append(score)
-
+        scores = self.get_scores(config)
         assert 1 in scores
diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json
index 3c388f19..05846b9f 100644
--- a/agbenchmark/challenges/retrieval/r2/data.json
+++ b/agbenchmark/challenges/retrieval/r2/data.json
@@ -1,8 +1,7 @@
 {
-  "name": "basic_information_retrieval",
   "category": ["retrieval"],
   "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
-  "dependencies": [],
+  "dependencies": ["TestRetrieval"],
   "ground": {
     "answer": "81,462",
     "should_contain": ["81,462"],
@@ -10,10 +9,6 @@
     "files": [".txt"],
     "type": "file"
   },
-  "mock": {
-    "mock_func": "basic_retrieval_2_mock",
-    "mock_task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)."
-  },
   "info": {
     "difficulty": "basic",
     "description": "Tests ability to retrieve information.",
diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py
index 5a1a2069..f0f13ffb 100644
--- a/agbenchmark/challenges/retrieval/r2/r2_test.py
+++ b/agbenchmark/challenges/retrieval/r2/r2_test.py
@@ -1,25 +1,13 @@
 from typing import Any, Dict
 
-import pytest
-
-from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge
+from agbenchmark.challenge import Challenge
 
 
-class TestRetrieval2(RetrievalChallenge):
+class TestRetrieval2(Challenge):
     """The first information-retrieval challenge"""
 
-    @pytest.mark.depends(on=["test_retrieval"], name="test_retrieval_2")
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)
 
-        files_contents = self.get_artifacts_out(
-            config["workspace"], self.data.ground.files
-        )
-
-        scores = []
-        for file_content in files_contents:
-            score = self.scoring(file_content, self.data.ground)
-            print("Your score is:", score)
-            scores.append(score)
-
+        scores = self.get_scores(config)
         assert 1 in scores
diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json
index 41545615..763c963e 100644
--- a/agbenchmark/challenges/retrieval/r3/data.json
+++ b/agbenchmark/challenges/retrieval/r3/data.json
@@ -1,19 +1,30 @@
 {
-  "name": "basic_information_retrieval",
   "category": ["retrieval"],
   "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
-  "dependencies": [],
+  "dependencies": ["TestRetrieval2"],
   "ground": {
     "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
-    "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"],
+    "should_contain": [
+      "15",
+      "112",
+      "117",
+      "204",
+      "413",
+      "2,014",
+      "3,198",
+      "4,046",
+      "7,000",
+      "11,759",
+      "21,461",
+      "24,578",
+      "31,536",
+      "53,823",
+      "81,462"
+    ],
     "should_not_contain": [],
     "files": [".txt"],
     "type": "file"
   },
-  "mock": {
-    "mock_func": "basic_retrieval_3_mock",
-    "mock_task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)."
-  },
   "info": {
     "difficulty": "basic",
     "description": "Tests ability to retrieve information.",
diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py
index c4b4bcf1..5887c0b4 100644
--- a/agbenchmark/challenges/retrieval/r3/r3_test.py
+++ b/agbenchmark/challenges/retrieval/r3/r3_test.py
@@ -1,25 +1,14 @@
 from typing import Any, Dict
 
-import pytest
-
-from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge
+from agbenchmark.challenge import Challenge
 
 
-class TestRetrieval3(RetrievalChallenge):
+class TestRetrieval3(Challenge):
     """The first information-retrieval challenge"""
 
-    @pytest.mark.depends(on=["test_retrieval_2"], name="test_retrieval_3")
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)
 
-        files_contents = self.get_artifacts_out(
-            config["workspace"], self.data.ground.files
-        )
-
-        scores = []
-        for file_content in files_contents:
-            score = self.scoring(file_content, self.data.ground)
-            print("Your score is:", score)
-            scores.append(score)
+        scores = self.get_scores(config)
 
         assert 1 in scores
diff --git a/agbenchmark/challenges/retrieval/retrieval.py b/agbenchmark/challenges/retrieval/retrieval.py
deleted file mode 100644
index 891cccef..00000000
--- a/agbenchmark/challenges/retrieval/retrieval.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import pytest
-
-from agbenchmark.challenge import Challenge
-
-
-@pytest.mark.retrieval
-class RetrievalChallenge(Challenge):
-    """Challenge for information-retrieval"""
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 40457fb6..ffbb2620 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -2,12 +2,16 @@ import json
 import os
 import shutil
 from pathlib import Path  # noqa
-from typing import Any, Dict, Generator, List
+from typing import Any, Dict, Generator
 
 import pytest
 
-from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH
-from agbenchmark.tests.regression.RegressionManager import RegressionManager
+from agbenchmark.RegressionManager import RegressionManager
+from agbenchmark.start_benchmark import (
+    CONFIG_PATH,
+    REGRESSION_TESTS_PATH,
+    get_regression_data,
+)
 
 
 def resolve_workspace(config: Dict[str, Any]) -> str:
@@ -37,7 +41,7 @@ def config(request: Any) -> None:
         config = json.load(f)
 
     if request.config.getoption("--mock"):
-        config["workspace"] = "agbenchmark/mocks/workspace"
+        config["workspace"] = "agbenchmark/workspace"
     elif isinstance(config["workspace"], str):
         config["workspace"] = resolve_workspace(config)
     else:  # it's a input output dict
@@ -77,9 +81,22 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
 
 def pytest_addoption(parser: Any) -> None:
     parser.addoption("--mock", action="store_true", default=False)
+    parser.addoption("--improve", action="store_true", default=False)
+    parser.addoption("--maintain", action="store_true", default=False)
 
 
-regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
+@pytest.fixture(autouse=True)
+def check_regression(request: Any) -> None:
+    test_name = request.node.parent.name
+    data = get_regression_data()
+
+    # Check if the test name exists in the regression tests
+    if request.config.getoption("--improve") and data.get(test_name, None):
+        pytest.skip("Skipping test because it's a regression test and --improve is set")
+    elif request.config.getoption("--maintain") and not data.get(test_name, None):
+        pytest.skip(
+            "Skipping test because it's not a regression test and --maintain is set"
+        )
 
 
 # this is to get the challenge_data from every test
@@ -88,6 +105,9 @@ def challenge_data(request: Any) -> None:
     return request.param
 
 
+regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
+
+
 def pytest_runtest_makereport(item: Any, call: Any) -> None:
     if call.when == "call":
         challenge_data = item.funcargs.get("challenge_data", None)
@@ -109,16 +129,6 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
             regression_manager.remove_test(item.nodeid.split("::")[1])
 
 
-def pytest_collection_modifyitems(items: List[Any]) -> None:
-    """Called once all test items are collected. Used
-    to add regression and depends markers to collected test items."""
-    for item in items:
-        # regression add
-        if item.nodeid.split("::")[1] in regression_manager.tests:
-            print(regression_manager.tests)
-            item.add_marker(pytest.mark.regression)
-
-
 def pytest_sessionfinish() -> None:
     """Called at the end of the session to save regression tests"""
     regression_manager.save()
@@ -135,3 +145,29 @@ def pytest_generate_tests(metafunc: Any) -> None:
 
     # Add the parameters to the test function
     metafunc.parametrize("challenge_data", [params], indirect=True)
+
+
+# this is adding the dependency marker and category markers automatically from the json
+def pytest_collection_modifyitems(items: Any, config: Any) -> None:
+    data = get_regression_data()
+
+    for item in items:
+        # Assuming item.cls is your test class
+        test_class_instance = item.cls()
+
+        # Then you can access your properties
+        name = item.parent.cls.__name__
+        dependencies = test_class_instance.data.dependencies
+
+        # Filter dependencies if they exist in regression data if its an improvement test
+        if config.getoption("--improve"):
+            dependencies = [dep for dep in dependencies if not data.get(dep, None)]
+
+        categories = test_class_instance.data.category
+
+        # Add depends marker dynamically
+        item.add_marker(pytest.mark.depends(on=dependencies, name=name))
+
+        # Add category marker dynamically
+        for category in categories:
+            item.add_marker(getattr(pytest.mark, category))
diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py
deleted file mode 100644
index 3a227e49..00000000
--- a/agbenchmark/mocks/mock_manager.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from typing import Any, Dict, Optional
-
-import agbenchmark.mocks.tests.basic_mocks as basic_mocks
-
-
-class MockManager:
-    def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None:
-        self.task = task
-        self.workspace = config["workspace"]
-        self.modules = [basic_mocks]
-
-    def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None:
-        if hasattr(self, mock_function_name):
-            # Check if the mock function is an attribute of this class
-            getattr(self, mock_function_name)(*args, **kwargs)
-        elif mock_function_name in globals():
-            # Check if the function is imported in the file
-            func = globals()[mock_function_name]
-            func(self.task, self.workspace, *args, **kwargs)
-        elif len(self.modules) > 0:
-            # checks if function is in imported modules
-            for module in self.modules:
-                if hasattr(module, mock_function_name):
-                    func = getattr(module, mock_function_name)
-                    func(self.task, self.workspace, *args, **kwargs)
-                    return
-        else:
-            raise ValueError(f"No such mock: {mock_function_name}")
diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py
deleted file mode 100644
index e4a1dedc..00000000
--- a/agbenchmark/mocks/tests/basic_mocks.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from agbenchmark.challenge import Challenge
-
-
-def example_mock(task: str, workspace: str) -> None:
-    """
-    This mock writes to a file (creates one if it doesn't exist)
-    """
-    Challenge.write_to_file(
-        workspace,
-        "file_to_check.txt",
-        "This is an example showing how you can use mocks but here you can use artifacts_out folder instead of a mock.",
-    )
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 295bbf4b..f78e86a1 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -2,11 +2,11 @@ import json
 import os
 import sys
 from pathlib import Path
-from typing import List
+from typing import Any
 
 import click
 import pytest
-from dotenv import load_dotenv, set_key
+from dotenv import load_dotenv
 
 load_dotenv()
 
@@ -26,10 +26,17 @@ def cli() -> None:
 @cli.command()
 @click.option("--category", default=None, help="Specific category to run")
 @click.option("--maintain", is_flag=True, help="Runs only regression tests")
+@click.option("--improve", is_flag=True, help="Run only non-regression tests")
 @click.option("--mock", is_flag=True, help="Run with mock")
-def start(category: str, maintain: bool, mock: bool) -> int:
+def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
     """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
     # Check if configuration file exists and is not empty
+    if maintain and improve:
+        print(
+            "Error: You can't use both --maintain and --improve at the same time. Please choose one."
+        )
+        return 1
+
     if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
         config = {}
 
@@ -55,7 +62,7 @@ def start(category: str, maintain: bool, mock: bool) -> int:
     with open(CONFIG_PATH, "r") as f:
         config = json.load(f)
 
-    set_key(".env", "MOCK_TEST", "True" if mock else "False")
+    os.environ["MOCK_TEST"] = "True" if mock else "False"
 
     if not os.path.exists(REGRESSION_TESTS_PATH):
         with open(REGRESSION_TESTS_PATH, "a"):
@@ -65,42 +72,31 @@ def start(category: str, maintain: bool, mock: bool) -> int:
     for key, value in config.items():
         print(f"{key}: {value}")
 
-    print("Starting benchmark tests...", category)
-    tests_to_run = []
     pytest_args = ["-vs"]
     if category:
         pytest_args.extend(["-m", category])
+        print("Starting benchmark tests ", category)
     else:
-        if maintain:
-            print("Running all regression tests")
-            tests_to_run = get_regression_tests()
-        else:
-            print("Running all categories")
+        print("Running all categories")
+
+    if maintain:
+        print("Running only regression tests")
+        pytest_args.append("--maintain")
+    elif improve:
+        print("Running only non-regression tests")
+        pytest_args.append("--improve")
 
     if mock:
         pytest_args.append("--mock")
 
-    # Run pytest with the constructed arguments
-    if not tests_to_run:
-        tests_to_run = [str(CURRENT_DIRECTORY)]
-    pytest_args.extend(tests_to_run)
-
     return sys.exit(pytest.main(pytest_args))
 
 
-def get_regression_tests() -> List[str]:
-    if not Path(REGRESSION_TESTS_PATH).exists():
-        with open(REGRESSION_TESTS_PATH, "w") as file:
-            json.dump({}, file)
-
+def get_regression_data() -> Any:
     with open(REGRESSION_TESTS_PATH, "r") as file:
         data = json.load(file)
 
-    regression_tests = [
-        str(CURRENT_DIRECTORY / ".." / value["test"]) for key, value in data.items()
-    ]
-
-    return regression_tests
+    return data
 
 
 if __name__ == "__main__":
diff --git a/agbenchmark/tests/basic_abilities/basic_challenge.py b/agbenchmark/tests/basic_abilities/basic_challenge.py
deleted file mode 100644
index 8b3a4db1..00000000
--- a/agbenchmark/tests/basic_abilities/basic_challenge.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import pytest
-
-from agbenchmark.challenge import Challenge
-
-
-@pytest.mark.basic
-class BasicChallenge(Challenge):
-    pass
diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
deleted file mode 100644
index cf5dceb6..00000000
--- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from typing import Any, Dict
-
-import pytest
-
-from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge
-
-
-class TestReadFile(BasicChallenge):
-    """Testing if LLM can read a file"""
-
-    @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file")
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-        files_contents = self.get_artifacts_out(
-            config["workspace"], self.data.ground.files
-        )
-
-        scores = []
-        for file_content in files_contents:
-            score = self.scoring(file_content, self.data.ground)
-            print("Your score is:", score)
-            scores.append(score)
-
-        assert 1 in scores
diff --git a/agbenchmark/tests/basic_abilities/remember_context_test.py b/agbenchmark/tests/basic_abilities/remember_context_test.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
deleted file mode 100644
index ba039518..00000000
--- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from typing import Any, Dict
-
-import pytest
-
-from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge
-
-
-class TestWriteFile(BasicChallenge):
-    """Testing if LLM can write to a file"""
-
-    @pytest.mark.depends(name="basic_write_file")
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        files_contents = self.get_artifacts_out(
-            config["workspace"], self.data.ground.files
-        )
-
-        scores = []
-        for file_content in files_contents:
-            score = self.scoring(file_content, self.data.ground)
-            print("Your score is:", score)
-            scores.append(score)
-
-        assert 1 in scores
diff --git a/agent/Auto-GPT b/agent/Auto-GPT
index dd65cc25..5a36e43b 160000
--- a/agent/Auto-GPT
+++ b/agent/Auto-GPT
@@ -1 +1 @@
-Subproject commit dd65cc256ca72cb199fe8c5d6ae31c23a7acee62
+Subproject commit 5a36e43b782fdaef8a7270109f8347f0323211d2
diff --git a/agent/gpt-engineer b/agent/gpt-engineer
index 155ea895..fd705f89 160000
--- a/agent/gpt-engineer
+++ b/agent/gpt-engineer
@@ -1 +1 @@
-Subproject commit 155ea895eb5f7e44ed8647b335d90a03b5ffb06d
+Subproject commit fd705f89afd53469c91935a9cae7b92a564025eb
diff --git a/agent/smol-developer b/agent/smol-developer
index 5a3ad431..a1e4a9ff 160000
--- a/agent/smol-developer
+++ b/agent/smol-developer
@@ -1 +1 @@
-Subproject commit 5a3ad43103b238b9c8f2a2acceff250888be263e
+Subproject commit a1e4a9ff3a75909c4a892e409a55f86a2c57b7c6
diff --git a/config.json b/config.json
index 88526a13..8bbcebdb 100644
--- a/config.json
+++ b/config.json
@@ -1,6 +1,6 @@
 {
-  "workspace": "projects/my-new-project/workspace",
-  "entry_path": "agent/gpt-engineer/benchmarks.py",
-  "home_path": "agent/gpt-engineer",
+  "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+  "entry_path": "benchmarks.py",
+  "home_path": "agent/mini-agi",
   "cutoff": 60
 }
diff --git a/pyproject.toml b/pyproject.toml
index 33a8671c..a8f4f8de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ testpaths = [
 markers = [
     "retrieval",
     "regression",
-    "basic",
+    "interface",
     "code",
     "memory"
 ]
diff --git a/regression_tests.json b/regression_tests.json
index 9714d42a..44334801 100644
--- a/regression_tests.json
+++ b/regression_tests.json
@@ -1,14 +1,4 @@
 {
-    "TestDebugSimpleTypoWithGuidance": {
-        "difficulty": "basic",
-        "dependencies": [],
-        "test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py"
-    },
-    "TestDebugSimpleTypoWithoutGuidance": {
-        "difficulty": "basic",
-        "dependencies": [],
-        "test": "agbenchmark/challenges/code/d2/d2_test.py"
-    },
     "TestBasicMemory": {
         "difficulty": "basic",
         "dependencies": [],
@@ -16,44 +6,54 @@
     },
     "TestRememberMultipleIds": {
         "difficulty": "basic",
-        "dependencies": [],
+        "dependencies": [
+            "TestBasicMemory"
+        ],
         "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py"
     },
     "TestRememberMultipleIdsWithNoise": {
         "difficulty": "medium",
-        "dependencies": [],
+        "dependencies": [
+            "TestRememberMultipleIds"
+        ],
         "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
     },
+    "TestRememberMultiplePhrasesWithNoise": {
+        "difficulty": "medium",
+        "dependencies": [
+            "TestRememberMultipleIdsWithNoise"
+        ],
+        "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
+    },
     "TestRetrieval": {
         "difficulty": "basic",
         "dependencies": [],
         "test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
     },
+    "TestRetrieval2": {
+        "difficulty": "basic",
+        "dependencies": [
+            "TestRetrieval"
+        ],
+        "test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
+    },
+    "TestRetrieval3": {
+        "difficulty": "basic",
+        "dependencies": [
+            "TestRetrieval2"
+        ],
+        "test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
+    },
     "TestWriteFile": {
         "difficulty": "basic",
         "dependencies": [],
-        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
-    },
-    "TestRetrieval2": {
-        "difficulty": "basic",
-        "dependencies": [],
-        "test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
+        "test": "agbenchmark/challenges/interface/write_file/write_file_test.py"
     },
     "TestReadFile": {
         "difficulty": "basic",
         "dependencies": [
-            "basic_write_file"
+            "TestWriteFile"
        ],
-        "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py"
-    },
-    "TestRetrieval3": {
-        "difficulty": "basic",
-        "dependencies": [],
-        "test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
-    },
-    "TestRememberMultiplePhrasesWithNoise": {
-        "difficulty": "medium",
-        "dependencies": [],
-        "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
+        "test": "agbenchmark/challenges/interface/read_file/read_file_test.py"
     }
 }
\ No newline at end of file