Quality of life improvements & fixes (#75)

Author: Silen Naihin
Date: 2023-07-08 21:43:38 -04:00
Committed by: GitHub
Parent: db86ccdcb4
Commit: 69bd41f741
50 changed files with 220 additions and 462 deletions

.gitignore (2 changes)
View File

@@ -1,4 +1,4 @@
agbenchmark/mocks/workspace/
agbenchmark/workspace/
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@@ -53,8 +53,7 @@ import os
class TestWriteFile(BasicChallenge):
"""Testing if LLM can write to a file"""
@pytest.mark.depends(on=[], name="basic_write_file")
def test_method(self, workspace):
def test_method(self, config):
# implement scoring logic by looking at workspace
```
@@ -82,7 +81,7 @@ Add the below to create a file in the workspace prior to running a challenge. On
## Workspace
If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. Otherwise for mini-agi it is at `C:/Users/<name>/miniagi` - it will be automatically set on config
If `--mock` flag is used it is at `agbenchmark/workspace`. Otherwise for mini-agi it is at `C:/Users/<name>/miniagi` - it will be automatically set on config
#### Dataset

View File

@@ -3,37 +3,27 @@ import shutil
import subprocess
import sys
import time
from typing import Any, Dict, Optional
from typing import Any, Dict
from dotenv import load_dotenv
from agbenchmark.mocks.mock_manager import MockManager
load_dotenv()
MOCK_FLAG = os.getenv("MOCK_TEST")
mock_test_str = os.getenv("MOCK_TEST")
MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False
def run_agent(
task: str,
mock_func: Optional[str],
config: Dict[str, Any],
challenge_location: str,
) -> None:
"""Calling to get a response"""
if MOCK_FLAG == "True":
if MOCK_FLAG:
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
if mock_func is None:
print("No mock provided")
return
mock_manager = MockManager(
task, config
) # workspace doesn't need to be passed in, stays the same
print("Server unavailable, using mock", mock_func)
mock_manager.delegate(mock_func)
else:
timeout = config["cutoff"]
print(
@@ -99,6 +89,3 @@ def copy_artifacts_into_workspace(
full_file_name = os.path.join(source_dir, file_name)
if os.path.isfile(full_file_name):
shutil.copy(full_file_name, workspace)
ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
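
A minimal sketch of the `MOCK_TEST` handling this hunk introduces, assuming the same environment variable and python-dotenv setup as above: the raw string is normalised to a boolean once, so the old `MOCK_FLAG == "True"` comparison is no longer needed. The branch bodies below are placeholders, not the real agent invocation.

```python
# Minimal sketch, assuming the MOCK_TEST variable and dotenv usage shown above;
# the branch bodies are placeholders rather than the real agent invocation.
import os

from dotenv import load_dotenv

load_dotenv()

mock_test_str = os.getenv("MOCK_TEST")
MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False

if MOCK_FLAG:
    # real code: copy_artifacts_into_workspace(workspace, "artifacts_out", location)
    print("mock run: copy artifacts_out into the workspace")
else:
    # real code: launch the agent with config["cutoff"] as the timeout
    print("real run: invoke the agent with the configured cutoff")
```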

View File

@@ -4,9 +4,8 @@ import os
import subprocess
import types
from abc import ABC, ABCMeta
from typing import Any, Dict, List, Optional, Tuple, Type, cast
from typing import Any, Dict, List, Tuple, Type, cast
import pytest
from dotenv import load_dotenv
from agbenchmark.challenges.define_task_types import ChallengeData, Ground
@@ -19,7 +18,6 @@ MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
class ChallengeMeta(ABCMeta):
def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:
super().__init__(name, bases, dct)
try:
frame = cast(types.FrameType, inspect.currentframe())
@@ -40,18 +38,13 @@ class Challenge(ABC, metaclass=ChallengeMeta):
@property
def data(self) -> ChallengeData:
file_path = f"{self.CHALLENGE_LOCATION}/data.json"
Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path)
if file_path not in Challenge._data_cache:
Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path)
return Challenge._data_cache[file_path]
@property
def mock(self) -> Optional[str]:
return self.data.mock.mock_func if self.data.mock else None
@property
def task(self) -> str:
return str(
self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task
)
return self.data.task
@property
def dependencies(self) -> list:
@@ -64,17 +57,8 @@ class Challenge(ABC, metaclass=ChallengeMeta):
config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION
)
run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION)
run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION)
@property
def name(self) -> str:
return self.data.name
@pytest.mark.parametrize(
"challenge_data",
[data],
indirect=True,
)
def test_method(self, config: Dict[str, Any]) -> None:
raise NotImplementedError
@@ -151,3 +135,16 @@ class Challenge(ABC, metaclass=ChallengeMeta):
)
return 1.0
def get_scores(self, config: Dict[str, Any]) -> List[float]:
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
return scores
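
The new `get_scores` helper lets every challenge test collapse to the same three lines. A sketch of the resulting test shape, using a hypothetical class name; the real tests later in this diff follow exactly this pattern.

```python
# Sketch of the post-refactor test shape; TestExampleChallenge is hypothetical,
# the real tests added in this commit follow the same three lines.
from typing import Any, Dict

from agbenchmark.challenge import Challenge


class TestExampleChallenge(Challenge):
    """Example of the post-refactor challenge test."""

    def test_method(self, config: Dict[str, Any]) -> None:
        self.setup_challenge(config)      # copies artifacts_in, runs the agent
        scores = self.get_scores(config)  # reads artifacts_out and scores each file
        assert 1 in scores                # at least one ground file must score 1.0
```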

View File

@@ -25,10 +25,9 @@ Example:
```python
{
"name": "basic_write_file",
"category": ["basic"],
"task": "Print the the capital of America to a .txt file",
"dependencies": [],
"dependencies": ["TestWriteFile"], # the class name of the test
"ground": {
"answer": "Washington",
"should_contain": ["Washington"],
@@ -36,10 +35,6 @@ Example:
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_write_file_mock",
"mock_task": "What is the capital of America?"
},
"info": {
"difficulty": "basic",
"description": "Tests the writing to file",

View File

@@ -1,8 +0,0 @@
import pytest
from agbenchmark.challenge import Challenge
@pytest.mark.code
class CodeChallenge(Challenge):
"""Challenge for memory"""

View File

@@ -1,8 +1,7 @@
{
"name": "debug_simple_typo_with_guidance",
"category": ["code"],
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"dependencies": [],
"dependencies": ["TestReadFile", "TestWriteFile"],
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
@@ -10,10 +9,6 @@
"files": ["test.py"],
"type": "execute_python_code"
},
"mock": {
"mock_func": null,
"mock_task": null
},
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",

View File

@@ -1,25 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.code.code import CodeChallenge
from agbenchmark.challenge import Challenge
class TestDebugSimpleTypoWithGuidance(CodeChallenge):
class TestDebugSimpleTypoWithGuidance(Challenge):
"""The first memory challenge"""
@pytest.mark.depends(name="test_debug_simple_typo_with_guidance")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,28 +1,14 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.code.code import CodeChallenge
from agbenchmark.challenge import Challenge
class TestDebugSimpleTypoWithoutGuidance(CodeChallenge):
class TestDebugSimpleTypoWithoutGuidance(Challenge):
"""The first memory challenge"""
@pytest.mark.depends(
name="test_debug_simple_typo_without_guidance",
depends=["test_debug_simple_typo_with_guidance"],
)
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +1,7 @@
{
"name": "debug_simple_typo_without_guidance",
"category": ["code"],
"task": "Make test.py run without errors.",
"dependencies": [],
"dependencies": ["TestDebugSimpleTypoWithGuidance"],
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
@@ -10,12 +9,8 @@
"files": ["test.py"],
"type": "execute_python_code"
},
"mock": {
"mock_func": null,
"mock_task": null
},
"info": {
"difficulty": "basic",
"difficulty": "medium",
"description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
"side_effects": ["tests if there is in fact an LLM attached"]
}

View File

@@ -4,11 +4,6 @@ from typing import List, Optional
from pydantic import BaseModel
class Mock(BaseModel):
mock_func: Optional[str] = None
mock_task: Optional[str] = None
class Info(BaseModel):
difficulty: str
description: str
@@ -24,12 +19,10 @@ class Ground(BaseModel):
class ChallengeData(BaseModel):
name: str
category: List[str]
task: str
dependencies: List[str]
ground: Ground
mock: Optional[Mock] = None
info: Info
def serialize(self, path: str) -> None:
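
With the `Mock` model gone, the schema reduces to the fields visible in this hunk. A consolidated sketch of the resulting models; the `Ground` and `Info` fields not shown here are inferred from the `data.json` files elsewhere in this diff, so treat them as an approximation.

```python
# Consolidated sketch of the schema after the Mock removal; Ground/Info fields
# not visible in this hunk are inferred from the data.json files and may differ.
from typing import List, Optional

from pydantic import BaseModel


class Info(BaseModel):
    difficulty: str
    description: str
    side_effects: List[str]  # assumed from the data.json examples


class Ground(BaseModel):
    answer: str
    should_contain: Optional[List[str]] = None
    should_not_contain: Optional[List[str]] = None
    files: List[str]
    type: str


class ChallengeData(BaseModel):
    name: str
    category: List[str]
    task: str
    dependencies: List[str]
    ground: Ground
    info: Info
```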

View File

@@ -1,17 +1,14 @@
{
"name": "basic_read_file",
"category": ["basic"],
"name": "ReadFile",
"category": ["interface"],
"task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
"dependencies": ["basic_write_file"],
"dependencies": ["TestWriteFile"],
"ground": {
"answer": "random string Hello World!",
"should_contain": ["random string", "Hello World!"],
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_read_file_mock"
},
"info": {
"description": "This reads the file quickly",
"difficulty": "basic",

View File

@@ -0,0 +1,12 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestReadFile(Challenge):
"""Testing if LLM can read a file"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,6 +1,6 @@
{
"name": "basic_write_file",
"category": ["basic"],
"name": "WriteFile",
"category": ["interface"],
"task": "Print the the capital of America to a .txt file",
"dependencies": [],
"ground": {
@@ -10,10 +10,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_write_file_mock",
"mock_task": "What is the capital of America?"
},
"info": {
"difficulty": "basic",
"description": "Tests the writing to file",

View File

@@ -0,0 +1,13 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestWriteFile(Challenge):
"""Testing if LLM can write to a file"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +1,7 @@
{
"name": "basic_memory",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": [],
"dependencies": ["TestReadFile", "TestWriteFile"],
"ground": {
"answer": "2314",
"should_contain": ["2314"],

View File

@@ -1,25 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.memory.memory import MemoryChallenge
from agbenchmark.challenge import Challenge
class TestBasicMemory(MemoryChallenge):
class TestBasicMemory(Challenge):
"""The first memory challenge"""
@pytest.mark.depends(name="test_basic_memory")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +1,7 @@
{
"name": "remember_multiple_ids",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": [],
"dependencies": ["TestBasicMemory"],
"ground": {
"answer": "3145\n3791\n9317\n9471",
"should_contain": ["3145", "3791", "9317", "9471"],
@@ -10,10 +9,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "remember_multiple_ids_mock",
"mock_task": "Follow the instructions in the instructions_1.txt file"
},
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.",

View File

@@ -1,27 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.memory.memory import MemoryChallenge
from agbenchmark.challenge import Challenge
class TestRememberMultipleIds(MemoryChallenge):
class TestRememberMultipleIds(Challenge):
"""The first memory challenge"""
@pytest.mark.depends(
name="test_remember_multiple_ids", depends=["test_basic_memory"]
)
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +1,7 @@
{
"name": "remember_multiple_ids_with_noise_mock",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": [],
"dependencies": ["TestRememberMultipleIds"],
"ground": {
"answer": "3145\n3791\n9317\n9471",
"should_contain": ["3145", "3791", "9317", "9471"],
@@ -10,10 +9,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "remember_multiple_ids_mock",
"mock_task": "Follow the instructions in the instructions_1.txt file"
},
"info": {
"difficulty": "medium",
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",

View File

@@ -1,28 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.memory.memory import MemoryChallenge
from agbenchmark.challenge import Challenge
class TestRememberMultipleIdsWithNoise(MemoryChallenge):
class TestRememberMultipleIdsWithNoise(Challenge):
"""The first memory challenge"""
@pytest.mark.depends(
name="test_remember_multiple_ids_with_noise",
depends=["test_remember_multiple_ids"],
)
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +1,7 @@
{
"name": "remember_multiple_phrases_with_noise_mock",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": [],
"dependencies": ["TestRememberMultipleIdsWithNoise"],
"ground": {
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
"should_contain": [
@@ -15,10 +14,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "remember_multiple_phrases_with_noise_mock",
"mock_task": "Follow the instructions in the instructions_1.txt file"
},
"info": {
"difficulty": "medium",
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",

View File

@@ -1,28 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.memory.memory import MemoryChallenge
from agbenchmark.challenge import Challenge
class TestRememberMultiplePhrasesWithNoise(MemoryChallenge):
class TestRememberMultiplePhrasesWithNoise(Challenge):
"""The first memory challenge"""
@pytest.mark.depends(
name="test_remember_multiple_phrases_with_noise",
depends=["test_remember_multiple_ids_with_noise"],
)
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +0,0 @@
import pytest
from agbenchmark.challenge import Challenge
@pytest.mark.memory
class MemoryChallenge(Challenge):
"""Challenge for memory"""

View File

@@ -1,8 +1,7 @@
{
"name": "basic_information_retrieval",
"category": ["retrieval"],
"task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.",
"dependencies": [],
"dependencies": ["TestWriteFile"],
"ground": {
"answer": "£25.89",
"should_contain": ["25.89"],
@@ -10,10 +9,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_retrieval_mock",
"mock_task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file."
},
"info": {
"difficulty": "basic",
"description": "Tests ability to retrieve information from a website.",

View File

@@ -1,25 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge
from agbenchmark.challenge import Challenge
class TestRetrieval(RetrievalChallenge):
class TestRetrieval(Challenge):
"""The first information-retrieval challenge"""
@pytest.mark.depends(name="test_retrieval")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +1,7 @@
{
"name": "basic_information_retrieval",
"category": ["retrieval"],
"task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
"dependencies": [],
"dependencies": ["TestRetrieval"],
"ground": {
"answer": "81,462",
"should_contain": ["81,462"],
@@ -10,10 +9,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_retrieval_2_mock",
"mock_task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)."
},
"info": {
"difficulty": "basic",
"description": "Tests ability to retrieve information.",

View File

@@ -1,25 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge
from agbenchmark.challenge import Challenge
class TestRetrieval2(RetrievalChallenge):
class TestRetrieval2(Challenge):
"""The first information-retrieval challenge"""
@pytest.mark.depends(on=["test_retrieval"], name="test_retrieval_2")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,19 +1,30 @@
{
"name": "basic_information_retrieval",
"category": ["retrieval"],
"task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
"dependencies": [],
"dependencies": ["TestRetrieval2"],
"ground": {
"answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
"should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"],
"should_contain": [
"15",
"112",
"117",
"204",
"413",
"2,014",
"3,198",
"4,046",
"7,000",
"11,759",
"21,461",
"24,578",
"31,536",
"53,823",
"81,462"
],
"should_not_contain": [],
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_retrieval_3_mock",
"mock_task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)."
},
"info": {
"difficulty": "basic",
"description": "Tests ability to retrieve information.",

View File

@@ -1,25 +1,14 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge
from agbenchmark.challenge import Challenge
class TestRetrieval3(RetrievalChallenge):
class TestRetrieval3(Challenge):
"""The first information-retrieval challenge"""
@pytest.mark.depends(on=["test_retrieval_2"], name="test_retrieval_3")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +0,0 @@
import pytest
from agbenchmark.challenge import Challenge
@pytest.mark.retrieval
class RetrievalChallenge(Challenge):
"""Challenge for information-retrieval"""

View File

@@ -2,12 +2,16 @@ import json
import os
import shutil
from pathlib import Path # noqa
from typing import Any, Dict, Generator, List
from typing import Any, Dict, Generator
import pytest
from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH
from agbenchmark.tests.regression.RegressionManager import RegressionManager
from agbenchmark.RegressionManager import RegressionManager
from agbenchmark.start_benchmark import (
CONFIG_PATH,
REGRESSION_TESTS_PATH,
get_regression_data,
)
def resolve_workspace(config: Dict[str, Any]) -> str:
@@ -37,7 +41,7 @@ def config(request: Any) -> None:
config = json.load(f)
if request.config.getoption("--mock"):
config["workspace"] = "agbenchmark/mocks/workspace"
config["workspace"] = "agbenchmark/workspace"
elif isinstance(config["workspace"], str):
config["workspace"] = resolve_workspace(config)
else:  # it's an input/output dict
@@ -77,9 +81,22 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
def pytest_addoption(parser: Any) -> None:
parser.addoption("--mock", action="store_true", default=False)
parser.addoption("--improve", action="store_true", default=False)
parser.addoption("--maintain", action="store_true", default=False)
regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
@pytest.fixture(autouse=True)
def check_regression(request: Any) -> None:
test_name = request.node.parent.name
data = get_regression_data()
# Check if the test name exists in the regression tests
if request.config.getoption("--improve") and data.get(test_name, None):
pytest.skip("Skipping test because it's a regression test and --improve is set")
elif request.config.getoption("--maintain") and not data.get(test_name, None):
pytest.skip(
"Skipping test because it's not a regression test and --maintain is set"
)
# this is to get the challenge_data from every test
@@ -88,6 +105,9 @@ def challenge_data(request: Any) -> None:
return request.param
regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
def pytest_runtest_makereport(item: Any, call: Any) -> None:
if call.when == "call":
challenge_data = item.funcargs.get("challenge_data", None)
@@ -109,16 +129,6 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
regression_manager.remove_test(item.nodeid.split("::")[1])
def pytest_collection_modifyitems(items: List[Any]) -> None:
"""Called once all test items are collected. Used
to add regression and depends markers to collected test items."""
for item in items:
# regression add
if item.nodeid.split("::")[1] in regression_manager.tests:
print(regression_manager.tests)
item.add_marker(pytest.mark.regression)
def pytest_sessionfinish() -> None:
"""Called at the end of the session to save regression tests"""
regression_manager.save()
@@ -135,3 +145,29 @@ def pytest_generate_tests(metafunc: Any) -> None:
# Add the parameters to the test function
metafunc.parametrize("challenge_data", [params], indirect=True)
# this is adding the dependency marker and category markers automatically from the json
def pytest_collection_modifyitems(items: Any, config: Any) -> None:
data = get_regression_data()
for item in items:
# Assuming item.cls is your test class
test_class_instance = item.cls()
# Then you can access your properties
name = item.parent.cls.__name__
dependencies = test_class_instance.data.dependencies
# Filter out dependencies that are already in the regression data if it's an improvement run
if config.getoption("--improve"):
dependencies = [dep for dep in dependencies if not data.get(dep, None)]
categories = test_class_instance.data.category
# Add depends marker dynamically
item.add_marker(pytest.mark.depends(on=dependencies, name=name))
# Add category marker dynamically
for category in categories:
item.add_marker(getattr(pytest.mark, category))
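
The new `pytest_collection_modifyitems` hook applies the dependency and category markers that each test previously declared by hand. For one collected test it is roughly equivalent to writing the following; class name and data values are illustrative.

```python
# Roughly what the dynamically added markers amount to for one collected test,
# if written by hand; class name and dependency list are illustrative.
import pytest


@pytest.mark.interface  # one marker per entry in data.category
@pytest.mark.depends(on=["TestWriteFile"], name="TestReadFile")  # data.dependencies
class TestReadFileByHand:
    def test_method(self) -> None:
        ...
```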

View File

@@ -1,28 +0,0 @@
from typing import Any, Dict, Optional
import agbenchmark.mocks.tests.basic_mocks as basic_mocks
class MockManager:
def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None:
self.task = task
self.workspace = config["workspace"]
self.modules = [basic_mocks]
def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None:
if hasattr(self, mock_function_name):
# Check if the mock function is an attribute of this class
getattr(self, mock_function_name)(*args, **kwargs)
elif mock_function_name in globals():
# Check if the function is imported in the file
func = globals()[mock_function_name]
func(self.task, self.workspace, *args, **kwargs)
elif len(self.modules) > 0:
# checks if function is in imported modules
for module in self.modules:
if hasattr(module, mock_function_name):
func = getattr(module, mock_function_name)
func(self.task, self.workspace, *args, **kwargs)
return
else:
raise ValueError(f"No such mock: {mock_function_name}")

View File

@@ -1,12 +0,0 @@
from agbenchmark.challenge import Challenge
def example_mock(task: str, workspace: str) -> None:
"""
This mock writes to a file (creates one if it doesn't exist)
"""
Challenge.write_to_file(
workspace,
"file_to_check.txt",
"This is an example showing how you can use mocks but here you can use artifacts_out folder instead of a mock.",
)

View File

@@ -2,11 +2,11 @@ import json
import os
import sys
from pathlib import Path
from typing import List
from typing import Any
import click
import pytest
from dotenv import load_dotenv, set_key
from dotenv import load_dotenv
load_dotenv()
@@ -26,10 +26,17 @@ def cli() -> None:
@cli.command()
@click.option("--category", default=None, help="Specific category to run")
@click.option("--maintain", is_flag=True, help="Runs only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option("--mock", is_flag=True, help="Run with mock")
def start(category: str, maintain: bool, mock: bool) -> int:
def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
# Check if configuration file exists and is not empty
if maintain and improve:
print(
"Error: You can't use both --maintain and --improve at the same time. Please choose one."
)
return 1
if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
config = {}
@@ -55,7 +62,7 @@ def start(category: str, maintain: bool, mock: bool) -> int:
with open(CONFIG_PATH, "r") as f:
config = json.load(f)
set_key(".env", "MOCK_TEST", "True" if mock else "False")
os.environ["MOCK_TEST"] = "True" if mock else "False"
if not os.path.exists(REGRESSION_TESTS_PATH):
with open(REGRESSION_TESTS_PATH, "a"):
@@ -65,42 +72,31 @@ def start(category: str, maintain: bool, mock: bool) -> int:
for key, value in config.items():
print(f"{key}: {value}")
print("Starting benchmark tests...", category)
tests_to_run = []
pytest_args = ["-vs"]
if category:
pytest_args.extend(["-m", category])
print("Starting benchmark tests ", category)
else:
if maintain:
print("Running all regression tests")
tests_to_run = get_regression_tests()
else:
print("Running all categories")
print("Running all categories")
if maintain:
print("Running only regression tests")
pytest_args.append("--maintain")
elif improve:
print("Running only non-regression tests")
pytest_args.append("--improve")
if mock:
pytest_args.append("--mock")
# Run pytest with the constructed arguments
if not tests_to_run:
tests_to_run = [str(CURRENT_DIRECTORY)]
pytest_args.extend(tests_to_run)
return sys.exit(pytest.main(pytest_args))
def get_regression_tests() -> List[str]:
if not Path(REGRESSION_TESTS_PATH).exists():
with open(REGRESSION_TESTS_PATH, "w") as file:
json.dump({}, file)
def get_regression_data() -> Any:
with open(REGRESSION_TESTS_PATH, "r") as file:
data = json.load(file)
regression_tests = [
str(CURRENT_DIRECTORY / ".." / value["test"]) for key, value in data.items()
]
return regression_tests
return data
if __name__ == "__main__":
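
Putting the CLI changes together, the `start` command now just assembles pytest arguments and delegates; a condensed sketch mirroring the flag handling above, with hard-coded values standing in for the click options.

```python
# Condensed sketch of the reworked start command; hard-coded values stand in
# for the click options, and "agbenchmark" stands in for str(CURRENT_DIRECTORY).
import os
import sys

import pytest

category, maintain, improve, mock = "interface", False, True, True

os.environ["MOCK_TEST"] = "True" if mock else "False"

pytest_args = ["-vs"]
if category:
    pytest_args.extend(["-m", category])
if maintain:
    pytest_args.append("--maintain")  # only regression tests
elif improve:
    pytest_args.append("--improve")   # only non-regression tests
if mock:
    pytest_args.append("--mock")

pytest_args.append("agbenchmark")
sys.exit(pytest.main(pytest_args))
```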

View File

@@ -1,8 +0,0 @@
import pytest
from agbenchmark.challenge import Challenge
@pytest.mark.basic
class BasicChallenge(Challenge):
pass

View File

@@ -1,24 +0,0 @@
from typing import Any, Dict
import pytest
from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge
class TestReadFile(BasicChallenge):
"""Testing if LLM can read a file"""
@pytest.mark.depends(on=["basic_write_file"], name="basic_read_file")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
assert 1 in scores

View File

@@ -1,25 +0,0 @@
from typing import Any, Dict
import pytest
from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge
class TestWriteFile(BasicChallenge):
"""Testing if LLM can write to a file"""
@pytest.mark.depends(name="basic_write_file")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
assert 1 in scores

View File

@@ -1,6 +1,6 @@
{
"workspace": "projects/my-new-project/workspace",
"entry_path": "agent/gpt-engineer/benchmarks.py",
"home_path": "agent/gpt-engineer",
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "benchmarks.py",
"home_path": "agent/mini-agi",
"cutoff": 60
}
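
The new `workspace` value is a `${...}` expression rather than a literal path; `resolve_workspace` in conftest.py expands it, but its body is not part of this diff. A hypothetical sketch of how such an expression could be expanded, for illustration only.

```python
# Hypothetical expansion of a "${...}" workspace value; the real
# resolve_workspace in conftest.py is not shown in this diff.
import os  # noqa: F401  (available to the evaluated expression)
import re
from pathlib import Path  # noqa: F401


def resolve_workspace_sketch(workspace: str) -> str:
    match = re.match(r"^\$\{(.+)\}$", workspace)
    if match:
        # e.g. "${os.path.join(Path.home(), 'miniagi')}" -> /home/<user>/miniagi
        return str(eval(match.group(1)))
    return workspace


print(resolve_workspace_sketch("${os.path.join(Path.home(), 'miniagi')}"))
```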

View File

@@ -37,7 +37,7 @@ testpaths = [
markers = [
"retrieval",
"regression",
"basic",
"interface",
"code",
"memory"
]
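
With `basic` renamed to `interface`, category selection picks up the relocated read/write-file tests; a minimal selection example, equivalent to what the `start` command builds for `--category interface` after this commit.

```python
# Select only the challenges tagged with the renamed "interface" marker.
import pytest

pytest.main(["-vs", "-m", "interface"])
```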

View File

@@ -1,14 +1,4 @@
{
"TestDebugSimpleTypoWithGuidance": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py"
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/code/d2/d2_test.py"
},
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [],
@@ -16,44 +6,54 @@
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [],
"dependencies": [
"TestBasicMemory"
],
"test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py"
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [],
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
},
"TestRetrieval": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval"
],
"test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
},
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
"test": "agbenchmark/challenges/interface/write_file/write_file_test.py"
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"basic_write_file"
"TestWriteFile"
],
"test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py"
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
"test": "agbenchmark/challenges/interface/read_file/read_file_test.py"
}
}
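
conftest.py relies only on a small `RegressionManager` surface (`tests`, `remove_test`, `save`) keyed by entries like the JSON above; the class itself is not in this diff, so the sketch below is an assumed minimal version, not the real implementation.

```python
# Assumed minimal RegressionManager matching the calls visible in conftest.py
# (tests, remove_test, save); the real class is not part of this diff.
import json
from typing import Any, Dict


class RegressionManagerSketch:
    def __init__(self, filename: str) -> None:
        self.filename = filename
        try:
            with open(filename) as f:
                self.tests: Dict[str, Any] = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            self.tests = {}

    def remove_test(self, test_name: str) -> None:
        self.tests.pop(test_name, None)

    def save(self) -> None:
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)
```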