mirror of https://github.com/aljazceru/Auto-GPT.git (synced 2025-12-25 01:44:28 +01:00)
Add basic memory challenge (#57)
.gitignore (vendored, 2 lines changed)

@@ -1,3 +1,5 @@
+agbenchmark/mocks/workspace/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
agbenchmark/agent_interface.py

@@ -22,7 +22,7 @@ def run_agent(
         print("No mock provided")
     elif MOCK_FLAG == "True":
         mock_manager = MockManager(
-            task
+            task, config
         )  # workspace doesn't need to be passed in, stays the same
         print("Server unavailable, using mock", mock_func)
         mock_manager.delegate(mock_func)
agbenchmark/challenge.py

@@ -1,5 +1,7 @@
 import glob
+import inspect
 import os
+import shutil
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List, Optional

@@ -45,6 +47,8 @@ class Challenge(ABC):
     def setup_challenge(self, config: Dict[str, Any]) -> None:
         from agbenchmark.agent_interface import run_agent

+        self.copy_artifacts_into_workspace(config["workspace"])
+
         run_agent(self.task, self.mock, config)

     @property

@@ -124,3 +128,19 @@ class Challenge(ABC):
         )

         return 1.0
+
+    def copy_artifacts_into_workspace(self, workspace: str) -> None:
+        curr_frame = inspect.currentframe()
+        outer_frame = inspect.getouterframes(curr_frame)[2]
+        caller_file_path = outer_frame.filename
+        caller_dir_path = os.path.dirname(os.path.abspath(caller_file_path))
+        source_dir = os.path.join(caller_dir_path, "artifacts")
+
+        # Check if source_dir exists, if not then return immediately.
+        if not os.path.exists(source_dir):
+            return
+
+        for file_name in os.listdir(source_dir):
+            full_file_name = os.path.join(source_dir, file_name)
+            if os.path.isfile(full_file_name):
+                shutil.copy(full_file_name, workspace)
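For orientation: the new copy_artifacts_into_workspace helper walks the call stack to find the test file that triggered setup_challenge, then copies any files sitting next to it in an artifacts/ folder into the agent's workspace. The [2] index means "two frames above this one": frame 0 is the helper, frame 1 is setup_challenge, frame 2 is the test method that lives beside the artifacts. A minimal, self-contained sketch of that frame-walking behaviour (hypothetical function names, not part of the commit):

import inspect


def helper() -> str:
    # getouterframes(...)[0] is helper itself, [1] is its caller,
    # [2] is the caller's caller -- the frame the benchmark cares about.
    frames = inspect.getouterframes(inspect.currentframe())
    return frames[2].function


def middle() -> str:
    return helper()


def outer() -> str:
    return middle()


print(outer())  # prints "outer"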
agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt (new file)

@@ -0,0 +1,2 @@
+The id to remember is 2314
+Read the file instructions_2.txt

agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt (new file)

@@ -0,0 +1 @@
+Read the file instructions_3.txt

agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt (new file)

@@ -0,0 +1 @@
+Read the file instructions_4.txt

agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt (new file)

@@ -0,0 +1 @@
+Read the file instructions_5.txt

agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt (new file)

@@ -0,0 +1 @@
+Write the id previously mentioned to a .txt file.
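The five instruction files form a pointer chain: the id appears only in the first file, and each subsequent read forces another agent action before the final write, so the id has to survive four intervening steps. A throwaway generator for such a chain (hypothetical helper, not part of the commit) could look like:

import os


def write_instruction_chain(directory: str, memory_id: str, length: int = 5) -> None:
    # Recreates an instructions_1.txt .. instructions_N.txt chain like the one above.
    os.makedirs(directory, exist_ok=True)
    for i in range(1, length + 1):
        if i == 1:
            body = f"The id to remember is {memory_id}\nRead the file instructions_2.txt"
        elif i < length:
            body = f"Read the file instructions_{i + 1}.txt"
        else:
            body = "Write the id previously mentioned to a .txt file."
        with open(os.path.join(directory, f"instructions_{i}.txt"), "w") as f:
            f.write(body)


write_instruction_chain("artifacts", "2314")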
agbenchmark/challenges/memory/m1/m1_data.json (new file, 21 lines)

@@ -0,0 +1,21 @@
+{
+  "name": "basic_memory",
+  "category": ["memory"],
+  "task": "Follow the instructions in the instructions_1.txt file",
+  "dependencies": [],
+  "ground": {
+    "answer": "2314",
+    "should_contain": ["2314"],
+    "should_not_contain": [],
+    "files": ["file_to_check.txt"]
+  },
+  "mock": {
+    "mock_func": "basic_memory_mock",
+    "mock_task": "Follow the instructions in the instructions_1.txt file"
+  },
+  "info": {
+    "difficulty": "basic",
+    "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
+}
agbenchmark/challenges/memory/m1/m1_test.py (new file, 27 lines)

@@ -0,0 +1,27 @@
+import os
+from typing import Any, Dict
+
+import pytest
+
+from agbenchmark.challenges.memory.memory import MemoryChallenge
+
+
+class TestBasicMemory(MemoryChallenge):
+    """The first memory challenge"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(os.path.dirname(__file__), "m1_data.json")
+
+    @pytest.mark.depends(name="test_basic_memory")
+    def test_method(self, config: Dict[str, Any]) -> None:
+        self.setup_challenge(config)
+
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)
+
+        scores = []
+        for file_content in files_contents:
+            score = self.scoring(file_content, self.data.ground)
+            print("Your score is:", score)
+            scores.append(score)
+
+        assert 1 in scores
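@pytest.mark.depends(name="test_basic_memory") registers the test under a stable alias with the pytest-depends plugin, so later challenges can gate on it via on=[...], as the retrieval test below previously did with on=["basic_write_file"]. A minimal sketch of the pattern (hypothetical test names, assuming pytest-depends is installed):

import pytest


@pytest.mark.depends(name="producer")
def test_producer() -> None:
    assert True


@pytest.mark.depends(on=["producer"])
def test_consumer() -> None:
    # pytest-depends skips this test automatically if test_producer failed.
    assert True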
agbenchmark/challenges/memory/memory.py (new file, 8 lines)

@@ -0,0 +1,8 @@
+import pytest
+
+from agbenchmark.challenge import Challenge
+
+
+@pytest.mark.memory
+class MemoryChallenge(Challenge):
+    """Challenge for memory"""
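The @pytest.mark.memory class marker lets the runner select the whole category with pytest -m memory. Custom markers should be registered to avoid PytestUnknownMarkWarning; a sketch of how that could be done in conftest.py (an assumption, the registration code is not part of this diff):

from typing import Any


def pytest_configure(config: Any) -> None:
    # Registers the custom marker so `pytest -m memory` runs warning-free.
    config.addinivalue_line("markers", "memory: challenges that test cross-action memory")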
agbenchmark/challenges/retrieval/r1/r1_test.py

@@ -1,5 +1,4 @@
 import os
-from pathlib import Path
 from typing import Any, Dict

 import pytest

@@ -13,12 +12,11 @@ class TestRetrieval(RetrievalChallenge):
     def get_file_path(self) -> str:  # all tests must implement this method
         return os.path.join(os.path.dirname(__file__), "r1_data.json")

-    @pytest.mark.depends(on=["basic_write_file"], name="test_retrieval")
+    @pytest.mark.depends(name="test_retrieval")
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)

-        workspace = Path(os.getcwd()) / config["workspace"]
-        files_contents = self.open_files(workspace, self.data.ground.files)
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)

         scores = []
         for file_content in files_contents:
agbenchmark/challenges/retrieval/r2/r2_test.py

@@ -1,5 +1,4 @@
 import os
-from pathlib import Path
 from typing import Any, Dict

 import pytest

@@ -17,8 +16,7 @@ class TestRetrieval2(RetrievalChallenge):
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)

-        workspace = Path(os.getcwd()) / config["workspace"]
-        files_contents = self.open_files(workspace, self.data.ground.files)
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)

         scores = []
         for file_content in files_contents:
agbenchmark/challenges/retrieval/r3/r3_test.py

@@ -1,5 +1,4 @@
 import os
-from pathlib import Path
 from typing import Any, Dict

 import pytest

@@ -17,8 +16,7 @@ class TestRetrieval3(RetrievalChallenge):
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)

-        workspace = Path(os.getcwd()) / config["workspace"]
-        files_contents = self.open_files(workspace, self.data.ground.files)
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)

         scores = []
         for file_content in files_contents:
agbenchmark/conftest.py

@@ -31,14 +31,13 @@ def config(request: Any) -> None:
     with open(CONFIG_PATH, "r") as f:
         config = json.load(f)

-    if request.config.getoption("--mock"):
-        config["workspace"] = "agbenchmark/mocks/workspace"
-    elif config.get("workspace", "").startswith("${") and config.get(
+    if config.get("workspace", "").startswith("${") and config.get(
         "workspace", ""
     ).endswith("}"):
         path = get_dynamic_workspace(config)
         config["workspace"] = path
-
     else:
         config["workspace"] = Path(os.getcwd()) / config["workspace"]
     return config
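get_dynamic_workspace itself is outside this hunk; judging from the guard around it, it expands a "${ENV_VAR}"-style workspace value. A hedged re-implementation of the whole resolution step (sketch only, names assumed from the hunk):

import os
from pathlib import Path
from typing import Any, Dict


def get_dynamic_workspace(config: Dict[str, Any]) -> str:
    # Hypothetical: strip the "${" prefix and "}" suffix, then look the
    # variable up in the environment.
    var_name = config["workspace"][2:-1]
    value = os.environ.get(var_name)
    if value is None:
        raise ValueError(f"environment variable {var_name!r} is not set")
    return value


def resolve_workspace(config: Dict[str, Any]) -> Dict[str, Any]:
    ws = config.get("workspace", "")
    if ws.startswith("${") and ws.endswith("}"):
        config["workspace"] = get_dynamic_workspace(config)
    else:
        config["workspace"] = Path(os.getcwd()) / ws
    return config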
agbenchmark/mocks/mock_manager.py

@@ -1,13 +1,13 @@
-from typing import Any
+from typing import Any, Dict

 import agbenchmark.mocks.tests.basic_mocks as basic_mocks
 import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks


 class MockManager:
-    def __init__(self, task: str):
+    def __init__(self, task: str, config: Dict[str, Any]) -> None:
         self.task = task
-        self.workspace = "agbenchmark/mocks/workspace"
+        self.workspace = config["workspace"]
         self.modules = [basic_mocks, retrieval_mocks]

     def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None:
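delegate is unchanged by this commit, but it is the piece that makes the config change matter: it resolves a mock by name across self.modules and calls it with the task and the (now config-driven) workspace. A minimal sketch of that name-based dispatch (an assumption about the shape of the real method, not a verbatim copy):

from types import ModuleType
from typing import List


def delegate(
    task: str, workspace: str, modules: List[ModuleType], mock_function_name: str
) -> None:
    # Search each registered mock module for a function with the given name.
    for module in modules:
        func = getattr(module, mock_function_name, None)
        if callable(func):
            func(task, workspace)
            return
    raise ValueError(f"no mock named {mock_function_name!r} found")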
agbenchmark/mocks/tests/basic_mocks.py

@@ -55,3 +55,14 @@ def basic_retrieval_3_mock(task: str, workspace: str) -> None:
         "file_to_check.txt",
         "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
     )
+
+
+def basic_memory_mock(task: str, workspace: str) -> None:
+    """
+    This mock writes to a file (creates one if it doesn't exist)
+    """
+    Challenge.write_to_file(
+        workspace,
+        "file_to_check.txt",
+        "2314",
+    )
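The mocks never call an agent; they simply write the expected answer so the scoring pipeline can be exercised offline. Challenge.write_to_file is defined elsewhere in the codebase; a hypothetical stand-in that matches how the mocks use it:

import os


def write_to_file(workspace: str, filename: str, content: str) -> None:
    # Creates the file if it does not exist and overwrites its contents.
    os.makedirs(workspace, exist_ok=True)
    with open(os.path.join(workspace, filename), "w") as f:
        f.write(content)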
agbenchmark/start_benchmark.py

@@ -56,8 +56,6 @@ def start(category: str, reg: bool, mock: bool) -> int:
         config = json.load(f)

     set_key(".env", "MOCK_TEST", "True" if mock else "False")
-    if mock:
-        config["workspace"] = "agbenchmark/mocks/workspace"

     # create workspace directory if it doesn't exist
     workspace_path = os.path.abspath(config["workspace"])
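set_key comes from python-dotenv: the CLI process persists the mock flag to .env so the separately spawned pytest run can pick it up (agent_interface reads it back as MOCK_FLAG). A minimal sketch of the handoff, assuming python-dotenv and that MOCK_FLAG is loaded from the MOCK_TEST key:

import os

from dotenv import load_dotenv, set_key

set_key(".env", "MOCK_TEST", "True")  # written by the CLI process

load_dotenv()  # read later in the pytest process
MOCK_FLAG = os.getenv("MOCK_TEST")
print(MOCK_FLAG == "True")  # -> True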
file_to_check.txt (new artifact file, seeded for the read-file challenge)

@@ -0,0 +1 @@
+Hello World!
agbenchmark/tests/basic_abilities/read_file/r_file_data.json

@@ -4,8 +4,8 @@
     "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
     "dependencies": ["basic_write_file"],
     "ground": {
-        "answer": "random string: this is how we're doing",
-        "should_contain": ["random string: this is how we're doing"],
+        "answer": "random string: Hello World!",
+        "should_contain": ["random string: Hello World!"],
         "files": ["file_to_check.txt"]
     },
     "mock": {
agbenchmark/tests/basic_abilities/read_file/read_file_test.py

@@ -3,19 +3,12 @@ from typing import Any, Dict

 import pytest

-from agbenchmark.challenge import Challenge
 from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge


 class TestReadFile(BasicChallenge):
     """Testing if LLM can read a file"""

-    @pytest.fixture(scope="module", autouse=True)
-    def setup_module(self, workspace: str) -> None:
-        Challenge.write_to_file(
-            workspace, self.data.ground.files[0], "this is how we're doing"
-        )
-
     def get_file_path(self) -> str:  # all tests must implement this method
         return os.path.join(os.path.dirname(__file__), "r_file_data.json")
agbenchmark/tests/basic_abilities/write_file/write_file_test.py

@@ -1,5 +1,4 @@
 import os
-from pathlib import Path
 from typing import Any, Dict

 import pytest

@@ -17,8 +16,7 @@ class TestWriteFile(BasicChallenge):
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)

-        workspace = Path(os.getcwd()) / config["workspace"]
-        files_contents = self.open_files(workspace, self.data.ground.files)
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)

         scores = []
         for file_content in files_contents:
(challenge registry JSON)

@@ -1,4 +1,9 @@
 {
+  "TestBasicMemory": {
+    "difficulty": "basic",
+    "dependencies": [],
+    "test": "agbenchmark/challenges/memory/m1/m1_test.py"
+  },
   "TestRetrieval": {
     "difficulty": "basic",
     "dependencies": [],
@@ -9,6 +14,11 @@
     "dependencies": [],
     "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
   },
+  "TestRetrieval2": {
+    "difficulty": "basic",
+    "dependencies": [],
+    "test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
+  },
   "TestReadFile": {
     "difficulty": "basic",
     "dependencies": [
@@ -16,14 +26,9 @@
     ],
     "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py"
   },
-  "TestRetrieval2": {
-    "difficulty": "basic",
-    "dependencies": [],
-    "test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
-  },
   "TestRetrieval3": {
     "difficulty": "basic",
     "dependencies": [],
     "test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
   }
 }