diff --git a/.gitignore b/.gitignore index c41065ca..3581dc93 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +agbenchmark/mocks/workspace/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 993aa242..4d74aac7 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -22,7 +22,7 @@ def run_agent( print("No mock provided") elif MOCK_FLAG == "True": mock_manager = MockManager( - task + task, config ) # workspace doesn't need to be passed in, stays the same print("Server unavailable, using mock", mock_func) mock_manager.delegate(mock_func) diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index eaed73a2..865d6444 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -1,5 +1,7 @@ import glob +import inspect import os +import shutil from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional @@ -45,6 +47,8 @@ class Challenge(ABC): def setup_challenge(self, config: Dict[str, Any]) -> None: from agbenchmark.agent_interface import run_agent + self.copy_artifacts_into_workspace(config["workspace"]) + run_agent(self.task, self.mock, config) @property @@ -124,3 +128,19 @@ class Challenge(ABC): ) return 1.0 + + def copy_artifacts_into_workspace(self, workspace: str) -> None: + curr_frame = inspect.currentframe() + outer_frame = inspect.getouterframes(curr_frame)[2] + caller_file_path = outer_frame.filename + caller_dir_path = os.path.dirname(os.path.abspath(caller_file_path)) + source_dir = os.path.join(caller_dir_path, "artifacts") + + # Check if source_dir exists, if not then return immediately. + if not os.path.exists(source_dir): + return + + for file_name in os.listdir(source_dir): + full_file_name = os.path.join(source_dir, file_name) + if os.path.isfile(full_file_name): + shutil.copy(full_file_name, workspace) diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt new file mode 100644 index 00000000..da3babb1 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt @@ -0,0 +1,2 @@ +The id to remember is 2314 +Read the file instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt new file mode 100644 index 00000000..87c870b1 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt @@ -0,0 +1 @@ +Read the file instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt new file mode 100644 index 00000000..98708620 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt @@ -0,0 +1 @@ +Read the file instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt new file mode 100644 index 00000000..78da710b --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt @@ -0,0 +1 @@ +Read the file instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt new file mode 100644 index 00000000..8a2d3550 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt @@ -0,0 +1 @@ +Write the id previously mentioned to a .txt file. diff --git a/agbenchmark/challenges/memory/m1/m1_data.json b/agbenchmark/challenges/memory/m1/m1_data.json new file mode 100644 index 00000000..7023f85f --- /dev/null +++ b/agbenchmark/challenges/memory/m1/m1_data.json @@ -0,0 +1,21 @@ +{ + "name": "basic_memory", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "2314", + "should_contain": ["2314"], + "should_not_contain": [], + "files": ["file_to_check.txt"] + }, + "mock": { + "mock_func": "basic_memory_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py new file mode 100644 index 00000000..28e600cc --- /dev/null +++ b/agbenchmark/challenges/memory/m1/m1_test.py @@ -0,0 +1,27 @@ +import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.memory.memory import MemoryChallenge + + +class TestBasicMemory(MemoryChallenge): + """The first memory challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "m1_data.json") + + @pytest.mark.depends(name="test_basic_memory") + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + files_contents = self.open_files(config["workspace"], self.data.ground.files) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/challenges/memory/m1_test.py b/agbenchmark/challenges/memory/m1_test.py deleted file mode 100644 index e69de29b..00000000 diff --git a/agbenchmark/challenges/memory/memory.py b/agbenchmark/challenges/memory/memory.py new file mode 100644 index 00000000..429bef23 --- /dev/null +++ b/agbenchmark/challenges/memory/memory.py @@ -0,0 +1,8 @@ +import pytest + +from agbenchmark.challenge import Challenge + + +@pytest.mark.memory +class MemoryChallenge(Challenge): + """Challenge for memory""" diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 76777534..d107d964 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import Any, Dict import pytest @@ -13,12 +12,11 @@ class TestRetrieval(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") - @pytest.mark.depends(on=["basic_write_file"], name="test_retrieval") + @pytest.mark.depends(name="test_retrieval") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config["workspace"] - files_contents = self.open_files(workspace, self.data.ground.files) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index 7664ca36..a60296ec 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import Any, Dict import pytest @@ -17,8 +16,7 @@ class TestRetrieval2(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config["workspace"] - files_contents = self.open_files(workspace, self.data.ground.files) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py index c13de2c8..bcd48d33 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import Any, Dict import pytest @@ -17,8 +16,7 @@ class TestRetrieval3(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config["workspace"] - files_contents = self.open_files(workspace, self.data.ground.files) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 66ede2c0..7203ee6b 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -31,14 +31,13 @@ def config(request: Any) -> None: with open(CONFIG_PATH, "r") as f: config = json.load(f) - if request.config.getoption("--mock"): - config["workspace"] = "agbenchmark/mocks/workspace" - elif config.get("workspace", "").startswith("${") and config.get( + if config.get("workspace", "").startswith("${") and config.get( "workspace", "" ).endswith("}"): path = get_dynamic_workspace(config) config["workspace"] = path - + else: + config["workspace"] = Path(os.getcwd()) / config["workspace"] return config diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py index 59fa8dbf..5b84965c 100644 --- a/agbenchmark/mocks/mock_manager.py +++ b/agbenchmark/mocks/mock_manager.py @@ -1,13 +1,13 @@ -from typing import Any +from typing import Any, Dict import agbenchmark.mocks.tests.basic_mocks as basic_mocks import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks class MockManager: - def __init__(self, task: str): + def __init__(self, task: str, config: Dict[str, Any]) -> None: self.task = task - self.workspace = "agbenchmark/mocks/workspace" + self.workspace = config["workspace"] self.modules = [basic_mocks, retrieval_mocks] def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None: diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 882e3c82..3b9170f4 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -55,3 +55,14 @@ def basic_retrieval_3_mock(task: str, workspace: str) -> None: "file_to_check.txt", "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", ) + + +def basic_memory_mock(task: str, workspace: str) -> None: + """ + This mock writes to a file (creates one if it doesn't exist) + """ + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "2314", + ) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 8ef01d3c..959dee36 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -56,8 +56,6 @@ def start(category: str, reg: bool, mock: bool) -> int: config = json.load(f) set_key(".env", "MOCK_TEST", "True" if mock else "False") - if mock: - config["workspace"] = "agbenchmark/mocks/workspace" # create workspace directory if it doesn't exist workspace_path = os.path.abspath(config["workspace"]) diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt new file mode 100644 index 00000000..980a0d5f --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt @@ -0,0 +1 @@ +Hello World! diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index b21e2724..a74b875a 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -4,8 +4,8 @@ "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["basic_write_file"], "ground": { - "answer": "random string: this is how we're doing", - "should_contain": ["random string: this is how we're doing"], + "answer": "random string: Hello World!", + "should_contain": ["random string: Hello World!"], "files": ["file_to_check.txt"] }, "mock": { diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index c5f886d5..e7f2af9e 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -3,19 +3,12 @@ from typing import Any, Dict import pytest -from agbenchmark.challenge import Challenge from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" - @pytest.fixture(scope="module", autouse=True) - def setup_module(self, workspace: str) -> None: - Challenge.write_to_file( - workspace, self.data.ground.files[0], "this is how we're doing" - ) - def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r_file_data.json") diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 966df7f2..81f72cc9 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import Any, Dict import pytest @@ -17,8 +16,7 @@ class TestWriteFile(BasicChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config["workspace"] - files_contents = self.open_files(workspace, self.data.ground.files) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/regression_tests.json b/regression_tests.json index d0a8ed19..cfa4bda3 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,4 +1,9 @@ { + "TestBasicMemory": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m1/m1_test.py" + }, "TestRetrieval": { "difficulty": "basic", "dependencies": [], @@ -9,6 +14,11 @@ "dependencies": [], "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py" }, + "TestRetrieval2": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" + }, "TestReadFile": { "difficulty": "basic", "dependencies": [ @@ -16,14 +26,9 @@ ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" }, - "TestRetrieval2": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" - }, "TestRetrieval3": { "difficulty": "basic", "dependencies": [], "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" } -} +} \ No newline at end of file