diff --git a/.env.example b/.env.example index 7782d048..e50ed58a 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,3 @@ AGENT_NAME=mini-agi -AGENT_TIMEOUT=60 +ENVIRONMENT=local MOCK_TEST=False \ No newline at end of file diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index f644abc4..7b1e4df0 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -4,7 +4,7 @@ import pytest from abc import ABC, abstractmethod from agbenchmark.challenges.define_task_types import Ground from agbenchmark.challenges.define_task_types import ChallengeData -from dotenv import load_dotenv, set_key +from dotenv import load_dotenv load_dotenv() @@ -40,22 +40,24 @@ class Challenge(ABC): print("self.data.dependencies", self.data.dependencies) return self.data.dependencies + def setup_challenge(self, config): + from agbenchmark.agent_interface import run_agent + + print("SETTING UP CHALLENGE...") + + run_agent(self.task, self.mock, config) + @property def name(self) -> str: print("self.data.name", self.data.name) return self.data.name - @pytest.mark.parametrize( - "run_agent", - [(task, mock)], - indirect=True, - ) @pytest.mark.parametrize( "challenge_data", [data], indirect=True, ) - def test_method(self, workspace): + def test_method(self, config): raise NotImplementedError @staticmethod diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py new file mode 100644 index 00000000..eba26fc1 --- /dev/null +++ b/agbenchmark/agent_interface.py @@ -0,0 +1,108 @@ +import os +import sys +import subprocess +import time +from agbenchmark.mocks.MockManager import MockManager +from multiprocessing import Process, Pipe + +from agent.hook import run_specific_agent + +from dotenv import load_dotenv + +load_dotenv() + +MOCK_FLAG = os.getenv("MOCK_TEST") + + +def run_agent(task, mock_func, config): + """Calling to get a response""" + + if mock_func == None and MOCK_FLAG == "True": + print("No mock provided") + elif MOCK_FLAG == "True": + mock_manager = MockManager( + task + ) # workspace doesn't need to be passed in, stays the same + print("Server unavailable, using mock", mock_func) + mock_manager.delegate(mock_func) + else: + if config["agent"]["type"] == "python": + run_agent_function(config, task) + elif config["agent"]["type"] == "script": + run_agent_command(config, task) + + +ENVIRONMENT = os.getenv("ENVIRONMENT") or "production" + + +def run_agent_command(config, task): + path = config["agent"]["path"] + + if ENVIRONMENT == "local": + AGENT_NAME = os.getenv("AGENT_NAME") + path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") + + timeout = config["agent"]["cutoff"] or sys.maxsize + print(f"Running {task} with timeout {timeout}") + + command_from_config = config["agent"]["script"] + command_list = command_from_config.split() + + # replace '{}' with the task + command_list = [cmd if cmd != "{}" else task for cmd in command_list] + print("path, command_list", path, command_list) + start_time = time.time() + proc = subprocess.Popen( + command_list, + cwd=path, + shell=True, + ) + + while True: + if time.time() - start_time > timeout: + print("The subprocess has exceeded the time limit and was terminated.") + proc.terminate() + break + + if proc.poll() is not None: + print("The subprocess has finished running.") + break + + +def run_agent_function(config, task): + timeout = ( + config["cutoff"]["count"] if config["cutoff"]["type"] == "time" else sys.maxsize + ) + print( + f"Running Python function '{config['agent']['function']}' with timeout {timeout}" + ) + + 
parent_conn, child_conn = Pipe() + process = Process(target=run_specific_agent, args=(task, child_conn)) + process.start() + start_time = time.time() + + while True: + if parent_conn.poll(): # Check if there's a new message from the child process + response, cycle_count = parent_conn.recv() + print(f"Cycle {cycle_count}: {response}") + + if cycle_count >= config["cutoff"]["count"]: + print( + f"Cycle count has reached the limit of {config['cutoff']['count']}. Terminating." + ) + child_conn.send("terminate") + break + + if time.time() - start_time > timeout: + print("The Python function has exceeded the time limit and was terminated.") + child_conn.send( + "terminate" + ) # Send a termination signal to the child process + break + + if not process.is_alive(): + print("The Python function has finished running.") + break + + process.join() diff --git a/agbenchmark/benchmark.py b/agbenchmark/benchmark.py deleted file mode 100644 index 6dc3b231..00000000 --- a/agbenchmark/benchmark.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import sys -import pexpect as expect -from dotenv import load_dotenv - -load_dotenv() - - -def check_cycle_count(cycle_count: int, cutoff: int, proc): - """Increment, print, and check cycle count.""" - cycle_count += 1 - print(f"Cycle count: {cycle_count}") - if cycle_count >= cutoff: - proc.terminate(force=True) - return cycle_count - - -AGENT_NAME = os.getenv("AGENT_NAME") - - -def run_agnostic(config, task): - path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - - timeout = sys.maxsize - - if config["cutoff"]["type"] == "time": - timeout = config["cutoff"]["count"] or 60 - - # from pexpect.popen_spawn import PopenSpawn - - print(f"Running {task} with timeout {timeout}") - - # Starting the subprocess using pexpect - proc = expect.spawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) - - print("proc", proc) - - cycle_count = 0 - - while True: - try: - # If we get the prompt for user input, we send "\n" - if config["cutoff"]["type"] == "user_input": - proc.expect([config["cutoff"]["user_prompt"]]) - proc.sendline(config["cutoff"]["user_input"]) - cycle_count = check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - elif config["cutoff"]["type"] == "cycle_count": - match = proc.expect([r"Cycle count: (\d+)"]) - if match is not None: - cycle_count = int(match.group(1)) # type: ignore - cycle_count = check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - - # for cutoff type "time", just let it run until timeout - except expect.TIMEOUT: - print("The subprocess has exceeded the time limit and was terminated.") - break - except expect.EOF: - print("The subprocess has finished running.") - break - - proc.close() diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 0bd907d8..b679a731 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,6 +1,4 @@ -import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge -from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os @@ -10,8 +8,9 @@ class TestRetrieval1(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") - def test_method(self, workspace): - files_contents = self.open_files(workspace, self.data.ground.files) + def test_method(self, config): + self.setup_challenge(config) + files_contents = 
self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/config.json b/agbenchmark/config.json index d95b8e44..7388085d 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,9 +1,10 @@ { "workspace": "C:\\Users\\silen\\miniagi", - "cutoff": { - "type": "time", - "user_prompt": "Press enter to continue or abort this action by typing feedback:", + "agent": { + "type": "script", + "path": "", + "script": "python miniagi.py {}", "user_input": "\n", - "count": 5 + "cutoff": 60 } } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 25510e42..0f1fc7bb 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -2,11 +2,7 @@ import json import os import pytest import shutil -import subprocess -import sys from agbenchmark.tests.regression.RegressionManager import RegressionManager -from agbenchmark.mocks.MockManager import MockManager -from agbenchmark.benchmark import run_agnostic @pytest.fixture(scope="module") @@ -41,29 +37,6 @@ def pytest_addoption(parser): parser.addoption("--mock", action="store_true", default=False) -@pytest.fixture(autouse=True) -def run_agent(request, config): - """Calling to get a response""" - if isinstance(request.param, tuple): - task = request.param[0] # The task is passed in indirectly - mock_function_name = request.param[1] or None - else: - task = request.param - mock_function_name = None - - if mock_function_name != None and (request.config.getoption("--mock")): - if mock_function_name: - mock_manager = MockManager( - task - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_function_name) - mock_manager.delegate(mock_function_name) - else: - print("No mock provided") - else: - run_agnostic(config, task) - - regression_json = "agbenchmark/tests/regression/regression_tests.json" regression_manager = RegressionManager(regression_json) @@ -120,13 +93,3 @@ def pytest_generate_tests(metafunc): # Add the parameters to the test function metafunc.parametrize("challenge_data", [params], indirect=True) - - if "run_agent" in metafunc.fixturenames: - # Get the instance of the test class - test_class = metafunc.cls() - - # Generate the parameters - params = [(test_class.task, test_class.mock)] - - # Add the parameters to the test function - metafunc.parametrize("run_agent", params, indirect=True) diff --git a/agbenchmark/mocks/workspace/file_to_check.txt b/agbenchmark/mocks/workspace/file_to_check.txt new file mode 100644 index 00000000..48dc8cff --- /dev/null +++ b/agbenchmark/mocks/workspace/file_to_check.txt @@ -0,0 +1 @@ +Washington DC is the capital of the United States of America \ No newline at end of file diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index f99ae608..c0aaa7f9 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -17,10 +17,9 @@ class TestReadFile(BasicChallenge): return os.path.join(os.path.dirname(__file__), "r_file_data.json") @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") - def test_method( - self, workspace - ): # run_test is a common name that all tests must implement - files_contents = self.open_files(workspace, self.data.ground.files) + def test_method(self, config): + self.setup_challenge(config) + files_contents = self.open_files(config["workspace"], 
self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 39c73b16..306375dd 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -10,9 +10,9 @@ class TestWriteFile(BasicChallenge): return os.path.join(os.path.dirname(__file__), "w_file_data.json") @pytest.mark.depends(on=[], name="basic_write_file") - def test_method(self, workspace): - print("my workspace is ", workspace) - files_contents = self.open_files(workspace, self.data.ground.files) + def test_method(self, config): + self.setup_challenge(config) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 8a6278fe..d13b763c 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -2,13 +2,6 @@ "TestWriteFile": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "basic_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0]" } } \ No newline at end of file diff --git a/agent/hook.py b/agent/hook.py new file mode 100644 index 00000000..6fa53418 --- /dev/null +++ b/agent/hook.py @@ -0,0 +1,10 @@ +async def run_specific_agent(task, conn): + while ( + not conn.poll() + ): # Check if there's a termination signal from the main process + response, cycle_count = await run_agent( + task + ) # run the agent and get the response and cycle count + + # Send response and cycle count back to the main process + conn.send((response, cycle_count)) diff --git a/pyproject.toml b/pyproject.toml index af9688d1..043fe68a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,6 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" python-dotenv = "^1.0.0" -pexpect = "^4.8.0" -wexpect = "^4.0.0" [build-system]
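
A note on the agent-side contract introduced above: `run_agent_function` in agbenchmark/agent_interface.py starts `run_specific_agent` from agent/hook.py as a `multiprocessing.Process` target and reads `(response, cycle_count)` tuples off the pipe until the configured cycle count (`config["cutoff"]["count"]`) or time limit is hit, at which point it sends "terminate". The committed hook is declared `async` and calls a `run_agent` that it does not import, so it appears intended to be filled in by the agent repository. The sketch below only illustrates that contract under those assumptions, not the shipped code; `my_agent_step` is a hypothetical stand-in for the agent's real entry point.

import asyncio
from multiprocessing.connection import Connection


async def my_agent_step(task: str, cycle_count: int) -> str:
    """Hypothetical placeholder for the agent's real per-cycle entry point."""
    await asyncio.sleep(0)  # the real agent would do its work here
    return f"cycle {cycle_count} for task: {task}"


def run_specific_agent(task: str, conn: Connection) -> None:
    """Synchronous wrapper so the hook can run directly as a Process target."""
    cycle_count = 0
    while not conn.poll():  # stop once the benchmark sends "terminate"
        cycle_count += 1
        response = asyncio.run(my_agent_step(task, cycle_count))
        conn.send((response, cycle_count))  # matches parent_conn.recv() in agent_interface.py

With a wrapper along these lines, both cutoffs in the parent loop work unchanged, since that loop only relies on receiving `(response, cycle_count)` tuples and on the child honoring the "terminate" message.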