Mirror of https://github.com/aljazceru/Auto-GPT.git (synced 2025-12-19 15:04:26 +01:00)

Commit: moving run agent to tests & agnostic run working
@@ -1,3 +1,3 @@
 AGENT_NAME=mini-agi
-AGENT_TIMEOUT=60
+ENVIRONMENT=local
 MOCK_TEST=False
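Note: these variables are consumed with python-dotenv elsewhere in this commit. A minimal sketch of how they are read (the defaulting mirrors agent_interface.py below; the inline comments are illustrative):

import os

from dotenv import load_dotenv

load_dotenv()  # pulls AGENT_NAME, ENVIRONMENT and MOCK_TEST out of the .env file

AGENT_NAME = os.getenv("AGENT_NAME")                    # e.g. "mini-agi"
ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"  # "local" switches to the local agent path
MOCK_FLAG = os.getenv("MOCK_TEST")                      # compared against the string "True", not a bool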
@@ -4,7 +4,7 @@ import pytest
 from abc import ABC, abstractmethod
 from agbenchmark.challenges.define_task_types import Ground
 from agbenchmark.challenges.define_task_types import ChallengeData
-from dotenv import load_dotenv, set_key
+from dotenv import load_dotenv
 
 load_dotenv()
 
@@ -40,22 +40,24 @@ class Challenge(ABC):
         print("self.data.dependencies", self.data.dependencies)
         return self.data.dependencies
 
+    def setup_challenge(self, config):
+        from agbenchmark.agent_interface import run_agent
+
+        print("SETTING UP CHALLENGE...")
+
+        run_agent(self.task, self.mock, config)
+
     @property
     def name(self) -> str:
         print("self.data.name", self.data.name)
         return self.data.name
 
-    @pytest.mark.parametrize(
-        "run_agent",
-        [(task, mock)],
-        indirect=True,
-    )
     @pytest.mark.parametrize(
         "challenge_data",
         [data],
         indirect=True,
     )
-    def test_method(self, workspace):
+    def test_method(self, config):
         raise NotImplementedError
 
     @staticmethod
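Note: with setup_challenge() on the base class, a concrete challenge now drives the agent itself instead of relying on the removed run_agent fixture. A sketch of the resulting test shape, mirroring the read/write-file tests further down (the class name, import path and data file here are illustrative, not taken from the repository):

import os

from agbenchmark.challenges.basic_abilities.BasicChallenge import BasicChallenge  # illustrative import path


class TestExample(BasicChallenge):
    def get_file_path(self) -> str:  # all tests must implement this method
        return os.path.join(os.path.dirname(__file__), "example_data.json")

    def test_method(self, config):
        # Runs the agent against self.task via run_agent(self.task, self.mock, config)
        self.setup_challenge(config)

        # The workspace now comes out of the config dict instead of a `workspace` fixture
        files_contents = self.open_files(config["workspace"], self.data.ground.files)

        scores = []
        for file_content in files_contents:
            # score each file against self.data.ground (scoring logic is unchanged by this commit)
            ...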
agbenchmark/agent_interface.py  (new file, 108 lines)
@@ -0,0 +1,108 @@
+import os
+import sys
+import subprocess
+import time
+from agbenchmark.mocks.MockManager import MockManager
+from multiprocessing import Process, Pipe
+
+from agent.hook import run_specific_agent
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+MOCK_FLAG = os.getenv("MOCK_TEST")
+
+
+def run_agent(task, mock_func, config):
+    """Calling to get a response"""
+
+    if mock_func == None and MOCK_FLAG == "True":
+        print("No mock provided")
+    elif MOCK_FLAG == "True":
+        mock_manager = MockManager(
+            task
+        )  # workspace doesn't need to be passed in, stays the same
+        print("Server unavailable, using mock", mock_func)
+        mock_manager.delegate(mock_func)
+    else:
+        if config["agent"]["type"] == "python":
+            run_agent_function(config, task)
+        elif config["agent"]["type"] == "script":
+            run_agent_command(config, task)
+
+
+ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
+
+
+def run_agent_command(config, task):
+    path = config["agent"]["path"]
+
+    if ENVIRONMENT == "local":
+        AGENT_NAME = os.getenv("AGENT_NAME")
+        path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")
+
+    timeout = config["agent"]["cutoff"] or sys.maxsize
+    print(f"Running {task} with timeout {timeout}")
+
+    command_from_config = config["agent"]["script"]
+    command_list = command_from_config.split()
+
+    # replace '{}' with the task
+    command_list = [cmd if cmd != "{}" else task for cmd in command_list]
+    print("path, command_list", path, command_list)
+    start_time = time.time()
+    proc = subprocess.Popen(
+        command_list,
+        cwd=path,
+        shell=True,
+    )
+
+    while True:
+        if time.time() - start_time > timeout:
+            print("The subprocess has exceeded the time limit and was terminated.")
+            proc.terminate()
+            break
+
+        if proc.poll() is not None:
+            print("The subprocess has finished running.")
+            break
+
+
+def run_agent_function(config, task):
+    timeout = (
+        config["cutoff"]["count"] if config["cutoff"]["type"] == "time" else sys.maxsize
+    )
+    print(
+        f"Running Python function '{config['agent']['function']}' with timeout {timeout}"
+    )
+
+    parent_conn, child_conn = Pipe()
+    process = Process(target=run_specific_agent, args=(task, child_conn))
+    process.start()
+    start_time = time.time()
+
+    while True:
+        if parent_conn.poll():  # Check if there's a new message from the child process
+            response, cycle_count = parent_conn.recv()
+            print(f"Cycle {cycle_count}: {response}")
+
+            if cycle_count >= config["cutoff"]["count"]:
+                print(
+                    f"Cycle count has reached the limit of {config['cutoff']['count']}. Terminating."
+                )
+                child_conn.send("terminate")
+                break
+
+        if time.time() - start_time > timeout:
+            print("The Python function has exceeded the time limit and was terminated.")
+            child_conn.send(
+                "terminate"
+            )  # Send a termination signal to the child process
+            break
+
+        if not process.is_alive():
+            print("The Python function has finished running.")
+            break
+
+    process.join()
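Note: a minimal calling sketch for the new module, assuming MOCK_TEST is unset or "False". The config literal mirrors the config.json change below; the task string is illustrative:

from agbenchmark.agent_interface import run_agent

config = {
    "workspace": "C:\\Users\\silen\\miniagi",
    "agent": {"type": "script", "path": "", "script": "python miniagi.py {}", "cutoff": 60},
}

# With mock_func=None and MOCK_TEST != "True", run_agent falls through to the
# "script" branch: run_agent_command splits the script template, substitutes the
# task for "{}", launches it with subprocess.Popen and polls it until it either
# exits or exceeds the 60-second cutoff.
run_agent("example task", None, config)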
(deleted file, 65 lines)
@@ -1,65 +0,0 @@
-import os
-import sys
-import pexpect as expect
-from dotenv import load_dotenv
-
-load_dotenv()
-
-
-def check_cycle_count(cycle_count: int, cutoff: int, proc):
-    """Increment, print, and check cycle count."""
-    cycle_count += 1
-    print(f"Cycle count: {cycle_count}")
-    if cycle_count >= cutoff:
-        proc.terminate(force=True)
-    return cycle_count
-
-
-AGENT_NAME = os.getenv("AGENT_NAME")
-
-
-def run_agnostic(config, task):
-    path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")
-
-    timeout = sys.maxsize
-
-    if config["cutoff"]["type"] == "time":
-        timeout = config["cutoff"]["count"] or 60
-
-    # from pexpect.popen_spawn import PopenSpawn
-
-    print(f"Running {task} with timeout {timeout}")
-
-    # Starting the subprocess using pexpect
-    proc = expect.spawn("python", ["miniagi.py", task], timeout=timeout, cwd=path)
-
-    print("proc", proc)
-
-    cycle_count = 0
-
-    while True:
-        try:
-            # If we get the prompt for user input, we send "\n"
-            if config["cutoff"]["type"] == "user_input":
-                proc.expect([config["cutoff"]["user_prompt"]])
-                proc.sendline(config["cutoff"]["user_input"])
-                cycle_count = check_cycle_count(
-                    cycle_count, config["cutoff"]["count"], proc
-                )
-            elif config["cutoff"]["type"] == "cycle_count":
-                match = proc.expect([r"Cycle count: (\d+)"])
-                if match is not None:
-                    cycle_count = int(match.group(1))  # type: ignore
-                    cycle_count = check_cycle_count(
-                        cycle_count, config["cutoff"]["count"], proc
-                    )
-
-        # for cutoff type "time", just let it run until timeout
-        except expect.TIMEOUT:
-            print("The subprocess has exceeded the time limit and was terminated.")
-            break
-        except expect.EOF:
-            print("The subprocess has finished running.")
-            break
-
-    proc.close()
@@ -1,6 +1,4 @@
-import pytest
 from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge
-from agbenchmark.challenges.define_task_types import ChallengeData, Ground
 import os
 
 
@@ -10,8 +8,9 @@ class TestRetrieval1(RetrievalChallenge):
     def get_file_path(self) -> str:  # all tests must implement this method
        return os.path.join(os.path.dirname(__file__), "r1_data.json")
 
-    def test_method(self, workspace):
-        files_contents = self.open_files(workspace, self.data.ground.files)
+    def test_method(self, config):
+        self.setup_challenge(config)
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)
 
         scores = []
         for file_content in files_contents:
@@ -1,9 +1,10 @@
 {
   "workspace": "C:\\Users\\silen\\miniagi",
-  "cutoff": {
-    "type": "time",
-    "user_prompt": "Press enter to continue or abort this action by typing feedback:",
+  "agent": {
+    "type": "script",
+    "path": "",
+    "script": "python miniagi.py {}",
     "user_input": "\n",
-    "count": 5
+    "cutoff": 60
   }
 }
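Note: for reference, this is roughly how run_agent_command (added above) consumes the new "agent" block; the task string is illustrative:

import sys

agent_cfg = {"type": "script", "path": "", "script": "python miniagi.py {}", "user_input": "\n", "cutoff": 60}
task = "example task"

timeout = agent_cfg["cutoff"] or sys.maxsize  # 60 seconds here
command_list = [cmd if cmd != "{}" else task for cmd in agent_cfg["script"].split()]
print(command_list)  # ['python', 'miniagi.py', 'example task']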
@@ -2,11 +2,7 @@ import json
 import os
 import pytest
 import shutil
-import subprocess
-import sys
 from agbenchmark.tests.regression.RegressionManager import RegressionManager
-from agbenchmark.mocks.MockManager import MockManager
-from agbenchmark.benchmark import run_agnostic
 
 
 @pytest.fixture(scope="module")
@@ -41,29 +37,6 @@ def pytest_addoption(parser):
     parser.addoption("--mock", action="store_true", default=False)
 
 
-@pytest.fixture(autouse=True)
-def run_agent(request, config):
-    """Calling to get a response"""
-    if isinstance(request.param, tuple):
-        task = request.param[0]  # The task is passed in indirectly
-        mock_function_name = request.param[1] or None
-    else:
-        task = request.param
-        mock_function_name = None
-
-    if mock_function_name != None and (request.config.getoption("--mock")):
-        if mock_function_name:
-            mock_manager = MockManager(
-                task
-            )  # workspace doesn't need to be passed in, stays the same
-            print("Server unavailable, using mock", mock_function_name)
-            mock_manager.delegate(mock_function_name)
-        else:
-            print("No mock provided")
-    else:
-        run_agnostic(config, task)
-
-
 regression_json = "agbenchmark/tests/regression/regression_tests.json"
 
 regression_manager = RegressionManager(regression_json)
@@ -120,13 +93,3 @@ def pytest_generate_tests(metafunc):
 
         # Add the parameters to the test function
         metafunc.parametrize("challenge_data", [params], indirect=True)
-
-    if "run_agent" in metafunc.fixturenames:
-        # Get the instance of the test class
-        test_class = metafunc.cls()
-
-        # Generate the parameters
-        params = [(test_class.task, test_class.mock)]
-
-        # Add the parameters to the test function
-        metafunc.parametrize("run_agent", params, indirect=True)
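Note: the surviving parametrization relies on pytest's indirect mode, where the parameter is routed through a fixture of the same name. A self-contained illustration (names here are generic, not taken from the benchmark):

import pytest


@pytest.fixture
def challenge_data(request):
    # With indirect=True the parametrized value arrives on request.param
    # instead of being passed straight into the test function.
    return request.param


@pytest.mark.parametrize("challenge_data", [{"name": "example"}], indirect=True)
def test_uses_fixture(challenge_data):
    assert challenge_data["name"] == "example"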
agbenchmark/mocks/workspace/file_to_check.txt  (new file, 1 line)
@@ -0,0 +1 @@
+Washington DC is the capital of the United States of America
@@ -17,10 +17,9 @@ class TestReadFile(BasicChallenge):
         return os.path.join(os.path.dirname(__file__), "r_file_data.json")
 
     @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file")
-    def test_method(
-        self, workspace
-    ):  # run_test is a common name that all tests must implement
-        files_contents = self.open_files(workspace, self.data.ground.files)
+    def test_method(self, config):
+        self.setup_challenge(config)
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)
 
         scores = []
         for file_content in files_contents:
@@ -10,9 +10,9 @@ class TestWriteFile(BasicChallenge):
         return os.path.join(os.path.dirname(__file__), "w_file_data.json")
 
     @pytest.mark.depends(on=[], name="basic_write_file")
-    def test_method(self, workspace):
-        print("my workspace is ", workspace)
-        files_contents = self.open_files(workspace, self.data.ground.files)
+    def test_method(self, config):
+        self.setup_challenge(config)
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)
 
         scores = []
         for file_content in files_contents:
@@ -2,13 +2,6 @@
   "TestWriteFile": {
     "difficulty": "basic",
     "dependencies": [],
-    "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]"
-  },
-  "TestReadFile": {
-    "difficulty": "basic",
-    "dependencies": [
-      "basic_write_file"
-    ],
-    "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]"
+    "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0]"
   }
 }
agent/hook.py  (new file, 10 lines)
@@ -0,0 +1,10 @@
+async def run_specific_agent(task, conn):
+    while (
+        not conn.poll()
+    ):  # Check if there's a termination signal from the main process
+        response, cycle_count = await run_agent(
+            task
+        )  # run the agent and get the response and cycle count
+
+        # Send response and cycle count back to the main process
+        conn.send((response, cycle_count))
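Note: a self-contained sketch of the Pipe/Process handshake that run_agent_function and run_specific_agent rely on. The child-side coroutine is a hypothetical stand-in for the agent's own run_agent, and unlike the code above the termination signal here is sent on the parent end of the pipe:

import asyncio
from multiprocessing import Pipe, Process


def child(task, conn):
    async def fake_run_agent(task):  # hypothetical stand-in for the agent's run_agent
        return f"echo: {task}", 1

    async def loop():
        while not conn.poll():  # stop once the parent sends "terminate"
            response, cycle_count = await fake_run_agent(task)
            conn.send((response, cycle_count))
            await asyncio.sleep(0.1)

    asyncio.run(loop())


if __name__ == "__main__":
    parent_conn, child_conn = Pipe()
    proc = Process(target=child, args=("example task", child_conn))
    proc.start()
    response, cycle_count = parent_conn.recv()  # mirrors the parent_conn.poll()/recv() loop above
    print(f"Cycle {cycle_count}: {response}")
    parent_conn.send("terminate")               # same signal the benchmark uses to stop the child
    proc.join()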
@@ -16,8 +16,6 @@ openai = "^0.27.8"
 pydantic = "^1.10.9"
 pytest-depends = "^1.0.1"
 python-dotenv = "^1.0.0"
-pexpect = "^4.8.0"
-wexpect = "^4.0.0"
 
 
 [build-system]