diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py
index 7b1e4df0..d7a2bdc9 100644
--- a/agbenchmark/Challenge.py
+++ b/agbenchmark/Challenge.py
@@ -23,6 +23,7 @@ class Challenge(ABC):
 
     @property
     def data(self) -> ChallengeData:
+        # TODO: make it so that this is cached somewhere to just call self.deserialized_data
         return ChallengeData.deserialize(self.get_file_path())
 
     @property
@@ -37,19 +38,15 @@ class Challenge(ABC):
 
     @property
     def dependencies(self) -> list:
-        print("self.data.dependencies", self.data.dependencies)
         return self.data.dependencies
 
     def setup_challenge(self, config):
         from agbenchmark.agent_interface import run_agent
 
-        print("SETTING UP CHALLENGE...")
-
         run_agent(self.task, self.mock, config)
 
     @property
     def name(self) -> str:
-        print("self.data.name", self.data.name)
         return self.data.name
 
     @pytest.mark.parametrize(
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index eba26fc1..2ff2acf3 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -1,12 +1,9 @@
 import os
-import sys
-import subprocess
+import importlib
 import time
 from agbenchmark.mocks.MockManager import MockManager
 from multiprocessing import Process, Pipe
 
-from agent.hook import run_specific_agent
-
 from dotenv import load_dotenv
 
 load_dotenv()
@@ -26,83 +23,48 @@ def run_agent(task, mock_func, config):
         print("Server unavailable, using mock", mock_func)
         mock_manager.delegate(mock_func)
     else:
-        if config["agent"]["type"] == "python":
-            run_agent_function(config, task)
-        elif config["agent"]["type"] == "script":
-            run_agent_command(config, task)
+        timeout = config["cutoff"]
+        print(f"Running Python function '{config['func_path']}' with timeout {timeout}")
+
+        parent_conn, child_conn = Pipe()
+
+        # Import the specific agent dynamically
+        module_name = config["func_path"].replace("/", ".").rstrip(".py")
+        module = importlib.import_module(module_name)
+        run_specific_agent = getattr(module, "run_specific_agent")
+
+        process = Process(target=run_specific_agent, args=(task, child_conn))
+        process.start()
+        start_time = time.time()
+
+        while True:
+            if (
+                parent_conn.poll()
+            ):  # Check if there's a new message from the child process
+                response, cycle_count = parent_conn.recv()
+                print(f"Cycle {cycle_count}: {response}")
+
+                if cycle_count >= config["cutoff"]:
+                    print(
+                        f"Cycle count has reached the limit of {config['cutoff']}. Terminating."
+                    )
+                    child_conn.send("terminate")
+                    break
+
+            if time.time() - start_time > timeout:
+                print(
+                    "The Python function has exceeded the time limit and was terminated."
+                )
+                child_conn.send(
+                    "terminate"
+                )  # Send a termination signal to the child process
+                break
+
+            if not process.is_alive():
+                print("The Python function has finished running.")
+                break
+
+        process.join()
 
 
 ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
-
-
-def run_agent_command(config, task):
-    path = config["agent"]["path"]
-
-    if ENVIRONMENT == "local":
-        AGENT_NAME = os.getenv("AGENT_NAME")
-        path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")
-
-    timeout = config["agent"]["cutoff"] or sys.maxsize
-    print(f"Running {task} with timeout {timeout}")
-
-    command_from_config = config["agent"]["script"]
-    command_list = command_from_config.split()
-
-    # replace '{}' with the task
-    command_list = [cmd if cmd != "{}" else task for cmd in command_list]
-    print("path, command_list", path, command_list)
-    start_time = time.time()
-    proc = subprocess.Popen(
-        command_list,
-        cwd=path,
-        shell=True,
-    )
-
-    while True:
-        if time.time() - start_time > timeout:
-            print("The subprocess has exceeded the time limit and was terminated.")
-            proc.terminate()
-            break
-
-        if proc.poll() is not None:
-            print("The subprocess has finished running.")
-            break
-
-
-def run_agent_function(config, task):
-    timeout = (
-        config["cutoff"]["count"] if config["cutoff"]["type"] == "time" else sys.maxsize
-    )
-    print(
-        f"Running Python function '{config['agent']['function']}' with timeout {timeout}"
-    )
-
-    parent_conn, child_conn = Pipe()
-    process = Process(target=run_specific_agent, args=(task, child_conn))
-    process.start()
-    start_time = time.time()
-
-    while True:
-        if parent_conn.poll():  # Check if there's a new message from the child process
-            response, cycle_count = parent_conn.recv()
-            print(f"Cycle {cycle_count}: {response}")
-
-            if cycle_count >= config["cutoff"]["count"]:
-                print(
-                    f"Cycle count has reached the limit of {config['cutoff']['count']}. Terminating."
-                )
-                child_conn.send("terminate")
-                break
-
-        if time.time() - start_time > timeout:
-            print("The Python function has exceeded the time limit and was terminated.")
-            child_conn.send(
-                "terminate"
-            )  # Send a termination signal to the child process
-            break
-
-        if not process.is_alive():
-            print("The Python function has finished running.")
-            break
-
-    process.join()
diff --git a/agbenchmark/config.json b/agbenchmark/config.json
index 7388085d..d9b42ca4 100644
--- a/agbenchmark/config.json
+++ b/agbenchmark/config.json
@@ -1,10 +1,5 @@
 {
   "workspace": "C:\\Users\\silen\\miniagi",
-  "agent": {
-    "type": "script",
-    "path": "",
-    "script": "python miniagi.py {}",
-    "user_input": "\n",
-    "cutoff": 60
-  }
+  "func_path": "agent/benchmarks.py",
+  "cutoff": 60
 }
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index c9f3643c..fe395cd2 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -29,7 +29,17 @@ def start(category, noreg, mock):
 
         config["workspace"] = click.prompt(
            "Please enter a new workspace path",
-            default=os.path.join(Path.home(), "miniagi"),
+            default=os.path.join(Path.home(), "workspace"),
+        )
+
+        config["func_path"] = click.prompt(
+            "Please enter a the path to your run_specific_agent function implementation",
+            default="/benchmarks.py",
+        )
+
+        config["cutoff"] = click.prompt(
+            "Please enter a hard cutoff runtime for your agent",
+            default="60",
         )
 
     with open(config_dir, "w") as f:
diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json
index d13b763c..9e26dfee 100644
--- a/agbenchmark/tests/regression/regression_tests.json
+++ b/agbenchmark/tests/regression/regression_tests.json
@@ -1,7 +1 @@
-{
-    "TestWriteFile": {
-        "difficulty": "basic",
-        "dependencies": [],
-        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0]"
-    }
-}
\ No newline at end of file
+{}
\ No newline at end of file
diff --git a/agent/benchmarks.py b/agent/benchmarks.py
new file mode 100644
index 00000000..eb66412c
--- /dev/null
+++ b/agent/benchmarks.py
@@ -0,0 +1,15 @@
+# import subprocess
+
+
+def run_specific_agent(task, conn):
+    cycle_count = 0
+    while (
+        not conn.poll()
+    ):  # Check if there's a termination signal from the main process
+        response = run_agent(task)  # run the agent and get the response and cycle count
+
+        if response:
+            cycle_count += 1
+
+        # Send response and cycle count back to the main process
+        conn.send((response, cycle_count))
diff --git a/agent/hook.py b/agent/hook.py
deleted file mode 100644
index 6fa53418..00000000
--- a/agent/hook.py
+++ /dev/null
@@ -1,10 +0,0 @@
-async def run_specific_agent(task, conn):
-    while (
-        not conn.poll()
-    ):  # Check if there's a termination signal from the main process
-        response, cycle_count = await run_agent(
-            task
-        )  # run the agent and get the response and cycle count
-
-        # Send response and cycle count back to the main process
-        conn.send((response, cycle_count))
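Below is a minimal, self-contained sketch of the agent-side entry point that the reworked harness imports from config["func_path"] (agent/benchmarks.py in the committed config). It is an illustration, not part of this diff: execute_agent_step is a hypothetical placeholder for whatever the agent actually does per cycle, while the run_specific_agent(task, conn) signature, the conn.poll() check for the harness's "terminate" message, and the (response, cycle_count) tuples sent back over the pipe come from the changes above.

# Sketch only -- execute_agent_step is a made-up placeholder, not defined anywhere
# in this diff; replace it with a call into the agent under test.


def execute_agent_step(task: str, cycle: int) -> str:
    # Hypothetical single cycle of agent work.
    return f"completed cycle {cycle} of task: {task}"


def run_specific_agent(task: str, conn) -> None:
    cycle_count = 0
    while not conn.poll():  # stop once the harness sends its "terminate" message
        cycle_count += 1
        response = execute_agent_step(task, cycle_count)

        # Report progress back to the harness after every cycle; the harness
        # compares cycle_count against config["cutoff"] and also enforces a
        # wall-clock timeout on the whole child process.
        conn.send((response, cycle_count))

With a file like this in place, run_agent in agbenchmark/agent_interface.py converts the configured path to a module name, imports it with importlib, runs run_specific_agent in a child Process, and terminates it once the cycle or time limit from config["cutoff"] is reached.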