From c2269397f16607c27597af9e88d86e8b673c7a61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Nov=C3=A1k?= Date: Fri, 11 Aug 2023 09:04:08 +0200 Subject: [PATCH] Use agent protocol (#278) Signed-off-by: Jakub Novak --- .gitmodules | 4 +-- agbenchmark/agent_api_interface.py | 53 ++++++++++++++++++++++++++++++ agbenchmark/agent_interface.py | 46 +++++++++----------------- agbenchmark/conftest.py | 25 +++++++++++++- agbenchmark/generate_test.py | 11 +++++-- agbenchmark/reports/reports.py | 2 +- agbenchmark/start_benchmark.py | 23 +++++++++---- agbenchmark/utils/challenge.py | 19 ++++++++--- poetry.lock | 50 +++++++++++++++++++++++++++- pyproject.toml | 3 ++ 10 files changed, 188 insertions(+), 48 deletions(-) create mode 100644 agbenchmark/agent_api_interface.py diff --git a/.gitmodules b/.gitmodules index 25e78d0b..de4680ff 100644 --- a/.gitmodules +++ b/.gitmodules @@ -12,8 +12,8 @@ branch = benchmark-integration [submodule "agent/smol-developer"] path = agent/smol-developer - url = https://github.com/merwanehamadi/developer.git - branch = benchmark-integration + url = https://github.com/e2b-dev/smol-developer.git + branch = benchmarks [submodule "agent/SuperAGI"] path = agent/SuperAGI url = https://github.com/SilenNaihin/SuperAGI.git diff --git a/agbenchmark/agent_api_interface.py b/agbenchmark/agent_api_interface.py new file mode 100644 index 00000000..1e3657e6 --- /dev/null +++ b/agbenchmark/agent_api_interface.py @@ -0,0 +1,53 @@ +import os +import time +from typing import Any, Dict + +from agent_protocol_client import AgentApi, ApiClient, Configuration, TaskRequestBody + +from agbenchmark.agent_interface import get_list_of_file_paths +from agbenchmark.utils.data_types import ChallengeData + + +async def run_api_agent( + task: ChallengeData, config: Dict[str, Any], artifacts_location: str, timeout: int +) -> None: + configuration = Configuration(host=config["host"]) + async with ApiClient(configuration) as api_client: + api_instance = AgentApi(api_client) + task_request_body = TaskRequestBody(input=task.task) + + start_time = time.time() + response = await api_instance.create_agent_task( + task_request_body=task_request_body + ) + task_id = response.task_id + + for file in get_list_of_file_paths(artifacts_location, "artifacts_in"): + print(f"[{task.name}] - Copy {file.split('/')[-1]} to agent") + await api_instance.upload_agent_task_artifacts(task_id=task_id, file=file) + + i = 1 + while step := await api_instance.execute_agent_task_step(task_id=task_id): + print(f"[{task.name}] - step {step.name} ({i}. request)") + i += 1 + + if step.is_last: + break + + if time.time() - start_time > timeout: + raise TimeoutError("Time limit exceeded") + + artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id) + for artifact in artifacts: + print(f"[{task.name}] - Copy {artifact.file_name} from agent") + + if artifact.relative_path: + folder_path = os.path.join(config["workspace"], artifact.relative_path) + else: + folder_path = os.path.join(config["workspace"]) + + with open(os.path.join(folder_path, artifact.file_name), "wb") as f: + content = await api_instance.download_agent_task_artifact( + task_id=task_id, artifact_id=artifact.artifact_id + ) + f.write(content) diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index afa5d5b1..016c6373 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -4,7 +4,7 @@ import shutil import subprocess import sys import time -from typing import Any, Dict +from typing import List import psutil from dotenv import load_dotenv @@ -13,33 +13,16 @@ from agbenchmark.start_benchmark import CURRENT_DIRECTORY, HOME_DIRECTORY load_dotenv() -mock_test_str = os.getenv("MOCK_TEST") -MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS") HELICONE_GRAPHQL_LOGS = ( helicone_graphql_logs.lower() == "true" if helicone_graphql_logs else False ) -def run_agent( - task: str, config: Dict[str, Any], artifacts_location: str, cutoff: int -) -> None: +def run_agent(task: str, timeout: int) -> None: """Calling to get a response""" - if task == "": - return - if MOCK_FLAG: - print("Running mock agent") - copy_artifacts_into_workspace( - config["workspace"], "artifacts_out", artifacts_location - ) - return - entry_path = "agbenchmark.benchmarks" - timeout = cutoff - if "--nc" in sys.argv: - timeout = 100000 - if "--cutoff" in sys.argv: - timeout = int(sys.argv[sys.argv.index("--cutoff") + 1]) + entry_path = "agbenchmark.benchmarks" print(f"Running '{entry_path}' with timeout {timeout}") @@ -84,19 +67,22 @@ def run_agent( print(f"The agent timed out") -def copy_artifacts_into_workspace( - workspace: str, artifact_folder_name: str, challenge_dir_path: str -) -> None: +def get_list_of_file_paths( + challenge_dir_path: str, artifact_folder_name: str +) -> List[str]: # this file is at agbenchmark\agent_interface.py source_dir = os.path.join( CURRENT_DIRECTORY, "..", challenge_dir_path, artifact_folder_name ) - - # Check if source_dir exists, if not then return immediately. if not os.path.exists(source_dir): - return + return [] + return [os.path.join(source_dir, file_name) for file_name in os.listdir(source_dir)] - for file_name in os.listdir(source_dir): - full_file_name = os.path.join(source_dir, file_name) - if os.path.isfile(full_file_name): - shutil.copy(full_file_name, workspace) + +def copy_artifacts_into_workspace( + workspace: str, artifact_folder_name: str, challenge_dir_path: str +) -> None: + file_paths = get_list_of_file_paths(challenge_dir_path, artifact_folder_name) + for file_path in file_paths: + if os.path.isfile(file_path): + shutil.copy(file_path, workspace) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 75030906..a535eed9 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -1,6 +1,7 @@ import json import os import shutil +import subprocess import sys import threading import time @@ -15,7 +16,7 @@ from agbenchmark.reports.reports import ( generate_single_call_report, session_finish, ) -from agbenchmark.start_benchmark import CONFIG_PATH, get_regression_data +from agbenchmark.start_benchmark import CONFIG_PATH, HOME_DIRECTORY, get_regression_data from agbenchmark.utils.data_types import SuiteConfig GLOBAL_TIMEOUT = ( @@ -245,3 +246,25 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None: # Add category marker dynamically for category in categories: item.add_marker(getattr(pytest.mark, category)) + + +@pytest.fixture(scope="session", autouse=True) +def run_agent(request: Any) -> Any: + with open(CONFIG_PATH, "r") as f: + config = json.load(f) + + if config.get("api_mode"): + command = [sys.executable, "-m", "agbenchmark.benchmarks"] + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + cwd=HOME_DIRECTORY, + ) + time.sleep(3) + yield + print(f"Terminating agent") + process.terminate() + else: + yield diff --git a/agbenchmark/generate_test.py b/agbenchmark/generate_test.py index ff72bff0..a34be80a 100644 --- a/agbenchmark/generate_test.py +++ b/agbenchmark/generate_test.py @@ -97,7 +97,7 @@ def create_single_test( ) # Define test method within the dynamically created class - def test_method(self, config: Dict[str, Any], request) -> None: # type: ignore + async def test_method(self, config: Dict[str, Any], request) -> None: # type: ignore # create a random number between 0 and 1 test_name = self.data.name @@ -121,7 +121,14 @@ def create_single_test( HeliconeLockManager.write_custom_property("challenge", self.data.name) cutoff = self.data.cutoff or 60 - self.setup_challenge(config, cutoff) + + timeout = cutoff + if "--nc" in sys.argv: + timeout = 100000 + if "--cutoff" in sys.argv: + timeout = int(sys.argv[sys.argv.index("--cutoff") + 1]) + + await self.setup_challenge(config, timeout) scores = self.get_scores(config) request.node.scores = scores # store scores in request.node diff --git a/agbenchmark/reports/reports.py b/agbenchmark/reports/reports.py index 1e9146cf..d482532f 100644 --- a/agbenchmark/reports/reports.py +++ b/agbenchmark/reports/reports.py @@ -4,10 +4,10 @@ import sys from pathlib import Path from typing import Any, Dict -from agbenchmark.agent_interface import MOCK_FLAG from agbenchmark.reports.ReportManager import ReportManager from agbenchmark.start_benchmark import ( CONFIG_PATH, + MOCK_FLAG, REGRESSION_TESTS_PATH, REPORTS_PATH, SUCCESS_RATE_PATH, diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index f296e96d..e46e369e 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -22,6 +22,7 @@ if os.environ.get("HELICONE_API_KEY"): HeliconeLockManager.write_custom_property( "benchmark_start_time", BENCHMARK_START_TIME ) +MOCK_FLAG = os.getenv("MOCK_TEST", "").lower() == "true" ( @@ -141,21 +142,29 @@ def start( ) return 1 - if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: + if os.path.exists(CONFIG_PATH) and os.stat(CONFIG_PATH).st_size: + # If the configuration file exists and is not empty, load it + with open(CONFIG_PATH, "r") as f: + config = json.load(f) + else: config = {} + if not config.get("workspace"): config["workspace"] = click.prompt( "Please enter a new workspace path", default=os.path.join("workspace"), show_default=True, ) - with open(CONFIG_PATH, "w") as f: - json.dump(config, f) - else: - # If the configuration file exists and is not empty, load it - with open(CONFIG_PATH, "r") as f: - config = json.load(f) + if config.get("api_mode") and not config.get("host"): + config["host"] = click.prompt( + "Please enter the Agent API host address", + default="http://localhost:8000", + show_default=True, + ) + + with open(CONFIG_PATH, "w") as f: + json.dump(config, f) print("Current configuration:") for key, value in config.items(): diff --git a/agbenchmark/utils/challenge.py b/agbenchmark/utils/challenge.py index efee9cc7..d130f486 100644 --- a/agbenchmark/utils/challenge.py +++ b/agbenchmark/utils/challenge.py @@ -9,8 +9,8 @@ from typing import Any, Dict, List import openai import pytest -from agbenchmark.agent_interface import MOCK_FLAG -from agbenchmark.start_benchmark import OPTIONAL_CATEGORIES +from agbenchmark.agent_api_interface import run_api_agent +from agbenchmark.start_benchmark import MOCK_FLAG, OPTIONAL_CATEGORIES from agbenchmark.utils.data_types import ChallengeData, Ground from agbenchmark.utils.prompts import ( END_PROMPT, @@ -46,7 +46,10 @@ class Challenge(ABC): def dependencies(self) -> list: return self.data.dependencies - def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None: + async def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None: + if not self.task: + return + from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent copy_artifacts_into_workspace( @@ -58,7 +61,15 @@ class Challenge(ABC): ) print(f"\033[1;30mTask: {self.task}\033[0m") - run_agent(self.task, config, self.ARTIFACTS_LOCATION, cutoff) + if MOCK_FLAG: + print("Running mock agent") + copy_artifacts_into_workspace( + config["workspace"], "artifacts_out", self.ARTIFACTS_LOCATION + ) + elif config.get("api_mode"): + await run_api_agent(self.data, config, self.ARTIFACTS_LOCATION, cutoff) + else: + run_agent(self.task, cutoff) # hidden files are added after the agent runs. Hidden files can be python test files. # We copy them in the workspace to make it easy to import the code produced by the agent diff --git a/poetry.lock b/poetry.lock index 025c3867..b05e5d0d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,17 @@ # This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +[[package]] +name = "aenum" +version = "3.1.15" +description = "Advanced Enumerations (compatible with Python's stdlib Enum), NamedTuples, and NamedConstants" +optional = false +python-versions = "*" +files = [ + {file = "aenum-3.1.15-py2-none-any.whl", hash = "sha256:27b1710b9d084de6e2e695dab78fe9f269de924b51ae2850170ee7e1ca6288a5"}, + {file = "aenum-3.1.15-py3-none-any.whl", hash = "sha256:e0dfaeea4c2bd362144b87377e2c61d91958c5ed0b4daf89cb6f45ae23af6288"}, + {file = "aenum-3.1.15.tar.gz", hash = "sha256:8cbd76cd18c4f870ff39b24284d3ea028fbe8731a58df3aa581e434c575b9559"}, +] + [[package]] name = "agent-protocol" version = "0.2.3" @@ -21,6 +33,24 @@ pytest = ">=7.0.0,<8.0.0" python-multipart = ">=0.0.6,<0.0.7" requests = ">=2.31.0,<3.0.0" +[[package]] +name = "agent-protocol-client" +version = "0.2.2" +description = "Agent Communication Protocol Client" +optional = false +python-versions = ">=3.10,<4.0" +files = [ + {file = "agent_protocol_client-0.2.2-py3-none-any.whl", hash = "sha256:e46f506f74a20d7b4bec65c852598e889a05789f803fe384b758972049b23c96"}, + {file = "agent_protocol_client-0.2.2.tar.gz", hash = "sha256:536197eee29cb24504458b8401c2a4cf22b8e1e5ad0924af05fb1f76754b09c2"}, +] + +[package.dependencies] +aenum = ">=3.1.11" +aiohttp = ">=3.8.4" +pydantic = ">=1.10.5,<2.0.0" +python-dateutil = ">=2.8.2" +urllib3 = ">=1.25.3" + [[package]] name = "aiofiles" version = "23.1.0" @@ -2149,6 +2179,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-asyncio" +version = "0.21.1" +description = "Pytest support for asyncio" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-asyncio-0.21.1.tar.gz", hash = "sha256:40a7eae6dded22c7b604986855ea48400ab15b069ae38116e8c01238e9eeb64d"}, + {file = "pytest_asyncio-0.21.1-py3-none-any.whl", hash = "sha256:8666c1c8ac02631d7c51ba282e0c69a8a452b211ffedf2599099845da5c5c37b"}, +] + +[package.dependencies] +pytest = ">=7.0.0" + +[package.extras] +docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"] +testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"] + [[package]] name = "python-dateutil" version = "2.8.2" @@ -2716,4 +2764,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "855fe892a4401b03bbda926df20c140ea5143574ff5d04b6168472a2d162b9dd" +content-hash = "8f1d3f47e03dd229fec21c8162cebecf871b7f7c093bd32ff5c2f13ecc69b4b6" diff --git a/pyproject.toml b/pyproject.toml index 8cd1430b..1d39c712 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ colorama = "^0.4.6" pyvis = "^0.3.2" selenium = "^4.11.2" agent-protocol = "^0.2.3" +agent-protocol-client = "^0.2.2" [tool.poetry.group.dev.dependencies] flake8 = "^3.9.2" @@ -38,6 +39,7 @@ pandas = "^2.0.3" gspread = "^5.10.0" oauth2client = "^4.1.3" pre-commit = "^3.3.3" +pytest-asyncio = "^0.21.1" [build-system] requires = ["poetry-core"] @@ -49,6 +51,7 @@ addopts = "-ra -q" testpaths = [ "tests", "agbenchmark", ] +asyncio_mode = "auto" markers = [ "retrieval", "interface",