Mirror of https://github.com/aljazceru/Auto-GPT.git (synced 2026-01-07)
.gitmodules (vendored, 4 changed lines)
@@ -12,8 +12,8 @@
 branch = benchmark-integration
 [submodule "agent/smol-developer"]
 path = agent/smol-developer
-url = https://github.com/merwanehamadi/developer.git
-branch = benchmark-integration
+url = https://github.com/e2b-dev/smol-developer.git
+branch = benchmarks
 [submodule "agent/SuperAGI"]
 path = agent/SuperAGI
 url = https://github.com/SilenNaihin/SuperAGI.git
agbenchmark/agent_api_interface.py (new file, 53 lines)
@@ -0,0 +1,53 @@
+import os
+import time
+from typing import Any, Dict
+
+from agent_protocol_client import AgentApi, ApiClient, Configuration, TaskRequestBody
+
+from agbenchmark.agent_interface import get_list_of_file_paths
+from agbenchmark.utils.data_types import ChallengeData
+
+
+async def run_api_agent(
+    task: ChallengeData, config: Dict[str, Any], artifacts_location: str, timeout: int
+) -> None:
+    configuration = Configuration(host=config["host"])
+    async with ApiClient(configuration) as api_client:
+        api_instance = AgentApi(api_client)
+        task_request_body = TaskRequestBody(input=task.task)
+
+        start_time = time.time()
+        response = await api_instance.create_agent_task(
+            task_request_body=task_request_body
+        )
+        task_id = response.task_id
+
+        for file in get_list_of_file_paths(artifacts_location, "artifacts_in"):
+            print(f"[{task.name}] - Copy {file.split('/')[-1]} to agent")
+            await api_instance.upload_agent_task_artifacts(task_id=task_id, file=file)
+
+        i = 1
+        while step := await api_instance.execute_agent_task_step(task_id=task_id):
+            print(f"[{task.name}] - step {step.name} ({i}. request)")
+            i += 1
+
+            if step.is_last:
+                break
+
+            if time.time() - start_time > timeout:
+                raise TimeoutError("Time limit exceeded")
+
+        artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
+        for artifact in artifacts:
+            print(f"[{task.name}] - Copy {artifact.file_name} from agent")
+
+            if artifact.relative_path:
+                folder_path = os.path.join(config["workspace"], artifact.relative_path)
+            else:
+                folder_path = os.path.join(config["workspace"])
+
+            with open(os.path.join(folder_path, artifact.file_name), "wb") as f:
+                content = await api_instance.download_agent_task_artifact(
+                    task_id=task_id, artifact_id=artifact.artifact_id
+                )
+                f.write(content)
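
run_api_agent drives a benchmark task over the Agent Protocol REST API: create the task, upload the challenge's input artifacts, execute steps until one reports is_last (raising TimeoutError past the deadline), then download the produced artifacts into the workspace. A minimal sketch of calling it directly, assuming an agent serving the protocol on localhost:8000; the FakeChallenge stand-in and all paths below are hypothetical, not from this diff:

import asyncio
from dataclasses import dataclass

from agbenchmark.agent_api_interface import run_api_agent


@dataclass
class FakeChallenge:
    # Stand-in for ChallengeData; run_api_agent only reads .task and .name.
    task: str
    name: str


# Assumed keys: run_api_agent reads config["host"] and config["workspace"].
config = {"host": "http://localhost:8000", "workspace": "workspace"}

asyncio.run(
    run_api_agent(
        FakeChallenge(task="Write 'hello' to output.txt", name="WriteFile"),
        config,
        "agbenchmark/challenges/interface/write_file",  # hypothetical artifacts dir
        timeout=60,
    )
)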
agbenchmark/agent_interface.py
@@ -4,7 +4,7 @@ import shutil
 import subprocess
 import sys
 import time
-from typing import Any, Dict
+from typing import List
 
 import psutil
 from dotenv import load_dotenv
@@ -13,33 +13,16 @@ from agbenchmark.start_benchmark import CURRENT_DIRECTORY, HOME_DIRECTORY
 
 load_dotenv()
 
-mock_test_str = os.getenv("MOCK_TEST")
-MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False
 helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS")
 HELICONE_GRAPHQL_LOGS = (
     helicone_graphql_logs.lower() == "true" if helicone_graphql_logs else False
 )
 
 
-def run_agent(
-    task: str, config: Dict[str, Any], artifacts_location: str, cutoff: int
-) -> None:
+def run_agent(task: str, timeout: int) -> None:
     """Calling to get a response"""
-    if task == "":
-        return
-    if MOCK_FLAG:
-        print("Running mock agent")
-        copy_artifacts_into_workspace(
-            config["workspace"], "artifacts_out", artifacts_location
-        )
-        return
-    entry_path = "agbenchmark.benchmarks"
-
-    timeout = cutoff
-    if "--nc" in sys.argv:
-        timeout = 100000
-    if "--cutoff" in sys.argv:
-        timeout = int(sys.argv[sys.argv.index("--cutoff") + 1])
+    entry_path = "agbenchmark.benchmarks"
 
     print(f"Running '{entry_path}' with timeout {timeout}")
 
@@ -84,19 +67,22 @@ def run_agent(
        print(f"The agent timed out")
 
 
-def copy_artifacts_into_workspace(
-    workspace: str, artifact_folder_name: str, challenge_dir_path: str
-) -> None:
+def get_list_of_file_paths(
+    challenge_dir_path: str, artifact_folder_name: str
+) -> List[str]:
     # this file is at agbenchmark\agent_interface.py
     source_dir = os.path.join(
         CURRENT_DIRECTORY, "..", challenge_dir_path, artifact_folder_name
     )
 
     # Check if source_dir exists, if not then return immediately.
     if not os.path.exists(source_dir):
-        return
+        return []
+    return [os.path.join(source_dir, file_name) for file_name in os.listdir(source_dir)]
 
-    for file_name in os.listdir(source_dir):
-        full_file_name = os.path.join(source_dir, file_name)
-        if os.path.isfile(full_file_name):
-            shutil.copy(full_file_name, workspace)
+
+def copy_artifacts_into_workspace(
+    workspace: str, artifact_folder_name: str, challenge_dir_path: str
+) -> None:
+    file_paths = get_list_of_file_paths(challenge_dir_path, artifact_folder_name)
+    for file_path in file_paths:
+        if os.path.isfile(file_path):
+            shutil.copy(file_path, workspace)
 
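
The refactor splits the old copy helper in two: get_list_of_file_paths resolves a challenge's artifact folder (returning [] when it does not exist), and copy_artifacts_into_workspace becomes a thin wrapper over it. That split is what lets the API interface above upload files one at a time while the subprocess path keeps bulk-copying. A short usage sketch, with a hypothetical challenge directory:

from agbenchmark.agent_interface import (
    copy_artifacts_into_workspace,
    get_list_of_file_paths,
)

# Hypothetical challenge directory; the helper returns [] if it does not exist.
paths = get_list_of_file_paths("agbenchmark/challenges/interface/write_file", "artifacts_in")
print(paths)

# Equivalent to the old single-purpose behavior: copy every input artifact over.
copy_artifacts_into_workspace(
    "workspace", "artifacts_in", "agbenchmark/challenges/interface/write_file"
)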
agbenchmark/conftest.py
@@ -1,6 +1,7 @@
 import json
 import os
 import shutil
+import subprocess
 import sys
 import threading
 import time
@@ -15,7 +16,7 @@ from agbenchmark.reports.reports import (
     generate_single_call_report,
     session_finish,
 )
-from agbenchmark.start_benchmark import CONFIG_PATH, get_regression_data
+from agbenchmark.start_benchmark import CONFIG_PATH, HOME_DIRECTORY, get_regression_data
 from agbenchmark.utils.data_types import SuiteConfig
 
 GLOBAL_TIMEOUT = (
@@ -245,3 +246,25 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None:
         # Add category marker dynamically
         for category in categories:
             item.add_marker(getattr(pytest.mark, category))
+
+
+@pytest.fixture(scope="session", autouse=True)
+def run_agent(request: Any) -> Any:
+    with open(CONFIG_PATH, "r") as f:
+        config = json.load(f)
+
+    if config.get("api_mode"):
+        command = [sys.executable, "-m", "agbenchmark.benchmarks"]
+        process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            universal_newlines=True,
+            cwd=HOME_DIRECTORY,
+        )
+        time.sleep(3)
+        yield
+        print(f"Terminating agent")
+        process.terminate()
+    else:
+        yield
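
When api_mode is set, this autouse session fixture boots the agent subprocess once per pytest run and terminates it afterwards; the fixed time.sleep(3) is a crude readiness wait. A hedged sketch of a readiness poll that could stand in for the sleep (this helper is illustrative only, not part of the diff):

import time
import urllib.error
import urllib.request


def wait_until_ready(host: str, deadline_s: float = 30.0) -> None:
    # Poll the agent's HTTP endpoint until it answers, instead of sleeping a
    # fixed 3 seconds. Any HTTP response (even an error status) means it is up.
    start = time.time()
    while time.time() - start < deadline_s:
        try:
            urllib.request.urlopen(host, timeout=1)
            return
        except urllib.error.HTTPError:
            return  # the server responded, so it is ready
        except OSError:
            time.sleep(0.2)  # not listening yet; retry
    raise TimeoutError(f"agent API at {host} never came up")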
agbenchmark/generate_test.py
@@ -97,7 +97,7 @@ def create_single_test(
     )
 
     # Define test method within the dynamically created class
-    def test_method(self, config: Dict[str, Any], request) -> None:  # type: ignore
+    async def test_method(self, config: Dict[str, Any], request) -> None:  # type: ignore
         # create a random number between 0 and 1
         test_name = self.data.name
 
@@ -121,7 +121,14 @@ def create_single_test(
             HeliconeLockManager.write_custom_property("challenge", self.data.name)
 
         cutoff = self.data.cutoff or 60
-        self.setup_challenge(config, cutoff)
+
+        timeout = cutoff
+        if "--nc" in sys.argv:
+            timeout = 100000
+        if "--cutoff" in sys.argv:
+            timeout = int(sys.argv[sys.argv.index("--cutoff") + 1])
+
+        await self.setup_challenge(config, timeout)
 
         scores = self.get_scores(config)
         request.node.scores = scores  # store scores in request.node
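
Timeout resolution thus moves out of run_agent and into the generated test itself: the challenge's own cutoff is the default, --nc effectively disables the limit, and --cutoff overrides both. A runnable restatement of that precedence (the CLI values below are hypothetical):

import sys


def resolve_timeout(cutoff: int) -> int:
    # Mirrors the precedence in test_method above: --cutoff beats --nc,
    # which beats the per-challenge cutoff.
    timeout = cutoff
    if "--nc" in sys.argv:
        timeout = 100000
    if "--cutoff" in sys.argv:
        timeout = int(sys.argv[sys.argv.index("--cutoff") + 1])
    return timeout


sys.argv = ["agbenchmark", "--nc"]            # hypothetical invocation
assert resolve_timeout(60) == 100000
sys.argv = ["agbenchmark", "--cutoff", "120"]
assert resolve_timeout(60) == 120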
agbenchmark/reports/reports.py
@@ -4,10 +4,10 @@ import sys
 from pathlib import Path
 from typing import Any, Dict
 
-from agbenchmark.agent_interface import MOCK_FLAG
 from agbenchmark.reports.ReportManager import ReportManager
 from agbenchmark.start_benchmark import (
     CONFIG_PATH,
+    MOCK_FLAG,
     REGRESSION_TESTS_PATH,
     REPORTS_PATH,
     SUCCESS_RATE_PATH,
agbenchmark/start_benchmark.py
@@ -22,6 +22,7 @@ if os.environ.get("HELICONE_API_KEY"):
     HeliconeLockManager.write_custom_property(
         "benchmark_start_time", BENCHMARK_START_TIME
     )
+MOCK_FLAG = os.getenv("MOCK_TEST", "").lower() == "true"
 
 
 (
@@ -141,21 +142,29 @@ def start(
         )
         return 1
 
-    if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
+    if os.path.exists(CONFIG_PATH) and os.stat(CONFIG_PATH).st_size:
+        # If the configuration file exists and is not empty, load it
+        with open(CONFIG_PATH, "r") as f:
+            config = json.load(f)
+    else:
+        config = {}
+
+    if not config.get("workspace"):
         config["workspace"] = click.prompt(
             "Please enter a new workspace path",
             default=os.path.join("workspace"),
             show_default=True,
         )
 
-        with open(CONFIG_PATH, "w") as f:
-            json.dump(config, f)
-    else:
-        # If the configuration file exists and is not empty, load it
-        with open(CONFIG_PATH, "r") as f:
-            config = json.load(f)
+    if config.get("api_mode") and not config.get("host"):
+        config["host"] = click.prompt(
+            "Please enter the Agent API host address",
+            default="http://localhost:8000",
+            show_default=True,
+        )
+
+    with open(CONFIG_PATH, "w") as f:
+        json.dump(config, f)
 
     print("Current configuration:")
     for key, value in config.items():
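
The config bootstrap is restructured: load the file if it exists, prompt only for keys that are missing (workspace always, host only when api_mode is on), and always write the merged result back, so adding the API settings did not need another branch. Assuming api_mode was set beforehand and both defaults are accepted, the resulting config.json would look roughly like this (illustrative values):

# Approximate contents of config.json after accepting both prompt defaults;
# "api_mode" itself is assumed to have been set by hand or by tooling.
expected_config = {
    "workspace": "workspace",
    "api_mode": True,
    "host": "http://localhost:8000",
}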
agbenchmark/utils/challenge.py
@@ -9,8 +9,8 @@ from typing import Any, Dict, List
 import openai
 import pytest
 
-from agbenchmark.agent_interface import MOCK_FLAG
-from agbenchmark.start_benchmark import OPTIONAL_CATEGORIES
+from agbenchmark.agent_api_interface import run_api_agent
+from agbenchmark.start_benchmark import MOCK_FLAG, OPTIONAL_CATEGORIES
 from agbenchmark.utils.data_types import ChallengeData, Ground
 from agbenchmark.utils.prompts import (
     END_PROMPT,
@@ -46,7 +46,10 @@ class Challenge(ABC):
     def dependencies(self) -> list:
         return self.data.dependencies
 
-    def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
+    async def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
         if not self.task:
             return
 
+        from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent
+
         copy_artifacts_into_workspace(
@@ -58,7 +61,15 @@ class Challenge(ABC):
         )
         print(f"\033[1;30mTask: {self.task}\033[0m")
 
-        run_agent(self.task, config, self.ARTIFACTS_LOCATION, cutoff)
+        if MOCK_FLAG:
+            print("Running mock agent")
+            copy_artifacts_into_workspace(
+                config["workspace"], "artifacts_out", self.ARTIFACTS_LOCATION
+            )
+        elif config.get("api_mode"):
+            await run_api_agent(self.data, config, self.ARTIFACTS_LOCATION, cutoff)
+        else:
+            run_agent(self.task, cutoff)
 
         # hidden files are added after the agent runs. Hidden files can be python test files.
         # We copy them in the workspace to make it easy to import the code produced by the agent
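
setup_challenge is now the single dispatch point between the three run modes: mock mode fakes success by copying the expected outputs, api_mode talks to a live Agent Protocol server, and everything else falls back to the subprocess runner. A runnable restatement of that precedence (the returned strings are descriptive only):

from typing import Any, Dict


def pick_runner(mock_flag: bool, config: Dict[str, Any]) -> str:
    # Mirrors the dispatch order in setup_challenge above: mock wins,
    # then api_mode, then the legacy subprocess runner.
    if mock_flag:
        return "mock: copy artifacts_out into the workspace"
    if config.get("api_mode"):
        return "api: run_api_agent over the Agent Protocol"
    return "subprocess: run_agent(task, cutoff)"


assert pick_runner(True, {"api_mode": True}) == "mock: copy artifacts_out into the workspace"
assert pick_runner(False, {"api_mode": True}) == "api: run_api_agent over the Agent Protocol"
assert pick_runner(False, {}) == "subprocess: run_agent(task, cutoff)"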
poetry.lock (generated, 50 changed lines)
@@ -1,5 +1,17 @@
 # This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
 
+[[package]]
+name = "aenum"
+version = "3.1.15"
+description = "Advanced Enumerations (compatible with Python's stdlib Enum), NamedTuples, and NamedConstants"
+optional = false
+python-versions = "*"
+files = [
+    {file = "aenum-3.1.15-py2-none-any.whl", hash = "sha256:27b1710b9d084de6e2e695dab78fe9f269de924b51ae2850170ee7e1ca6288a5"},
+    {file = "aenum-3.1.15-py3-none-any.whl", hash = "sha256:e0dfaeea4c2bd362144b87377e2c61d91958c5ed0b4daf89cb6f45ae23af6288"},
+    {file = "aenum-3.1.15.tar.gz", hash = "sha256:8cbd76cd18c4f870ff39b24284d3ea028fbe8731a58df3aa581e434c575b9559"},
+]
+
 [[package]]
 name = "agent-protocol"
 version = "0.2.3"
@@ -21,6 +33,24 @@ pytest = ">=7.0.0,<8.0.0"
 python-multipart = ">=0.0.6,<0.0.7"
 requests = ">=2.31.0,<3.0.0"
 
+[[package]]
+name = "agent-protocol-client"
+version = "0.2.2"
+description = "Agent Communication Protocol Client"
+optional = false
+python-versions = ">=3.10,<4.0"
+files = [
+    {file = "agent_protocol_client-0.2.2-py3-none-any.whl", hash = "sha256:e46f506f74a20d7b4bec65c852598e889a05789f803fe384b758972049b23c96"},
+    {file = "agent_protocol_client-0.2.2.tar.gz", hash = "sha256:536197eee29cb24504458b8401c2a4cf22b8e1e5ad0924af05fb1f76754b09c2"},
+]
+
+[package.dependencies]
+aenum = ">=3.1.11"
+aiohttp = ">=3.8.4"
+pydantic = ">=1.10.5,<2.0.0"
+python-dateutil = ">=2.8.2"
+urllib3 = ">=1.25.3"
+
 [[package]]
 name = "aiofiles"
 version = "23.1.0"
@@ -2149,6 +2179,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
 [package.extras]
 testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
 
+[[package]]
+name = "pytest-asyncio"
+version = "0.21.1"
+description = "Pytest support for asyncio"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytest-asyncio-0.21.1.tar.gz", hash = "sha256:40a7eae6dded22c7b604986855ea48400ab15b069ae38116e8c01238e9eeb64d"},
+    {file = "pytest_asyncio-0.21.1-py3-none-any.whl", hash = "sha256:8666c1c8ac02631d7c51ba282e0c69a8a452b211ffedf2599099845da5c5c37b"},
+]
+
+[package.dependencies]
+pytest = ">=7.0.0"
+
+[package.extras]
+docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"]
+testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
+
 [[package]]
 name = "python-dateutil"
 version = "2.8.2"
@@ -2716,4 +2764,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "855fe892a4401b03bbda926df20c140ea5143574ff5d04b6168472a2d162b9dd"
+content-hash = "8f1d3f47e03dd229fec21c8162cebecf871b7f7c093bd32ff5c2f13ecc69b4b6"
pyproject.toml
@@ -27,6 +27,7 @@ colorama = "^0.4.6"
 pyvis = "^0.3.2"
 selenium = "^4.11.2"
 agent-protocol = "^0.2.3"
+agent-protocol-client = "^0.2.2"
 
 [tool.poetry.group.dev.dependencies]
 flake8 = "^3.9.2"
@@ -38,6 +39,7 @@ pandas = "^2.0.3"
 gspread = "^5.10.0"
 oauth2client = "^4.1.3"
 pre-commit = "^3.3.3"
+pytest-asyncio = "^0.21.1"
 
 [build-system]
 requires = ["poetry-core"]
@@ -49,6 +51,7 @@ addopts = "-ra -q"
 testpaths = [
     "tests", "agbenchmark",
 ]
+asyncio_mode = "auto"
 markers = [
     "retrieval",
     "interface",
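
The asyncio_mode = "auto" setting lets pytest-asyncio collect and run the new async test_method without per-test markers. A minimal sketch of a test it would now pick up (hypothetical test, not from this diff):

import asyncio


# With asyncio_mode = "auto" in pyproject.toml, pytest-asyncio runs this
# coroutine test directly; no @pytest.mark.asyncio decorator is needed.
async def test_agent_roundtrip() -> None:
    await asyncio.sleep(0)  # stand-in for awaiting setup_challenge(...)
    assert True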