Use agent protocol (#278)

Signed-off-by: Jakub Novak <jakub@e2b.dev>
Jakub Novák
2023-08-11 09:04:08 +02:00
committed by GitHub
parent 47c6062092
commit c2269397f1
10 changed files with 188 additions and 48 deletions

.gitmodules vendored

@@ -12,8 +12,8 @@
branch = benchmark-integration
[submodule "agent/smol-developer"]
path = agent/smol-developer
url = https://github.com/merwanehamadi/developer.git
branch = benchmark-integration
url = https://github.com/e2b-dev/smol-developer.git
branch = benchmarks
[submodule "agent/SuperAGI"]
path = agent/SuperAGI
url = https://github.com/SilenNaihin/SuperAGI.git

agbenchmark/agent_api_interface.py

@@ -0,0 +1,53 @@
import os
import time
from typing import Any, Dict

from agent_protocol_client import AgentApi, ApiClient, Configuration, TaskRequestBody

from agbenchmark.agent_interface import get_list_of_file_paths
from agbenchmark.utils.data_types import ChallengeData


async def run_api_agent(
    task: ChallengeData, config: Dict[str, Any], artifacts_location: str, timeout: int
) -> None:
    configuration = Configuration(host=config["host"])
    async with ApiClient(configuration) as api_client:
        api_instance = AgentApi(api_client)
        task_request_body = TaskRequestBody(input=task.task)

        start_time = time.time()
        response = await api_instance.create_agent_task(
            task_request_body=task_request_body
        )
        task_id = response.task_id

        for file in get_list_of_file_paths(artifacts_location, "artifacts_in"):
            print(f"[{task.name}] - Copy {file.split('/')[-1]} to agent")
            await api_instance.upload_agent_task_artifacts(task_id=task_id, file=file)

        i = 1
        while step := await api_instance.execute_agent_task_step(task_id=task_id):
            print(f"[{task.name}] - step {step.name} ({i}. request)")
            i += 1
            if step.is_last:
                break
            if time.time() - start_time > timeout:
                raise TimeoutError("Time limit exceeded")

        artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
        for artifact in artifacts:
            print(f"[{task.name}] - Copy {artifact.file_name} from agent")
            if artifact.relative_path:
                folder_path = os.path.join(config["workspace"], artifact.relative_path)
            else:
                folder_path = os.path.join(config["workspace"])

            with open(os.path.join(folder_path, artifact.file_name), "wb") as f:
                content = await api_instance.download_agent_task_artifact(
                    task_id=task_id, artifact_id=artifact.artifact_id
                )
                f.write(content)
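
For reference, the request loop above can be driven standalone against any Agent Protocol server. A minimal sketch, reusing only the client calls shown in this file; the host and task text are placeholder assumptions, not values from this commit:

import asyncio

from agent_protocol_client import AgentApi, ApiClient, Configuration, TaskRequestBody


async def main() -> None:
    # Placeholder host; in the benchmark it comes from config["host"].
    configuration = Configuration(host="http://localhost:8000")
    async with ApiClient(configuration) as api_client:
        api = AgentApi(api_client)
        task = await api.create_agent_task(
            task_request_body=TaskRequestBody(input="Write 'hello' to output.txt")
        )
        # Step until the agent reports the last step, as run_api_agent does above.
        while step := await api.execute_agent_task_step(task_id=task.task_id):
            print(step.name, step.is_last)
            if step.is_last:
                break


asyncio.run(main())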

agbenchmark/agent_interface.py

@@ -4,7 +4,7 @@ import shutil
import subprocess
import sys
import time
from typing import Any, Dict
from typing import List
import psutil
from dotenv import load_dotenv
@@ -13,33 +13,16 @@ from agbenchmark.start_benchmark import CURRENT_DIRECTORY, HOME_DIRECTORY
load_dotenv()
mock_test_str = os.getenv("MOCK_TEST")
MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False
helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS")
HELICONE_GRAPHQL_LOGS = (
helicone_graphql_logs.lower() == "true" if helicone_graphql_logs else False
)
def run_agent(
task: str, config: Dict[str, Any], artifacts_location: str, cutoff: int
) -> None:
def run_agent(task: str, timeout: int) -> None:
"""Calling to get a response"""
if task == "":
return
if MOCK_FLAG:
print("Running mock agent")
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", artifacts_location
)
return
entry_path = "agbenchmark.benchmarks"
timeout = cutoff
if "--nc" in sys.argv:
timeout = 100000
if "--cutoff" in sys.argv:
timeout = int(sys.argv[sys.argv.index("--cutoff") + 1])
entry_path = "agbenchmark.benchmarks"
print(f"Running '{entry_path}' with timeout {timeout}")
@@ -84,19 +67,22 @@ def run_agent(
print(f"The agent timed out")
def copy_artifacts_into_workspace(
workspace: str, artifact_folder_name: str, challenge_dir_path: str
) -> None:
def get_list_of_file_paths(
challenge_dir_path: str, artifact_folder_name: str
) -> List[str]:
# this file is at agbenchmark\agent_interface.py
source_dir = os.path.join(
CURRENT_DIRECTORY, "..", challenge_dir_path, artifact_folder_name
)
# Check if source_dir exists, if not then return immediately.
if not os.path.exists(source_dir):
return
return []
return [os.path.join(source_dir, file_name) for file_name in os.listdir(source_dir)]
for file_name in os.listdir(source_dir):
full_file_name = os.path.join(source_dir, file_name)
if os.path.isfile(full_file_name):
shutil.copy(full_file_name, workspace)
def copy_artifacts_into_workspace(
workspace: str, artifact_folder_name: str, challenge_dir_path: str
) -> None:
file_paths = get_list_of_file_paths(challenge_dir_path, artifact_folder_name)
for file_path in file_paths:
if os.path.isfile(file_path):
shutil.copy(file_path, workspace)
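
The refactor splits path discovery (get_list_of_file_paths, used by run_api_agent for uploads) from the copy step (copy_artifacts_into_workspace, used for the subprocess and mock paths). An illustrative sketch; the challenge and workspace directories are placeholders:

from agbenchmark.agent_interface import (
    copy_artifacts_into_workspace,
    get_list_of_file_paths,
)

challenge_dir = "agbenchmark/challenges/interface/write_file"  # placeholder challenge dir
workspace = "workspace"                                        # placeholder workspace dir

# API mode: run_api_agent uploads each discovered artifacts_in file to the agent.
for path in get_list_of_file_paths(challenge_dir, "artifacts_in"):
    print("would upload", path)

# Subprocess / mock mode: the same files are copied straight into the workspace.
copy_artifacts_into_workspace(workspace, "artifacts_in", challenge_dir)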

agbenchmark/conftest.py

@@ -1,6 +1,7 @@
import json
import os
import shutil
import subprocess
import sys
import threading
import time
@@ -15,7 +16,7 @@ from agbenchmark.reports.reports import (
generate_single_call_report,
session_finish,
)
from agbenchmark.start_benchmark import CONFIG_PATH, get_regression_data
from agbenchmark.start_benchmark import CONFIG_PATH, HOME_DIRECTORY, get_regression_data
from agbenchmark.utils.data_types import SuiteConfig
GLOBAL_TIMEOUT = (
@@ -245,3 +246,25 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None:
# Add category marker dynamically
for category in categories:
item.add_marker(getattr(pytest.mark, category))
@pytest.fixture(scope="session", autouse=True)
def run_agent(request: Any) -> Any:
    with open(CONFIG_PATH, "r") as f:
        config = json.load(f)

    if config.get("api_mode"):
        command = [sys.executable, "-m", "agbenchmark.benchmarks"]
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
            cwd=HOME_DIRECTORY,
        )
        time.sleep(3)

        yield

        print(f"Terminating agent")
        process.terminate()
    else:
        yield
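
The fixture only launches `python -m agbenchmark.benchmarks` when `api_mode` is set in the benchmark config. A sketch of a config that exercises this path; the file location and values are placeholders (the real file is created by the start_benchmark prompts further below):

import json

# Keys read by conftest.run_agent and run_api_agent (values are placeholders).
config = {
    "workspace": "workspace",         # where artifacts are copied and collected
    "api_mode": True,                 # launch the agent via `python -m agbenchmark.benchmarks`
    "host": "http://localhost:8000",  # Agent Protocol host queried by run_api_agent
}

with open("agbenchmark/config.json", "w") as f:  # placeholder path standing in for CONFIG_PATH
    json.dump(config, f)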

agbenchmark/generate_test.py

@@ -97,7 +97,7 @@ def create_single_test(
)
# Define test method within the dynamically created class
def test_method(self, config: Dict[str, Any], request) -> None: # type: ignore
async def test_method(self, config: Dict[str, Any], request) -> None: # type: ignore
# create a random number between 0 and 1
test_name = self.data.name
@@ -121,7 +121,14 @@ def create_single_test(
HeliconeLockManager.write_custom_property("challenge", self.data.name)
cutoff = self.data.cutoff or 60
self.setup_challenge(config, cutoff)
timeout = cutoff
if "--nc" in sys.argv:
timeout = 100000
if "--cutoff" in sys.argv:
timeout = int(sys.argv[sys.argv.index("--cutoff") + 1])
await self.setup_challenge(config, timeout)
scores = self.get_scores(config)
request.node.scores = scores # store scores in request.node
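
The cutoff handling that previously lived in run_agent now runs here before setup_challenge is awaited. A small worked sketch of the resulting precedence (a later `--cutoff` overrides `--nc`); the helper and values are illustrative only:

def resolve_timeout(cutoff: int, argv: list[str]) -> int:
    # Mirrors the logic added above: --nc lifts the limit, --cutoff overrides it.
    timeout = cutoff
    if "--nc" in argv:
        timeout = 100000
    if "--cutoff" in argv:
        timeout = int(argv[argv.index("--cutoff") + 1])
    return timeout


assert resolve_timeout(60, []) == 60
assert resolve_timeout(60, ["--nc"]) == 100000
assert resolve_timeout(60, ["--nc", "--cutoff", "90"]) == 90  # --cutoff wins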

agbenchmark/reports/reports.py

@@ -4,10 +4,10 @@ import sys
from pathlib import Path
from typing import Any, Dict
from agbenchmark.agent_interface import MOCK_FLAG
from agbenchmark.reports.ReportManager import ReportManager
from agbenchmark.start_benchmark import (
CONFIG_PATH,
MOCK_FLAG,
REGRESSION_TESTS_PATH,
REPORTS_PATH,
SUCCESS_RATE_PATH,

agbenchmark/start_benchmark.py

@@ -22,6 +22,7 @@ if os.environ.get("HELICONE_API_KEY"):
HeliconeLockManager.write_custom_property(
"benchmark_start_time", BENCHMARK_START_TIME
)
MOCK_FLAG = os.getenv("MOCK_TEST", "").lower() == "true"
(
@@ -141,21 +142,29 @@ def start(
)
return 1
if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
if os.path.exists(CONFIG_PATH) and os.stat(CONFIG_PATH).st_size:
# If the configuration file exists and is not empty, load it
with open(CONFIG_PATH, "r") as f:
config = json.load(f)
else:
config = {}
if not config.get("workspace"):
config["workspace"] = click.prompt(
"Please enter a new workspace path",
default=os.path.join("workspace"),
show_default=True,
)
with open(CONFIG_PATH, "w") as f:
json.dump(config, f)
else:
# If the configuration file exists and is not empty, load it
with open(CONFIG_PATH, "r") as f:
config = json.load(f)
if config.get("api_mode") and not config.get("host"):
config["host"] = click.prompt(
"Please enter the Agent API host address",
default="http://localhost:8000",
show_default=True,
)
with open(CONFIG_PATH, "w") as f:
json.dump(config, f)
print("Current configuration:")
for key, value in config.items():

agbenchmark/utils/challenge.py

@@ -9,8 +9,8 @@ from typing import Any, Dict, List
import openai
import pytest
from agbenchmark.agent_interface import MOCK_FLAG
from agbenchmark.start_benchmark import OPTIONAL_CATEGORIES
from agbenchmark.agent_api_interface import run_api_agent
from agbenchmark.start_benchmark import MOCK_FLAG, OPTIONAL_CATEGORIES
from agbenchmark.utils.data_types import ChallengeData, Ground
from agbenchmark.utils.prompts import (
END_PROMPT,
@@ -46,7 +46,10 @@ class Challenge(ABC):
def dependencies(self) -> list:
return self.data.dependencies
def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
async def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
if not self.task:
return
from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent
copy_artifacts_into_workspace(
@@ -58,7 +61,15 @@ class Challenge(ABC):
)
print(f"\033[1;30mTask: {self.task}\033[0m")
run_agent(self.task, config, self.ARTIFACTS_LOCATION, cutoff)
if MOCK_FLAG:
print("Running mock agent")
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", self.ARTIFACTS_LOCATION
)
elif config.get("api_mode"):
await run_api_agent(self.data, config, self.ARTIFACTS_LOCATION, cutoff)
else:
run_agent(self.task, cutoff)
# hidden files are added after the agent runs. Hidden files can be python test files.
# We copy them in the workspace to make it easy to import the code produced by the agent
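
With setup_challenge and the generated test_method now coroutines, the suite depends on pytest-asyncio with asyncio_mode = "auto" (added to pyproject.toml below), which awaits async test functions without an explicit marker. A minimal sketch:

import asyncio


# Collected and awaited automatically under asyncio_mode = "auto";
# the awaited call stands in for `await self.setup_challenge(config, timeout)`.
async def test_example_async() -> None:
    await asyncio.sleep(0)
    assert True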

poetry.lock generated

@@ -1,5 +1,17 @@
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
[[package]]
name = "aenum"
version = "3.1.15"
description = "Advanced Enumerations (compatible with Python's stdlib Enum), NamedTuples, and NamedConstants"
optional = false
python-versions = "*"
files = [
{file = "aenum-3.1.15-py2-none-any.whl", hash = "sha256:27b1710b9d084de6e2e695dab78fe9f269de924b51ae2850170ee7e1ca6288a5"},
{file = "aenum-3.1.15-py3-none-any.whl", hash = "sha256:e0dfaeea4c2bd362144b87377e2c61d91958c5ed0b4daf89cb6f45ae23af6288"},
{file = "aenum-3.1.15.tar.gz", hash = "sha256:8cbd76cd18c4f870ff39b24284d3ea028fbe8731a58df3aa581e434c575b9559"},
]
[[package]]
name = "agent-protocol"
version = "0.2.3"
@@ -21,6 +33,24 @@ pytest = ">=7.0.0,<8.0.0"
python-multipart = ">=0.0.6,<0.0.7"
requests = ">=2.31.0,<3.0.0"
[[package]]
name = "agent-protocol-client"
version = "0.2.2"
description = "Agent Communication Protocol Client"
optional = false
python-versions = ">=3.10,<4.0"
files = [
{file = "agent_protocol_client-0.2.2-py3-none-any.whl", hash = "sha256:e46f506f74a20d7b4bec65c852598e889a05789f803fe384b758972049b23c96"},
{file = "agent_protocol_client-0.2.2.tar.gz", hash = "sha256:536197eee29cb24504458b8401c2a4cf22b8e1e5ad0924af05fb1f76754b09c2"},
]
[package.dependencies]
aenum = ">=3.1.11"
aiohttp = ">=3.8.4"
pydantic = ">=1.10.5,<2.0.0"
python-dateutil = ">=2.8.2"
urllib3 = ">=1.25.3"
[[package]]
name = "aiofiles"
version = "23.1.0"
@@ -2149,6 +2179,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-asyncio"
version = "0.21.1"
description = "Pytest support for asyncio"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytest-asyncio-0.21.1.tar.gz", hash = "sha256:40a7eae6dded22c7b604986855ea48400ab15b069ae38116e8c01238e9eeb64d"},
{file = "pytest_asyncio-0.21.1-py3-none-any.whl", hash = "sha256:8666c1c8ac02631d7c51ba282e0c69a8a452b211ffedf2599099845da5c5c37b"},
]
[package.dependencies]
pytest = ">=7.0.0"
[package.extras]
docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"]
testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
[[package]]
name = "python-dateutil"
version = "2.8.2"
@@ -2716,4 +2764,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "855fe892a4401b03bbda926df20c140ea5143574ff5d04b6168472a2d162b9dd"
content-hash = "8f1d3f47e03dd229fec21c8162cebecf871b7f7c093bd32ff5c2f13ecc69b4b6"

pyproject.toml

@@ -27,6 +27,7 @@ colorama = "^0.4.6"
pyvis = "^0.3.2"
selenium = "^4.11.2"
agent-protocol = "^0.2.3"
agent-protocol-client = "^0.2.2"
[tool.poetry.group.dev.dependencies]
flake8 = "^3.9.2"
@@ -38,6 +39,7 @@ pandas = "^2.0.3"
gspread = "^5.10.0"
oauth2client = "^4.1.3"
pre-commit = "^3.3.3"
pytest-asyncio = "^0.21.1"
[build-system]
requires = ["poetry-core"]
@@ -49,6 +51,7 @@ addopts = "-ra -q"
testpaths = [
"tests", "agbenchmark",
]
asyncio_mode = "auto"
markers = [
"retrieval",
"interface",