Mirror of https://github.com/aljazceru/Auto-GPT.git (synced 2026-01-07)
.gitmodules (vendored, 4 changed lines)
@@ -12,8 +12,8 @@
 branch = benchmark-integration
 [submodule "agent/smol-developer"]
 path = agent/smol-developer
-url = https://github.com/merwanehamadi/developer.git
-branch = benchmark-integration
+url = https://github.com/e2b-dev/smol-developer.git
+branch = benchmarks
 [submodule "agent/SuperAGI"]
 path = agent/SuperAGI
 url = https://github.com/SilenNaihin/SuperAGI.git
agbenchmark/agent_api_interface.py (new file, 53 lines)
@@ -0,0 +1,53 @@
+import os
+import time
+from typing import Any, Dict
+
+from agent_protocol_client import AgentApi, ApiClient, Configuration, TaskRequestBody
+
+from agbenchmark.agent_interface import get_list_of_file_paths
+from agbenchmark.utils.data_types import ChallengeData
+
+
+async def run_api_agent(
+    task: ChallengeData, config: Dict[str, Any], artifacts_location: str, timeout: int
+) -> None:
+    configuration = Configuration(host=config["host"])
+    async with ApiClient(configuration) as api_client:
+        api_instance = AgentApi(api_client)
+        task_request_body = TaskRequestBody(input=task.task)
+
+        start_time = time.time()
+        response = await api_instance.create_agent_task(
+            task_request_body=task_request_body
+        )
+        task_id = response.task_id
+
+        for file in get_list_of_file_paths(artifacts_location, "artifacts_in"):
+            print(f"[{task.name}] - Copy {file.split('/')[-1]} to agent")
+            await api_instance.upload_agent_task_artifacts(task_id=task_id, file=file)
+
+        i = 1
+        while step := await api_instance.execute_agent_task_step(task_id=task_id):
+            print(f"[{task.name}] - step {step.name} ({i}. request)")
+            i += 1
+
+            if step.is_last:
+                break
+
+            if time.time() - start_time > timeout:
+                raise TimeoutError("Time limit exceeded")
+
+        artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
+        for artifact in artifacts:
+            print(f"[{task.name}] - Copy {artifact.file_name} from agent")
+
+            if artifact.relative_path:
+                folder_path = os.path.join(config["workspace"], artifact.relative_path)
+            else:
+                folder_path = os.path.join(config["workspace"])
+
+            with open(os.path.join(folder_path, artifact.file_name), "wb") as f:
+                content = await api_instance.download_agent_task_artifact(
+                    task_id=task_id, artifact_id=artifact.artifact_id
+                )
+                f.write(content)
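
run_api_agent drives a benchmark task over the Agent Protocol REST API: create the task, upload the challenge's input artifacts, execute steps until one reports is_last (raising TimeoutError past the deadline), then download the produced artifacts into the workspace. A minimal sketch of calling it directly, assuming an agent serving the protocol on localhost:8000; the FakeChallenge stand-in and all paths below are hypothetical, not from this diff:

import asyncio
from dataclasses import dataclass

from agbenchmark.agent_api_interface import run_api_agent


@dataclass
class FakeChallenge:
    # Stand-in for ChallengeData; run_api_agent only reads .task and .name.
    task: str
    name: str


# Assumed keys: run_api_agent reads config["host"] and config["workspace"].
config = {"host": "http://localhost:8000", "workspace": "workspace"}

asyncio.run(
    run_api_agent(
        FakeChallenge(task="Write 'hello' to output.txt", name="WriteFile"),
        config,
        "agbenchmark/challenges/interface/write_file",  # hypothetical artifacts dir
        timeout=60,
    )
)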
agbenchmark/agent_interface.py
@@ -4,7 +4,7 @@ import shutil
 import subprocess
 import sys
 import time
-from typing import Any, Dict
+from typing import List
 
 import psutil
 from dotenv import load_dotenv
@@ -13,33 +13,16 @@ from agbenchmark.start_benchmark import CURRENT_DIRECTORY, HOME_DIRECTORY
 
 load_dotenv()
 
-mock_test_str = os.getenv("MOCK_TEST")
-MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False
 helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS")
 HELICONE_GRAPHQL_LOGS = (
     helicone_graphql_logs.lower() == "true" if helicone_graphql_logs else False
 )
 
 
-def run_agent(
-    task: str, config: Dict[str, Any], artifacts_location: str, cutoff: int
-) -> None:
+def run_agent(task: str, timeout: int) -> None:
     """Calling to get a response"""
-    if task == "":
-        return
-    if MOCK_FLAG:
-        print("Running mock agent")
-        copy_artifacts_into_workspace(
-            config["workspace"], "artifacts_out", artifacts_location
-        )
-        return
-    entry_path = "agbenchmark.benchmarks"
-
-    timeout = cutoff
-    if "--nc" in sys.argv:
-        timeout = 100000
-    if "--cutoff" in sys.argv:
-        timeout = int(sys.argv[sys.argv.index("--cutoff") + 1])
+    entry_path = "agbenchmark.benchmarks"
 
     print(f"Running '{entry_path}' with timeout {timeout}")
 
@@ -84,19 +67,22 @@ def run_agent(
        print(f"The agent timed out")
 
 
-def copy_artifacts_into_workspace(
-    workspace: str, artifact_folder_name: str, challenge_dir_path: str
-) -> None:
+def get_list_of_file_paths(
+    challenge_dir_path: str, artifact_folder_name: str
+) -> List[str]:
     # this file is at agbenchmark\agent_interface.py
     source_dir = os.path.join(
         CURRENT_DIRECTORY, "..", challenge_dir_path, artifact_folder_name
     )
 
     # Check if source_dir exists, if not then return immediately.
     if not os.path.exists(source_dir):
-        return
+        return []
+    return [os.path.join(source_dir, file_name) for file_name in os.listdir(source_dir)]
 
-    for file_name in os.listdir(source_dir):
-        full_file_name = os.path.join(source_dir, file_name)
-        if os.path.isfile(full_file_name):
-            shutil.copy(full_file_name, workspace)
+
+def copy_artifacts_into_workspace(
+    workspace: str, artifact_folder_name: str, challenge_dir_path: str
+) -> None:
+    file_paths = get_list_of_file_paths(challenge_dir_path, artifact_folder_name)
+    for file_path in file_paths:
+        if os.path.isfile(file_path):
+            shutil.copy(file_path, workspace)
 
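
The refactor splits the old copy helper in two: get_list_of_file_paths resolves a challenge's artifact folder (returning [] when it does not exist), and copy_artifacts_into_workspace becomes a thin wrapper over it. That split is what lets the API interface above upload files one at a time while the subprocess path keeps bulk-copying. A short usage sketch, with a hypothetical challenge directory:

from agbenchmark.agent_interface import (
    copy_artifacts_into_workspace,
    get_list_of_file_paths,
)

# Hypothetical challenge directory; the helper returns [] if it does not exist.
paths = get_list_of_file_paths("agbenchmark/challenges/interface/write_file", "artifacts_in")
print(paths)

# Equivalent to the old single-purpose behavior: copy every input artifact over.
copy_artifacts_into_workspace(
    "workspace", "artifacts_in", "agbenchmark/challenges/interface/write_file"
)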
agbenchmark/conftest.py
@@ -1,6 +1,7 @@
 import json
 import os
 import shutil
+import subprocess
 import sys
 import threading
 import time
@@ -15,7 +16,7 @@ from agbenchmark.reports.reports import (
     generate_single_call_report,
     session_finish,
 )
-from agbenchmark.start_benchmark import CONFIG_PATH, get_regression_data
+from agbenchmark.start_benchmark import CONFIG_PATH, HOME_DIRECTORY, get_regression_data
 from agbenchmark.utils.data_types import SuiteConfig
 
 GLOBAL_TIMEOUT = (
@@ -245,3 +246,25 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None:
         # Add category marker dynamically
         for category in categories:
             item.add_marker(getattr(pytest.mark, category))
+
+
+@pytest.fixture(scope="session", autouse=True)
+def run_agent(request: Any) -> Any:
+    with open(CONFIG_PATH, "r") as f:
+        config = json.load(f)
+
+    if config.get("api_mode"):
+        command = [sys.executable, "-m", "agbenchmark.benchmarks"]
+        process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            universal_newlines=True,
+            cwd=HOME_DIRECTORY,
+        )
+        time.sleep(3)
+        yield
+        print(f"Terminating agent")
+        process.terminate()
+    else:
+        yield
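
When api_mode is set, this autouse session fixture boots the agent subprocess once per pytest run and terminates it afterwards; the fixed time.sleep(3) is a crude readiness wait. A hedged sketch of a readiness poll that could stand in for the sleep (this helper is illustrative only, not part of the diff):

import time
import urllib.error
import urllib.request


def wait_until_ready(host: str, deadline_s: float = 30.0) -> None:
    # Poll the agent's HTTP endpoint until it answers, instead of sleeping a
    # fixed 3 seconds. Any HTTP response (even an error status) means it is up.
    start = time.time()
    while time.time() - start < deadline_s:
        try:
            urllib.request.urlopen(host, timeout=1)
            return
        except urllib.error.HTTPError:
            return  # the server responded, so it is ready
        except OSError:
            time.sleep(0.2)  # not listening yet; retry
    raise TimeoutError(f"agent API at {host} never came up")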
agbenchmark/generate_test.py
@@ -97,7 +97,7 @@ def create_single_test(
     )
 
     # Define test method within the dynamically created class
-    def test_method(self, config: Dict[str, Any], request) -> None:  # type: ignore
+    async def test_method(self, config: Dict[str, Any], request) -> None:  # type: ignore
         # create a random number between 0 and 1
         test_name = self.data.name
 
@@ -121,7 +121,14 @@ def create_single_test(
             HeliconeLockManager.write_custom_property("challenge", self.data.name)
 
         cutoff = self.data.cutoff or 60
-        self.setup_challenge(config, cutoff)
+
+        timeout = cutoff
+        if "--nc" in sys.argv:
+            timeout = 100000
+        if "--cutoff" in sys.argv:
+            timeout = int(sys.argv[sys.argv.index("--cutoff") + 1])
+
+        await self.setup_challenge(config, timeout)
 
         scores = self.get_scores(config)
         request.node.scores = scores  # store scores in request.node
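
Timeout resolution thus moves out of run_agent and into the generated test itself: the challenge's own cutoff is the default, --nc effectively disables the limit, and --cutoff overrides both. A runnable restatement of that precedence (the CLI values below are hypothetical):

import sys


def resolve_timeout(cutoff: int) -> int:
    # Mirrors the precedence in test_method above: --cutoff beats --nc,
    # which beats the per-challenge cutoff.
    timeout = cutoff
    if "--nc" in sys.argv:
        timeout = 100000
    if "--cutoff" in sys.argv:
        timeout = int(sys.argv[sys.argv.index("--cutoff") + 1])
    return timeout


sys.argv = ["agbenchmark", "--nc"]            # hypothetical invocation
assert resolve_timeout(60) == 100000
sys.argv = ["agbenchmark", "--cutoff", "120"]
assert resolve_timeout(60) == 120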
agbenchmark/reports/reports.py
@@ -4,10 +4,10 @@ import sys
 from pathlib import Path
 from typing import Any, Dict
 
-from agbenchmark.agent_interface import MOCK_FLAG
 from agbenchmark.reports.ReportManager import ReportManager
 from agbenchmark.start_benchmark import (
     CONFIG_PATH,
+    MOCK_FLAG,
     REGRESSION_TESTS_PATH,
     REPORTS_PATH,
     SUCCESS_RATE_PATH,
agbenchmark/start_benchmark.py
@@ -22,6 +22,7 @@ if os.environ.get("HELICONE_API_KEY"):
     HeliconeLockManager.write_custom_property(
         "benchmark_start_time", BENCHMARK_START_TIME
     )
+MOCK_FLAG = os.getenv("MOCK_TEST", "").lower() == "true"
 
 
 (
@@ -141,21 +142,29 @@ def start(
         )
         return 1
 
-    if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
+    if os.path.exists(CONFIG_PATH) and os.stat(CONFIG_PATH).st_size:
+        # If the configuration file exists and is not empty, load it
+        with open(CONFIG_PATH, "r") as f:
+            config = json.load(f)
+    else:
+        config = {}
+
+    if not config.get("workspace"):
         config["workspace"] = click.prompt(
             "Please enter a new workspace path",
             default=os.path.join("workspace"),
             show_default=True,
         )
 
-        with open(CONFIG_PATH, "w") as f:
-            json.dump(config, f)
-    else:
-        # If the configuration file exists and is not empty, load it
-        with open(CONFIG_PATH, "r") as f:
-            config = json.load(f)
+    if config.get("api_mode") and not config.get("host"):
+        config["host"] = click.prompt(
+            "Please enter the Agent API host address",
+            default="http://localhost:8000",
+            show_default=True,
+        )
+
+    with open(CONFIG_PATH, "w") as f:
+        json.dump(config, f)
 
     print("Current configuration:")
     for key, value in config.items():
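
The config bootstrap is restructured: load the file if it exists, prompt only for keys that are missing (workspace always, host only when api_mode is on), and always write the merged result back, so adding the API settings did not need another branch. Assuming api_mode was set beforehand and both defaults are accepted, the resulting config.json would look roughly like this (illustrative values):

# Approximate contents of config.json after accepting both prompt defaults;
# "api_mode" itself is assumed to have been set by hand or by tooling.
expected_config = {
    "workspace": "workspace",
    "api_mode": True,
    "host": "http://localhost:8000",
}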
agbenchmark/utils/challenge.py
@@ -9,8 +9,8 @@ from typing import Any, Dict, List
 import openai
 import pytest
 
-from agbenchmark.agent_interface import MOCK_FLAG
-from agbenchmark.start_benchmark import OPTIONAL_CATEGORIES
+from agbenchmark.agent_api_interface import run_api_agent
+from agbenchmark.start_benchmark import MOCK_FLAG, OPTIONAL_CATEGORIES
 from agbenchmark.utils.data_types import ChallengeData, Ground
 from agbenchmark.utils.prompts import (
     END_PROMPT,
@@ -46,7 +46,10 @@ class Challenge(ABC):
     def dependencies(self) -> list:
         return self.data.dependencies
 
-    def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
+    async def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
         if not self.task:
             return
 
+        from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent
+
         copy_artifacts_into_workspace(
@@ -58,7 +61,15 @@ class Challenge(ABC):
         )
         print(f"\033[1;30mTask: {self.task}\033[0m")
 
-        run_agent(self.task, config, self.ARTIFACTS_LOCATION, cutoff)
+        if MOCK_FLAG:
+            print("Running mock agent")
+            copy_artifacts_into_workspace(
+                config["workspace"], "artifacts_out", self.ARTIFACTS_LOCATION
+            )
+        elif config.get("api_mode"):
+            await run_api_agent(self.data, config, self.ARTIFACTS_LOCATION, cutoff)
+        else:
+            run_agent(self.task, cutoff)
 
         # hidden files are added after the agent runs. Hidden files can be python test files.
         # We copy them in the workspace to make it easy to import the code produced by the agent
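
setup_challenge is now the single dispatch point between the three run modes: mock mode fakes success by copying the expected outputs, api_mode talks to a live Agent Protocol server, and everything else falls back to the subprocess runner. A runnable restatement of that precedence (the returned strings are descriptive only):

from typing import Any, Dict


def pick_runner(mock_flag: bool, config: Dict[str, Any]) -> str:
    # Mirrors the dispatch order in setup_challenge above: mock wins,
    # then api_mode, then the legacy subprocess runner.
    if mock_flag:
        return "mock: copy artifacts_out into the workspace"
    if config.get("api_mode"):
        return "api: run_api_agent over the Agent Protocol"
    return "subprocess: run_agent(task, cutoff)"


assert pick_runner(True, {"api_mode": True}) == "mock: copy artifacts_out into the workspace"
assert pick_runner(False, {"api_mode": True}) == "api: run_api_agent over the Agent Protocol"
assert pick_runner(False, {}) == "subprocess: run_agent(task, cutoff)"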
poetry.lock (generated, 50 changed lines)
@@ -1,5 +1,17 @@
 # This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
 
+[[package]]
+name = "aenum"
+version = "3.1.15"
+description = "Advanced Enumerations (compatible with Python's stdlib Enum), NamedTuples, and NamedConstants"
+optional = false
+python-versions = "*"
+files = [
+    {file = "aenum-3.1.15-py2-none-any.whl", hash = "sha256:27b1710b9d084de6e2e695dab78fe9f269de924b51ae2850170ee7e1ca6288a5"},
+    {file = "aenum-3.1.15-py3-none-any.whl", hash = "sha256:e0dfaeea4c2bd362144b87377e2c61d91958c5ed0b4daf89cb6f45ae23af6288"},
+    {file = "aenum-3.1.15.tar.gz", hash = "sha256:8cbd76cd18c4f870ff39b24284d3ea028fbe8731a58df3aa581e434c575b9559"},
+]
+
 [[package]]
 name = "agent-protocol"
 version = "0.2.3"
@@ -21,6 +33,24 @@ pytest = ">=7.0.0,<8.0.0"
 python-multipart = ">=0.0.6,<0.0.7"
 requests = ">=2.31.0,<3.0.0"
 
+[[package]]
+name = "agent-protocol-client"
+version = "0.2.2"
+description = "Agent Communication Protocol Client"
+optional = false
+python-versions = ">=3.10,<4.0"
+files = [
+    {file = "agent_protocol_client-0.2.2-py3-none-any.whl", hash = "sha256:e46f506f74a20d7b4bec65c852598e889a05789f803fe384b758972049b23c96"},
+    {file = "agent_protocol_client-0.2.2.tar.gz", hash = "sha256:536197eee29cb24504458b8401c2a4cf22b8e1e5ad0924af05fb1f76754b09c2"},
+]
+
+[package.dependencies]
+aenum = ">=3.1.11"
+aiohttp = ">=3.8.4"
+pydantic = ">=1.10.5,<2.0.0"
+python-dateutil = ">=2.8.2"
+urllib3 = ">=1.25.3"
+
 [[package]]
 name = "aiofiles"
 version = "23.1.0"
@@ -2149,6 +2179,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
 [package.extras]
 testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
 
+[[package]]
+name = "pytest-asyncio"
+version = "0.21.1"
+description = "Pytest support for asyncio"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytest-asyncio-0.21.1.tar.gz", hash = "sha256:40a7eae6dded22c7b604986855ea48400ab15b069ae38116e8c01238e9eeb64d"},
+    {file = "pytest_asyncio-0.21.1-py3-none-any.whl", hash = "sha256:8666c1c8ac02631d7c51ba282e0c69a8a452b211ffedf2599099845da5c5c37b"},
+]
+
+[package.dependencies]
+pytest = ">=7.0.0"
+
+[package.extras]
+docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"]
+testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
+
 [[package]]
 name = "python-dateutil"
 version = "2.8.2"
@@ -2716,4 +2764,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "855fe892a4401b03bbda926df20c140ea5143574ff5d04b6168472a2d162b9dd"
+content-hash = "8f1d3f47e03dd229fec21c8162cebecf871b7f7c093bd32ff5c2f13ecc69b4b6"
pyproject.toml
@@ -27,6 +27,7 @@ colorama = "^0.4.6"
 pyvis = "^0.3.2"
 selenium = "^4.11.2"
 agent-protocol = "^0.2.3"
+agent-protocol-client = "^0.2.2"
 
 [tool.poetry.group.dev.dependencies]
 flake8 = "^3.9.2"
@@ -38,6 +39,7 @@ pandas = "^2.0.3"
 gspread = "^5.10.0"
 oauth2client = "^4.1.3"
 pre-commit = "^3.3.3"
+pytest-asyncio = "^0.21.1"
 
 [build-system]
 requires = ["poetry-core"]
@@ -49,6 +51,7 @@ addopts = "-ra -q"
 testpaths = [
     "tests", "agbenchmark",
 ]
+asyncio_mode = "auto"
 markers = [
     "retrieval",
     "interface",
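
The asyncio_mode = "auto" setting lets pytest-asyncio collect and run the new async test_method without per-test markers. A minimal sketch of a test it would now pick up (hypothetical test, not from this diff):

import asyncio


# With asyncio_mode = "auto" in pyproject.toml, pytest-asyncio runs this
# coroutine test directly; no @pytest.mark.asyncio decorator is needed.
async def test_agent_roundtrip() -> None:
    await asyncio.sleep(0)  # stand-in for awaiting setup_challenge(...)
    assert True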