Fixing benchmarks
benchmark/benchmark/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
# import pydevd_pycharm

# pydevd_pycharm.settrace(
#     "localhost", port=9739, stdoutToServer=True, stderrToServer=True
# )
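The commented-out block above is a PyCharm remote-debugging hook. A hedged sketch of how it could be made opt-in via an environment variable (the variable name AGBENCHMARK_DEBUG is an assumption for illustration, not something this commit defines):

# Hypothetical opt-in variant of the debug hook; the env var name is illustrative.
import os

if os.environ.get("AGBENCHMARK_DEBUG"):
    import pydevd_pycharm

    # Attach to a PyCharm debug server listening on localhost:9739
    pydevd_pycharm.settrace(
        "localhost", port=9739, stdoutToServer=True, stderrToServer=True
    )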
benchmark/benchmark/__main__.py (new file, 274 lines)
@@ -0,0 +1,274 @@
import glob
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

import toml

import click
import pytest
from helicone.lock import HeliconeLockManager

from benchmark.utils.data_types import AgentBenchmarkConfig

BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")

if os.environ.get("HELICONE_API_KEY"):
    HeliconeLockManager.write_custom_property(
        "benchmark_start_time", BENCHMARK_START_TIME
    )

with open(
    Path(__file__).resolve().parent / "challenges" / "optional_categories.json"
) as f:
    OPTIONAL_CATEGORIES = json.load(f)["optional_categories"]


def get_unique_categories() -> set[str]:
    """Find all data.json files in the directory relative to this file and its
    subdirectories, read the "category" field from each file, and return a set
    of unique categories."""
    categories = set()

    # Get the directory of this file and use it as the base for the glob pattern
    this_dir = os.path.dirname(os.path.abspath(__file__))

    glob_path = os.path.join(this_dir, "./challenges/**/data.json")
    for data_file in glob.glob(glob_path, recursive=True):
        with open(data_file, "r") as f:
            try:
                data = json.load(f)
                categories.update(data.get("category", []))
            except json.JSONDecodeError:
                print(f"Error: {data_file} is not a valid JSON file.")
                continue
            except IOError:
                print(f"IOError: file could not be read: {data_file}")
                continue

    return categories


def run_benchmark(
    agent_benchmark_config_path: str,
    maintain: bool = False,
    improve: bool = False,
    explore: bool = False,
    mock: bool = False,
    no_dep: bool = False,
    nc: bool = False,
    category: Optional[list[str]] = None,
    skip_category: Optional[list[str]] = None,
    test: Optional[str] = None,
    suite: Optional[str] = None,
    cutoff: Optional[int] = None,
    server: bool = False,
) -> int:
    """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
    # Check that the configuration file exists and is valid JSON
    try:
        with open(agent_benchmark_config_path, "r") as f:
            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
    except json.JSONDecodeError:
        print("Error: benchmark_config.json is not a valid JSON file.")
        return 1

    if maintain and improve and explore:
        print(
            "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
        )
        return 1

    if test and (category or skip_category or maintain or improve or suite or explore):
        print(
            "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
        )
        return 1

    # TODO: test and ensure that this functionality works before removing
    # change elif suite below if removing
    if suite and (category or skip_category or maintain or improve or explore):
        print(
            "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite."
        )
        return 1

    assert not (agent_benchmark_config.api_mode and not agent_benchmark_config.host), \
        "Error: host needs to be added to the config if api_mode is set to True."

    print("Current configuration:")
    for key, value in vars(agent_benchmark_config).items():
        print(f"{key}: {value}")

    pytest_args = ["-vs"]
    pytest_args.extend(["--agent_config_path", agent_benchmark_config_path])
    if test:
        print("Running specific test:", test)
        pytest_args.extend(["-k", test, "--test"])
    elif suite:
        print("Running specific suite:", suite)
        pytest_args.extend(["--suite"])
    else:
        # Categories that are used in the challenges
        categories = get_unique_categories()
        if category:
            invalid_categories = set(category) - categories
            assert (
                not invalid_categories
            ), f"Invalid categories: {invalid_categories}. Valid categories are: {categories}"

        if category:
            categories_to_run = set(category)
            if skip_category:
                categories_to_run = categories_to_run.difference(set(skip_category))
                assert categories_to_run, "Error: You can't skip all categories"
            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
            print("Running tests of category:", categories_to_run)
        elif skip_category:
            categories_to_run = categories - set(skip_category)
            assert categories_to_run, "Error: You can't skip all categories"
            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
            print("Running tests of category:", categories_to_run)
        else:
            print("Running all categories")

        if maintain:
            print("Running only regression tests")
            pytest_args.append("--maintain")
        elif improve:
            print("Running only non-regression tests")
            pytest_args.append("--improve")
        elif explore:
            print("Only attempt challenges that have never been beaten")
            pytest_args.append("--explore")

    if mock:
        pytest_args.append("--mock")

    if no_dep:
        pytest_args.append("--no_dep")

    if nc and cutoff:
        print(
            "Error: You can't use both --nc and --cutoff at the same time. Please choose one."
        )
        return 1

    if nc:
        pytest_args.append("--nc")
    if cutoff:
        pytest_args.append("--cutoff")
        print(f"Setting cutoff override to {cutoff} seconds.")
    current_dir = Path(__file__).resolve().parent
    print(f"Current directory: {current_dir}")
    pytest_args.extend((str(current_dir), "--cache-clear"))
    return pytest.main(pytest_args)


@click.group()
def cli() -> None:
    pass


@cli.command()
@click.option("--backend", is_flag=True, help="If it's being run from the cli")
@click.option("-c", "--category", multiple=True, help="Specific category to run")
@click.option(
    "-s",
    "--skip-category",
    multiple=True,
    help="Skip tests from this category",
)
@click.option("--test", help="Specific test to run")
@click.option("--maintain", is_flag=True, help="Run only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option(
    "--explore",
    is_flag=True,
    help="Only attempt challenges that have never been beaten",
)
@click.option("--mock", is_flag=True, help="Run with mock")
@click.option("--suite", help="Run a suite of related tests")
@click.option(
    "--no_dep",
    is_flag=True,
    help="Run without dependencies (can be useful for a suite run)",
)
@click.option("--nc", is_flag=True, help="Run without cutoff")
@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
@click.option(
    "--agent-config",
    type=click.Path(exists=True),
    help="Path to the agent's benchmark_config.json file",
    required=True,
)
def start(
    maintain: bool,
    improve: bool,
    explore: bool,
    mock: bool,
    no_dep: bool,
    nc: bool,
    agent_config: click.Path,
    category: Optional[list[str]] = None,
    skip_category: Optional[list[str]] = None,
    test: Optional[str] = None,
    suite: Optional[str] = None,
    cutoff: Optional[int] = None,
    backend: Optional[bool] = False,
) -> Any:
    # Redirect stdout if backend is True
    original_stdout = sys.stdout  # Save the original standard output
    exit_code = None

    assert "benchmark_config.json" in agent_config, "benchmark_config.json must be provided"

    if backend:
        with open("backend/backend_stdout.txt", "w") as f:
            sys.stdout = f
            exit_code = run_benchmark(
                agent_benchmark_config_path=agent_config,
                maintain=maintain,
                improve=improve,
                explore=explore,
                mock=mock,
                no_dep=no_dep,
                nc=nc,
                category=category,
                skip_category=skip_category,
                test=test,
                suite=suite,
                cutoff=cutoff,
            )

        sys.stdout = original_stdout

    else:
        exit_code = run_benchmark(
            agent_benchmark_config_path=agent_config,
            maintain=maintain,
            improve=improve,
            explore=explore,
            mock=mock,
            no_dep=no_dep,
            nc=nc,
            category=category,
            skip_category=skip_category,
            test=test,
            suite=suite,
            cutoff=cutoff,
        )

    sys.exit(exit_code)


@cli.command()
def version():
    """Print the version of the benchmark tool."""
    current_directory = Path(__file__).resolve().parent
    version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"]["version"]
    print(f"Benchmark Tool Version {version}")


if __name__ == "__main__":
    cli()
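Taken together, run_benchmark validates the flag combinations, loads the agent's benchmark_config.json into an AgentBenchmarkConfig, assembles pytest arguments, and hands them to pytest.main. A minimal sketch of driving it without the CLI wrapper, assuming the benchmark package is installed; the config path is hypothetical, and only "api_mode" and "host" are known from the attribute accesses above, so AgentBenchmarkConfig may well require additional fields:

# Illustrative sketch only, not code from this commit.
import json
from pathlib import Path

from benchmark.__main__ import run_benchmark

config_path = Path("agent/benchmark_config.json")  # hypothetical location
config_path.parent.mkdir(parents=True, exist_ok=True)
config_path.write_text(json.dumps({"api_mode": False, "host": None}))

# Roughly equivalent to:
#   python -m benchmark start --agent-config agent/benchmark_config.json --mock
exit_code = run_benchmark(
    agent_benchmark_config_path=str(config_path),
    mock=True,
)
print("benchmark exit code:", exit_code)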
@@ -5,8 +5,8 @@ from typing import Any, Dict, Optional
 
 from agent_protocol_client import AgentApi, ApiClient, Configuration, TaskRequestBody
 
-from agbenchmark.agent_interface import get_list_of_file_paths
-from agbenchmark.utils.data_types import ChallengeData
+from benchmark.agent_interface import get_list_of_file_paths
+from benchmark.utils.data_types import ChallengeData
 
 
 async def run_api_agent(
@@ -12,7 +12,6 @@ from typing import Any, List
 import psutil
 from dotenv import load_dotenv
 
-import agbenchmark.start_benchmark
 
 load_dotenv()
 
@@ -77,7 +76,7 @@ def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
 def run_agent(task: str, timeout: int) -> None:
     """Calling to get a response"""
 
-    entry_path = "agbenchmark.benchmarks"
+    entry_path = "benchmark.benchmarks"
 
     print(f"Running '{entry_path}' with timeout {timeout}")
 
@@ -87,7 +86,7 @@ def run_agent(task: str, timeout: int) -> None:
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
         universal_newlines=True,
-        cwd=agbenchmark.start_benchmark.HOME_DIRECTORY,
+        cwd=benchmark.start_benchmark.HOME_DIRECTORY,
         bufsize=1,
     )
 
@@ -109,7 +108,7 @@ def get_list_of_file_paths(
 ) -> List[str]:
     # this file is at agbenchmark\agent_interface.py
     source_dir = os.path.join(
-        agbenchmark.start_benchmark.CURRENT_DIRECTORY,
+        benchmark.start_benchmark.CURRENT_DIRECTORY,
         "..",
         challenge_dir_path,
         artifact_folder_name,
Some files were not shown because too many files have changed in this diff.
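Every hunk above applies the same rename, agbenchmark to benchmark, to imports and module references. For downstream code that has to run against either package name, one possible compatibility shim (illustrative only, not part of this commit):

# Prefer the new package name, fall back to the old one if it isn't installed.
try:
    from benchmark.utils.data_types import ChallengeData
except ImportError:  # older installs still expose the agbenchmark package
    from agbenchmark.utils.data_types import ChallengeData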