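"""
Pytest configuration for agbenchmark.

This conftest defines the shared fixtures (config, temp_folder, check_regression,
challenge_data, mock, timer, scores) and the pytest hooks (option registration,
report generation, session start/finish, collection modification) used to run and
report on the benchmark challenges.
"""
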
import contextlib
import json
import os
import shutil
import sys
import threading
import time
from pathlib import Path  # noqa
from typing import Any, Generator

import pytest

from agbenchmark.__main__ import TEMP_FOLDER_ABS_PATH
from agbenchmark.reports.reports import (
    finalize_reports,
    generate_single_call_report,
    session_finish,
)
from agbenchmark.utils.data_types import AgentBenchmarkConfig

GLOBAL_TIMEOUT = (
    1500  # The tests will stop after 25 minutes so we can send the reports.
)

pytest_plugins = ["agbenchmark.utils.dependencies"]
collect_ignore = ["challenges"]
suite_reports: dict[str, list] = {}


def load_config_from_request(request: Any) -> AgentBenchmarkConfig:
    """
    Load the agent benchmark configuration from agbenchmark_config/config.json
    in the current working directory.

    Args:
        request (Any): The pytest request object. Accepted for parity with the
            fixtures that call this helper; it is not used to locate the config.

    Returns:
        AgentBenchmarkConfig: The loaded agent benchmark configuration, with its
            agent_benchmark_config_path attribute set to the source file.

    Raises:
        json.JSONDecodeError: If the benchmark configuration file is not valid JSON.
    """
    agent_benchmark_config_path = Path.cwd() / "agbenchmark_config" / "config.json"
    try:
        with open(agent_benchmark_config_path, "r") as f:
            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
        agent_benchmark_config.agent_benchmark_config_path = (
            agent_benchmark_config_path
        )
        return agent_benchmark_config
    except json.JSONDecodeError:
        print("Error: agbenchmark_config/config.json is not a valid JSON file.")
        raise


@pytest.fixture(scope="module")
def config(request: Any) -> Any:
    """
    Pytest fixture that loads the agent benchmark configuration.
    It is scoped to the module level, so it is invoked once per test module.

    Args:
        request (Any): The pytest request object.

    Returns:
        dict: A dictionary holding the loaded configuration under the
            "AgentBenchmarkConfig" key.

    Raises:
        json.JSONDecodeError: If the benchmark configuration file is not valid JSON.
    """
    # Delegate to load_config_from_request instead of duplicating its logic.
    config = {"AgentBenchmarkConfig": load_config_from_request(request)}
    return config
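

# Tests and fixtures that declare a `config` parameter receive the dict above and can
# read the loaded settings via config["AgentBenchmarkConfig"].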


@pytest.fixture(autouse=True)
def temp_folder() -> Generator[None, None, None]:
    """
    Pytest fixture that sets up and tears down the temporary folder for each test.
    It is applied to every test automatically via 'autouse=True'.
    The folder lets agbenchmark store the files produced by the agent so they can
    be evaluated afterwards.
    """
    # create output directory if it doesn't exist
    if not os.path.exists(TEMP_FOLDER_ABS_PATH):
        os.makedirs(TEMP_FOLDER_ABS_PATH, exist_ok=True)

    yield

    # teardown after test function completes
    if not os.getenv("KEEP_TEMP_FOLDER_FILES"):
        for filename in os.listdir(TEMP_FOLDER_ABS_PATH):
            file_path = os.path.join(TEMP_FOLDER_ABS_PATH, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print(f"Failed to delete {file_path}. Reason: {e}")
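

# Note: setting KEEP_TEMP_FOLDER_FILES to any non-empty value (e.g.
# KEEP_TEMP_FOLDER_FILES=1) skips the cleanup above, which is handy when you want to
# inspect what the agent wrote during a challenge.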


def pytest_addoption(parser: Any) -> None:
    """
    Pytest hook that registers the command-line options used by the agent benchmark
    tests. These options control how the tests behave.

    The "--mock" option runs the tests in mock mode.
    The "--host" option specifies the host to benchmark against.
    The "--category" option runs only tests of a specific category.
    The "--nc" option runs the tests without caching.
    The "--cutoff" option specifies a cutoff time for the tests.
    The "--improve" option runs only the tests that are marked for improvement.
    The "--maintain" option runs only the tests that are marked for maintenance.
    The "--explore" option runs the tests in exploration mode.
    The "--test" option runs a specific test.
    The "--no_dep" option runs the tests without dependencies.
    The "--keep-answers" option keeps the answers of the tests.

    Args:
        parser (Any): The parser object to which the command-line options are added.
    """
    # Boolean flags
    parser.addoption("--no_dep", action="store_true", default=False)
    parser.addoption("--mock", action="store_true", default=False)
    parser.addoption("--nc", action="store_true", default=False)
    parser.addoption("--improve", action="store_true", default=False)
    parser.addoption("--maintain", action="store_true", default=False)
    parser.addoption("--explore", action="store_true", default=False)
    parser.addoption("--keep-answers", action="store_true", default=False)
    # Options that take a value (registering them with "store_true" would make them
    # booleans, contradicting the descriptions above)
    parser.addoption("--host", action="store", default=None)
    parser.addoption("--cutoff", action="store", default=None)
    parser.addoption("--category", action="store", default=None)
    parser.addoption("--test", action="store", default=None)
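

# Illustrative only: when the agbenchmark CLI drives pytest, these options end up on
# the pytest command line, e.g. something like
#   pytest -s --mock --improve
# The exact arguments are assembled by agbenchmark's entry point and may differ.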


@pytest.fixture(autouse=True)
def check_regression(request: Any) -> None:
    """
    Pytest fixture that checks whether a test is a regression test.
    It is applied to every test automatically via 'autouse=True'.
    The test name and the agent benchmark configuration are taken from the request
    object, and the regression report is loaded from the path given by the
    configuration.
    With "--improve", tests that already appear in the regression report are skipped.
    With "--maintain", tests that do not appear in the regression report are skipped.

    Args:
        request (Any): The request object from which the test name and the agent
            benchmark configuration are retrieved.
    """
    test_name = request.node.parent.name
    agent_benchmark_config = load_config_from_request(request)
    with contextlib.suppress(Exception):
        # Load the regression report from the configured path.
        regression_report_path = agent_benchmark_config.get_regression_reports_path()
        with open(regression_report_path, "r") as f:
            data = json.load(f)
        challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")

        skip_string = f"Skipping {test_name} at {challenge_location}"

        # Check if the test name exists in the regression tests
        if request.config.getoption("--improve") and data.get(test_name, None):
            pytest.skip(f"{skip_string} because it's a regression test")
        elif request.config.getoption("--maintain") and not data.get(test_name, None):
            pytest.skip(f"{skip_string} because it's not a regression test")
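

# The regression report read by check_regression above is assumed to be a JSON object
# keyed by test name, e.g. (illustrative shape only):
#   {"TestSomeChallenge": {...}, "TestAnotherChallenge": {...}}
# Only the presence or absence of a test's key matters here.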


# this is to get the challenge_data from every test
@pytest.fixture(autouse=True)
def challenge_data(request: Any) -> Any:
    """
    Pytest fixture that provides the challenge data for each test.
    It is applied to every test automatically via 'autouse=True'.
    The challenge data is taken from the request object's parameters, with which the
    tests are (indirectly) parametrized.

    Args:
        request (Any): The request object from which the challenge data is retrieved.

    Returns:
        Any: The challenge data with which the test was parametrized.
    """
    return request.param


@pytest.fixture(autouse=True, scope="session")
def mock(request: Any) -> bool:
    """
    Pytest fixture that exposes the value of the "--mock" command-line option.
    It is applied once per test session via 'autouse=True' and the 'session' scope.
    The "--mock" option runs the tests in mock mode.

    Args:
        request (Any): The request object from which the "--mock" option value is
            retrieved.

    Returns:
        bool: True if the tests were started with "--mock", otherwise False.
    """
    return request.config.getoption("--mock")


@pytest.fixture(autouse=True, scope="function")
def timer(request: Any) -> Generator[None, None, None]:
    """
    Pytest fixture that times the execution of each test.
    It is applied to every test automatically via 'autouse=True' and the
    'function' scope.
    It records the current time before the test runs and, once the test function
    completes, appends the elapsed run time to the test node's user properties so it
    can be accessed later for reporting or analysis.

    Args:
        request (Any): The request object from which the test node is retrieved.

    Yields:
        None: Control is yielded back to the test function.
    """
    start_time = time.time()
    yield
    run_time = time.time() - start_time
    request.node.user_properties.append(("run_time", run_time))
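

# The ("run_time", ...) pair appended by the timer fixture above is available to
# reporting code through item.user_properties, e.g. inside pytest_runtest_makereport.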


def pytest_runtest_makereport(item: Any, call: Any) -> None:
    """
    Pytest hook that is called while a test report is being generated.
    It is used to generate and finalize the benchmark reports for each test.

    Args:
        item (Any): The test item for which the report is being generated.
        call (Any): The call object from which the test result is retrieved.
    """
    challenge_data = item.funcargs.get("challenge_data", None)

    if not challenge_data:
        # this will only happen for dummy dependency setup tests
        return

    challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")

    flags = (
        "--test" in sys.argv
        or "--maintain" in sys.argv
        or "--improve" in sys.argv
        or "--explore" in sys.argv
    )

    if call.when == "call":
        answers = getattr(item, "answers", None)
        test_name = item.nodeid.split("::")[1]
        item.test_name = test_name

        generate_single_call_report(
            item, call, challenge_data, answers, challenge_location, test_name
        )

    if call.when == "teardown":
        finalize_reports(item, challenge_data)


def timeout_monitor(start_time: float) -> None:
    """
    Monitors the total execution time of the test suite.
    It runs in a separate thread and checks every second whether the total execution
    time has exceeded the global timeout; if it has, the pytest session is terminated
    with a non-zero return code so the reports can still be sent.

    Args:
        start_time (float): The start time of the test suite, as returned by
            time.time().
    """
    while time.time() - start_time < GLOBAL_TIMEOUT:
        time.sleep(1)  # check every second

    pytest.exit("Test suite exceeded the global timeout", returncode=1)


def pytest_sessionstart(session: Any) -> None:
    """
    Pytest hook that is called at the start of the test session.
    It starts the timeout monitor in a separate thread, which checks whether the
    total execution time of the test suite has exceeded the global timeout.

    Args:
        session (Any): The pytest session object.
    """
    start_time = time.time()
    t = threading.Thread(target=timeout_monitor, args=(start_time,))
    t.daemon = True  # Daemon threads are abruptly stopped at shutdown
    t.start()


def pytest_sessionfinish(session: Any) -> None:
    """
    Pytest hook that is called at the end of the test session.
    It finalizes and saves the collected test reports via session_finish.

    Args:
        session (Any): The pytest session object.
    """
    session_finish(suite_reports)


@pytest.fixture
def scores(request: Any) -> Any:
    """
    Pytest fixture that retrieves the scores of the test class.
    The scores are read from the test class's 'scores' attribute, keyed by the test
    class name.

    Args:
        request (Any): The request object from which the test class is retrieved.

    Returns:
        Any: The scores recorded for the requesting test class, if any.
    """
    test_class_name = request.node.cls.__name__
    return request.node.cls.scores.get(test_class_name)


# this is adding the dependency marker and category markers automatically from the json
def pytest_collection_modifyitems(items: Any, config: Any) -> None:
    """
    Pytest hook that is called after test collection has been performed.
    It modifies the collected test items based on the agent benchmark configuration.
    The configuration is loaded from agbenchmark_config/config.json and the regression
    report is read from the path it specifies.
    For each collected challenge test, the 'depends' marker and the 'category' markers
    are added dynamically from the challenge data. Filtering of dependencies based on
    the regression data and the "--improve"/"--category"/"--test"/"--no_dep"/
    "--maintain" options is currently disabled (kept below as comments), so the
    dependency list is always empty.

    Args:
        items (Any): The collected test items to be modified.
        config (Any): The pytest configuration object.
    """
    agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
    try:
        with open(agent_benchmark_config_path) as f:
            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
    except json.JSONDecodeError:
        print("Error: agbenchmark_config/config.json is not a valid JSON file.")
        raise

    regression_file = agent_benchmark_config.get_regression_reports_path()
    data = (
        json.loads(open(regression_file, "r").read())
        if os.path.exists(regression_file)
        else {}
    )

    for item in items:
        # Assuming item.cls is your test class
        test_class_instance = item.cls()

        if "test_method" not in item.name:
            continue

        # Then you can access your properties
        name = item.parent.cls.__name__
        # dependencies = test_class_instance.data.dependencies

        # Filter dependencies if they exist in regression data if it's an improvement test
        # if config.getoption("--improve") or config.getoption(
        #     "--category"
        # ):
        #     dependencies = [dep for dep in dependencies if not data.get(dep, None)]
        # if (
        #     config.getoption("--test")
        #     or config.getoption("--no_dep")
        #     or config.getoption("--maintain")
        # ):
        dependencies = []

        # Add depends marker dynamically
        item.add_marker(pytest.mark.depends(on=dependencies, name=name))

        categories = test_class_instance.data.category

        # Add category marker dynamically
        for category in categories:
            item.add_marker(getattr(pytest.mark, category))
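

# Note (illustrative): because every challenge test gets its category added as a pytest
# marker, standard marker selection also works, e.g.
#   pytest -m some_category
# where "some_category" is a placeholder for a category name defined in the challenge
# data.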