Auto-GPT/agbenchmark/conftest.py

import json
import os
import shutil
import sys
import time
from pathlib import Path  # noqa
from typing import Any, Dict, Generator

import pytest

from agbenchmark.ReportManager import ReportManager
from agbenchmark.start_benchmark import (
    CONFIG_PATH,
    INFO_TESTS_PATH,
    REGRESSION_TESTS_PATH,
    get_regression_data,
)
from agbenchmark.utils import AGENT_NAME, calculate_success_percentage


def resolve_workspace(workspace: str) -> str:
    if workspace.startswith("${") and workspace.endswith("}"):
        # Extract the string inside ${...}
        path_expr = workspace[2:-1]

        # Check if it starts with "os.path.join"
        if path_expr.strip().startswith("os.path.join"):
            # Evaluate the path string
            path_value = eval(path_expr)

            # Replace the original string with the evaluated result
            return path_value
        else:
            raise ValueError("Invalid workspace path expression.")
    else:
        return os.path.abspath(Path(os.getcwd()) / workspace)
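
# Illustrative sketch (not part of the original file): given a config value of
# "${os.path.join(os.getcwd(), 'agbenchmark', 'workspace')}", resolve_workspace()
# strips the ${...} wrapper and eval()s the os.path.join expression, yielding an
# absolute path such as /home/user/Auto-GPT/agbenchmark/workspace. A plain value
# like "workspace" is instead resolved relative to the current working directory.
# The example paths above are hypothetical.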
@pytest.fixture(scope="module")
def config(request: Any) -> None:
print(f"Config file: {CONFIG_PATH}")
with open(CONFIG_PATH, "r") as f:
config = json.load(f)
if isinstance(config["workspace"], str):
config["workspace"] = resolve_workspace(config["workspace"])
else: # it's a input output dict
config["workspace"]["input"] = resolve_workspace(config["workspace"]["input"])
config["workspace"]["output"] = resolve_workspace(config["workspace"]["output"])
return config
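
# Minimal sketch of the "workspace" entry this fixture expects in config.json
# (assumed shape based on the parsing above; other config keys are omitted):
#
#   {"workspace": "workspace"}                                       # single directory
#   {"workspace": {"input": "input_dir", "output": "output_dir"}}    # input/output split
#
# Either form may also use a "${os.path.join(...)}" expression, which is expanded
# by resolve_workspace().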


@pytest.fixture(autouse=True)
def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
    output_path = config["workspace"]

    # check whether it's an input/output workspace
    if not isinstance(config["workspace"], str):
        output_path = config["workspace"]["output"]
        if not os.path.exists(config["workspace"]["input"]):
            os.makedirs(config["workspace"]["input"], exist_ok=True)

    # create output directory if it doesn't exist
    if not os.path.exists(output_path):
        os.makedirs(output_path, exist_ok=True)

    yield config["workspace"]
    # teardown after test function completes
    for filename in os.listdir(output_path):
        file_path = os.path.join(output_path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")


def pytest_addoption(parser: Any) -> None:
    parser.addoption("--mock", action="store_true", default=False)
    parser.addoption("--nc", action="store_true", default=False)
    parser.addoption("--improve", action="store_true", default=False)
    parser.addoption("--maintain", action="store_true", default=False)
    parser.addoption("--test", action="store_true", default=None)


@pytest.fixture(autouse=True)
def check_regression(request: Any) -> None:
    test_name = request.node.parent.name
    data = get_regression_data()

    # Get the true location of the test
    challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
    skip_string = f"Skipping {test_name} at {challenge_location}"

    # Check if the test name exists in the regression tests
    if request.config.getoption("--improve") and data.get(test_name, None):
        pytest.skip(f"{skip_string} because it's a regression test")
    elif request.config.getoption("--maintain") and not data.get(test_name, None):
        pytest.skip(f"{skip_string} because it's not a regression test")


# this is to get the challenge_data from every test
@pytest.fixture(autouse=True)
def challenge_data(request: Any) -> Any:
    return request.param
@pytest.fixture(autouse=True, scope="session")
def mock(request: Any) -> None:
return request.config.getoption("--mock")
@pytest.fixture(autouse=True, scope="function")
def timer(request: Any) -> Any:
start_time = time.time()
yield
run_time = time.time() - start_time
request.node.user_properties.append(("run_time", run_time))


# tests that consistently pass are considered regression tests
regression_manager = ReportManager(REGRESSION_TESTS_PATH)

# user-facing reporting information
info_manager = ReportManager(INFO_TESTS_PATH)

INTERNAL_LOGS_PATH = Path(__file__).resolve().parent / "reports"

# internal JSON log, used in place of a db, to track each test's pass/fail history
internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json"))


def pytest_runtest_makereport(item: Any, call: Any) -> None:
    challenge_data = item.funcargs.get("challenge_data", None)

    if call.when == "call":
        difficulty = (
            challenge_data["info"]["difficulty"] if challenge_data else "unknown"
        )
        dependencies = challenge_data["dependencies"] if challenge_data else []
        # Extract the challenge_location from the class
        challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
        test_name = item.nodeid.split("::")[1]
        item.test_name = test_name

        test_details = {
            "difficulty": difficulty,
            "dependencies": dependencies,
            "data_path": challenge_location,
        }

        info_details: Any = {
            "data_path": challenge_location,
            "is_regression": False,
            "task": challenge_data["task"],
            "answer": challenge_data["ground"]["answer"],
            "description": challenge_data["info"]["description"],
            "metrics": {
                "difficulty": difficulty,
                "success": False,
            },
        }

        mock = "--mock" in sys.argv  # Check if --mock is in sys.argv

        if call.excinfo is None:
            info_details["metrics"]["success"] = True
        else:
            if not mock:  # don't remove if it's a mock test
                regression_manager.remove_test(test_name)
            info_details["metrics"]["fail_reason"] = str(call.excinfo.value)

        prev_test_results: list[bool]
        agent_tests: dict[str, list[bool]] = {}

        # if the structure is nested inside of the agent name
        if AGENT_NAME:
            agent_tests = internal_info.tests.get(AGENT_NAME, {})

        if agent_tests:
            prev_test_results = agent_tests.get(test_name, [])
        else:
            prev_test_results = internal_info.tests.get(test_name, [])

        if not mock:
            # only record the result if it's an actual (non-mock) run
            prev_test_results.append(info_details["metrics"]["success"])
            internal_info.add_test(test_name, prev_test_results, AGENT_NAME)

            info_details["metrics"]["success_%"] = calculate_success_percentage(
                prev_test_results
            )
        else:
            # success rate over previously recorded non-mock runs
            info_details["metrics"][
                "non_mock_success_%"
            ] = calculate_success_percentage(prev_test_results)

        if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]:
            # if the last 3 runs were successful, add to the regression tests
            info_details["is_regression"] = True
            regression_manager.add_test(test_name, test_details)

        # user-facing reporting
        item.info_details = info_details

    if call.when == "teardown":
        run_time = dict(item.user_properties).get("run_time")

        info_details = getattr(item, "info_details", {})
        test_name = getattr(item, "test_name", "")

        if info_details and test_name:
            if run_time:
                info_details["metrics"][
                    "run_time"
                ] = f"{str(round(run_time, 3))} seconds"

                info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]

            info_manager.add_test(test_name, info_details)
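
# Rough sketch of one entry this hook accumulates in the user-facing info report
# (field names taken from info_details above; the concrete values are hypothetical):
#
#   "TestWriteFile": {
#       "data_path": "agbenchmark/challenges/interface/write_file",
#       "is_regression": true,
#       "task": "<challenge task prompt>",
#       "answer": "<ground-truth answer>",
#       "description": "<challenge description>",
#       "metrics": {
#           "difficulty": "interface",
#           "success": true,
#           "success_%": 100.0,
#           "run_time": "1.234 seconds"
#       },
#       "reached_cutoff": false
#   }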


def pytest_sessionfinish(session: Any) -> None:
    """Called at the end of the session to save regression tests and info"""
    with open(CONFIG_PATH, "r") as f:
        config = json.load(f)

    internal_info.save()
    info_manager.end_info_report(config)
    regression_manager.save()


# this is adding the dependency marker and category markers automatically from the json
def pytest_collection_modifyitems(items: Any, config: Any) -> None:
    data = get_regression_data()

    for item in items:
        # Assuming item.cls is your test class
        test_class_instance = item.cls()

        # Then you can access your properties
        name = item.parent.cls.__name__
        dependencies = test_class_instance.data.dependencies

        # When running with --improve, drop dependencies that are already regression tests
        if config.getoption("--improve"):
            dependencies = [dep for dep in dependencies if not data.get(dep, None)]
        elif config.getoption("--test"):
            dependencies = []

        categories = test_class_instance.data.category

        # Add depends marker dynamically
        item.add_marker(pytest.mark.depends(on=dependencies, name=name))

        # Add category marker dynamically
        for category in categories:
            item.add_marker(getattr(pytest.mark, category))
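
# Net effect (illustrative, not from the original file): each collected test ends up
# marked roughly as if it had been written with
#
#   @pytest.mark.depends(on=["TestSomeDependency"], name="TestExample")
#   @pytest.mark.retrieval
#
# where the dependency and category names are hypothetical. Category markers allow
# selecting a subset of challenges with standard pytest selection, e.g.
#
#   pytest -m retrieval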