Auto-GPT/agbenchmark/conftest.py

import json
import os
import shutil
import sys
import time
from pathlib import Path  # noqa
from typing import Any, Dict, Generator

import pytest

from agbenchmark.ReportManager import ReportManager
from agbenchmark.start_benchmark import (
    CONFIG_PATH,
    INFO_TESTS_PATH,
    REGRESSION_TESTS_PATH,
    get_regression_data,
)
from agbenchmark.utils import AGENT_NAME, calculate_success_percentage


def resolve_workspace(workspace: str) -> str:
    if workspace.startswith("${") and workspace.endswith("}"):
        # Extract the string inside ${...}
        path_expr = workspace[2:-1]

        # Check if it starts with "os.path.join"
        if path_expr.strip().startswith("os.path.join"):
            # Evaluate the path string
            path_value = eval(path_expr)

            # Replace the original string with the evaluated result
            return path_value
        else:
            raise ValueError("Invalid workspace path expression.")
    else:
        return os.path.abspath(Path(os.getcwd()) / workspace)
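
# Illustrative sketch (not part of the original file): given a config value of
# "${os.path.join(os.getcwd(), 'agbenchmark', 'workspace')}", resolve_workspace()
# strips the ${...} wrapper and eval()s the os.path.join expression, yielding an
# absolute path such as /home/user/Auto-GPT/agbenchmark/workspace. A plain value
# like "workspace" is instead resolved relative to the current working directory.
# The example paths above are hypothetical.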
@pytest.fixture(scope="module")
def config(request: Any) -> None:
print(f"Config file: {CONFIG_PATH}")
with open(CONFIG_PATH, "r") as f:
config = json.load(f)
if isinstance(config["workspace"], str):
config["workspace"] = resolve_workspace(config["workspace"])
else: # it's a input output dict
config["workspace"]["input"] = resolve_workspace(config["workspace"]["input"])
config["workspace"]["output"] = resolve_workspace(config["workspace"]["output"])
return config
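
# Minimal sketch of the "workspace" entry this fixture expects in config.json
# (assumed shape based on the parsing above; other config keys are omitted):
#
#   {"workspace": "workspace"}                                       # single directory
#   {"workspace": {"input": "input_dir", "output": "output_dir"}}    # input/output split
#
# Either form may also use a "${os.path.join(...)}" expression, which is expanded
# by resolve_workspace().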


@pytest.fixture(autouse=True)
def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
    output_path = config["workspace"]

    # check whether it's an input/output workspace
    if not isinstance(config["workspace"], str):
        output_path = config["workspace"]["output"]
        if not os.path.exists(config["workspace"]["input"]):
            os.makedirs(config["workspace"]["input"], exist_ok=True)

    # create output directory if it doesn't exist
    if not os.path.exists(output_path):
        os.makedirs(output_path, exist_ok=True)

    yield config["workspace"]
    # teardown after test function completes
    for filename in os.listdir(output_path):
        file_path = os.path.join(output_path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")


def pytest_addoption(parser: Any) -> None:
    parser.addoption("--mock", action="store_true", default=False)
    parser.addoption("--nc", action="store_true", default=False)
    parser.addoption("--improve", action="store_true", default=False)
    parser.addoption("--maintain", action="store_true", default=False)
    parser.addoption("--test", action="store_true", default=None)


@pytest.fixture(autouse=True)
def check_regression(request: Any) -> None:
    test_name = request.node.parent.name
    data = get_regression_data()

    # Get the true location of the test
    challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
    skip_string = f"Skipping {test_name} at {challenge_location}"

    # Check if the test name exists in the regression tests
    if request.config.getoption("--improve") and data.get(test_name, None):
        pytest.skip(f"{skip_string} because it's a regression test")
    elif request.config.getoption("--maintain") and not data.get(test_name, None):
        pytest.skip(f"{skip_string} because it's not a regression test")


# this is to get the challenge_data from every test
@pytest.fixture(autouse=True)
def challenge_data(request: Any) -> Any:
    return request.param
@pytest.fixture(autouse=True, scope="session")
def mock(request: Any) -> None:
return request.config.getoption("--mock")
@pytest.fixture(autouse=True, scope="function")
def timer(request: Any) -> Any:
start_time = time.time()
yield
run_time = time.time() - start_time
request.node.user_properties.append(("run_time", run_time))


# tests that consistently pass are considered regression tests
regression_manager = ReportManager(REGRESSION_TESTS_PATH)

# user-facing reporting information
info_manager = ReportManager(INFO_TESTS_PATH)

INTERNAL_LOGS_PATH = Path(__file__).resolve().parent / "reports"

# internal JSON log, used in place of a db, to track each test's pass/fail history
internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json"))


def pytest_runtest_makereport(item: Any, call: Any) -> None:
    challenge_data = item.funcargs.get("challenge_data", None)

    if call.when == "call":
        difficulty = (
            challenge_data["info"]["difficulty"] if challenge_data else "unknown"
        )
        dependencies = challenge_data["dependencies"] if challenge_data else []
        # Extract the challenge_location from the class
        challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
        test_name = item.nodeid.split("::")[1]
        item.test_name = test_name

        test_details = {
            "difficulty": difficulty,
            "dependencies": dependencies,
            "data_path": challenge_location,
        }

        info_details: Any = {
            "data_path": challenge_location,
            "is_regression": False,
            "task": challenge_data["task"],
            "answer": challenge_data["ground"]["answer"],
            "description": challenge_data["info"]["description"],
            "metrics": {
                "difficulty": difficulty,
                "success": False,
            },
        }

        mock = "--mock" in sys.argv  # Check if --mock is in sys.argv

        if call.excinfo is None:
            info_details["metrics"]["success"] = True
        else:
            if not mock:  # don't remove if it's a mock test
                regression_manager.remove_test(test_name)
            info_details["metrics"]["fail_reason"] = str(call.excinfo.value)

        prev_test_results: list[bool]
        agent_tests: dict[str, list[bool]] = {}

        # if the structure is nested inside of the agent name
        if AGENT_NAME:
            agent_tests = internal_info.tests.get(AGENT_NAME, {})

        if agent_tests:
            prev_test_results = agent_tests.get(test_name, [])
        else:
            prev_test_results = internal_info.tests.get(test_name, [])

        if not mock:
            # only record the result if it's an actual (non-mock) run
            prev_test_results.append(info_details["metrics"]["success"])
            internal_info.add_test(test_name, prev_test_results, AGENT_NAME)

            info_details["metrics"]["success_%"] = calculate_success_percentage(
                prev_test_results
            )
        else:
            # success rate over previously recorded non-mock runs
            info_details["metrics"][
                "non_mock_success_%"
            ] = calculate_success_percentage(prev_test_results)

        if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]:
            # if the last 3 runs were successful, add to the regression tests
            info_details["is_regression"] = True
            regression_manager.add_test(test_name, test_details)

        # user-facing reporting
        item.info_details = info_details

    if call.when == "teardown":
        run_time = dict(item.user_properties).get("run_time")

        info_details = getattr(item, "info_details", {})
        test_name = getattr(item, "test_name", "")

        if info_details and test_name:
            if run_time:
                info_details["metrics"][
                    "run_time"
                ] = f"{str(round(run_time, 3))} seconds"

                info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]

            info_manager.add_test(test_name, info_details)
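
# Rough sketch of one entry this hook accumulates in the user-facing info report
# (field names taken from info_details above; the concrete values are hypothetical):
#
#   "TestWriteFile": {
#       "data_path": "agbenchmark/challenges/interface/write_file",
#       "is_regression": true,
#       "task": "<challenge task prompt>",
#       "answer": "<ground-truth answer>",
#       "description": "<challenge description>",
#       "metrics": {
#           "difficulty": "interface",
#           "success": true,
#           "success_%": 100.0,
#           "run_time": "1.234 seconds"
#       },
#       "reached_cutoff": false
#   }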


def pytest_sessionfinish(session: Any) -> None:
    """Called at the end of the session to save regression tests and info"""
    with open(CONFIG_PATH, "r") as f:
        config = json.load(f)

    internal_info.save()
    info_manager.end_info_report(config)
    regression_manager.save()


# this is adding the dependency marker and category markers automatically from the json
def pytest_collection_modifyitems(items: Any, config: Any) -> None:
    data = get_regression_data()

    for item in items:
        # Assuming item.cls is your test class
        test_class_instance = item.cls()

        # Then you can access your properties
        name = item.parent.cls.__name__
        dependencies = test_class_instance.data.dependencies

        # When running with --improve, drop dependencies that are already regression tests
        if config.getoption("--improve"):
            dependencies = [dep for dep in dependencies if not data.get(dep, None)]
        elif config.getoption("--test"):
            dependencies = []

        categories = test_class_instance.data.category

        # Add depends marker dynamically
        item.add_marker(pytest.mark.depends(on=dependencies, name=name))

        # Add category marker dynamically
        for category in categories:
            item.add_marker(getattr(pytest.mark, category))
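
# Net effect (illustrative, not from the original file): each collected test ends up
# marked roughly as if it had been written with
#
#   @pytest.mark.depends(on=["TestSomeDependency"], name="TestExample")
#   @pytest.mark.retrieval
#
# where the dependency and category names are hypothetical. Category markers allow
# selecting a subset of challenges with standard pytest selection, e.g.
#
#   pytest -m retrieval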