Auto-GPT/benchmark/agbenchmark/reports/reports.py
Reinier van der Leer 056163ee57 refactor(benchmark): Disable Helicone integrations
We want to upgrade the OpenAI library, but `helicone` does not support `openai@^1.0.0`, so we're disabling the Helicone integration for now.
2024-01-16 15:38:47 +01:00

import json
import logging
import os
import sys
from pathlib import Path
from typing import Any, Dict

import pytest

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.ReportManager import SingletonReportManager
from agbenchmark.utils.data_types import ChallengeData, DifficultyLevel
from agbenchmark.utils.utils import calculate_success_percentage

# from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone

logger = logging.getLogger(__name__)


def get_previous_test_results(
    test_name: str, info_details: dict[str, Any]
) -> list[bool]:
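    """Fetch the stored success history for `test_name` and refresh its success rate.

    Unless IS_MOCK is set, the current result is appended to the stored history.
    The success percentage is recalculated either way and written to
    `info_details["metrics"]["success_%"]`.
    """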
    mock = os.getenv("IS_MOCK")  # set when --mock is passed on the command line

    prev_test_results = SingletonReportManager().INTERNAL_INFO_MANAGER.tests.get(
        test_name, []
    )

    if not mock:
        # only record the result if it's an actual (non-mock) test run
        prev_test_results.append(info_details["metrics"]["success"])
        SingletonReportManager().INTERNAL_INFO_MANAGER.add_test(
            test_name, prev_test_results
        )

    # the success rate can be calculated regardless of mocking
    info_details["metrics"]["success_%"] = calculate_success_percentage(
        prev_test_results
    )

    return prev_test_results


def update_regression_tests(
    prev_test_results: list[bool],
    info_details: dict,
    test_name: str,
    test_details: dict,
) -> None:
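    """Mark `test_name` as a regression test if its last three runs all succeeded."""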
    if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]:
        # if the last 3 runs were successful, add the test to the regression suite
        info_details["is_regression"] = True
        SingletonReportManager().REGRESSION_MANAGER.add_test(test_name, test_details)


def generate_single_call_report(
    item: pytest.Item,
    call: pytest.CallInfo,
    challenge_data: ChallengeData,
    answers: dict[str, Any],
    challenge_location: str,
    test_name: str,
) -> dict[str, Any]:
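    """Build the report entry for a single challenge run.

    Records difficulty, success/failure (including the fail reason and skip
    status taken from `call.excinfo`), updates the stored test history and
    regression status, attaches the result to `item`, and returns the entry.
    """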
    difficulty = challenge_data.info.difficulty

    if isinstance(difficulty, DifficultyLevel):
        difficulty = difficulty.value

    # Extract the challenge_location from the class
    # challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
    # test_name = item.nodeid.split("::")[1]
    # item.test_name = test_name

    test_details = {
        "difficulty": difficulty,
        "data_path": challenge_location,
    }

    info_details: dict[str, Any] = {
        "data_path": challenge_location,
        "is_regression": False,
        "category": challenge_data.category,
        "task": challenge_data.task,
        "answer": challenge_data.ground.answer,
        "description": challenge_data.info.description,
        "metrics": {
            "difficulty": difficulty,
            "success": False,
            "attempted": True,
        },
        # "answers": answers,
    }
    if answers:
        info_details["answers"] = answers

    if challenge_data.metadata:
        info_details["metadata"] = challenge_data.metadata

    mock = os.getenv("IS_MOCK")  # set when --mock is passed on the command line
    if call:
        if call.excinfo is None:
            info_details["metrics"]["success"] = True
        else:
            if not mock:  # don't remove if it's a mock test
                SingletonReportManager().REGRESSION_MANAGER.remove_test(test_name)
            info_details["metrics"]["fail_reason"] = str(call.excinfo.value)
            if call.excinfo.typename == "Skipped":
                info_details["metrics"]["attempted"] = False

    prev_test_results: list[bool] = get_previous_test_results(test_name, info_details)

    update_regression_tests(prev_test_results, info_details, test_name, test_details)

    # user-facing reporting
    if item:
        item.info_details = info_details

    return info_details


def finalize_reports(
    config: AgentBenchmarkConfig, item: pytest.Item, challenge_data: ChallengeData
) -> None:
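    """Complete the report entry for `item` and hand it to the INFO_MANAGER.

    Fills in cost (currently always None while the Helicone integration is
    disabled), run time, and cutoff status, and, outside of mock runs, updates
    the challenges-already-beaten file.
    """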
    run_time = dict(item.user_properties).get("run_time")

    info_details = getattr(item, "info_details", {})
    test_name = getattr(item, "test_name", "")

    if info_details and test_name:
        if run_time is not None:
            cost = None
            # if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
            #     logger.debug("Getting cost from Helicone")
            #     cost = get_data_from_helicone(test_name)
            #     logger.debug(f"Cost: {cost}")
            info_details["metrics"]["cost"] = cost

            if info_details["metrics"].get("success") is None:
                info_details["metrics"]["attempted"] = False
                info_details["metrics"]["success"] = False
            elif (
                info_details["metrics"].get("success") is False
                and "attempted" not in info_details["metrics"]
            ):
                info_details["metrics"]["attempted"] = False

            info_details["metrics"]["run_time"] = f"{round(run_time, 3)} seconds"

            info_details["reached_cutoff"] = float(run_time) > challenge_data.cutoff

            if "--mock" not in sys.argv:
                update_challenges_already_beaten(
                    config.challenges_already_beaten_file, info_details, test_name
                )
                if info_details.get("tests") is not None:
                    for nested_test_name, nested_test_info in info_details[
                        "tests"
                    ].items():
                        update_challenges_already_beaten(
                            config.challenges_already_beaten_file,
                            nested_test_info,
                            nested_test_name,
                        )

        SingletonReportManager().INFO_MANAGER.add_test(test_name, info_details)


def update_challenges_already_beaten(
    challenges_already_beaten_file: Path, info_details: Dict[str, Any], test_name: str
) -> None:
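    """Record in `challenges_already_beaten_file` whether `test_name` has ever been beaten."""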
    current_run_successful = info_details["metrics"]["success"]
    try:
        with open(challenges_already_beaten_file, "r") as f:
            challenge_data = json.load(f)
    except FileNotFoundError:
        challenge_data = {}
    challenge_beaten_in_the_past = challenge_data.get(test_name)

    # a challenge counts as beaten if it succeeded in this run or any earlier one
    challenge_data[test_name] = bool(
        challenge_beaten_in_the_past or current_run_successful
    )

    with open(challenges_already_beaten_file, "w") as f:
        json.dump(challenge_data, f, indent=4)


def session_finish(
    agbenchmark_config: AgentBenchmarkConfig, suite_reports: dict
) -> None:
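    """Persist all collected reports at the end of the benchmark session."""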
    SingletonReportManager().INTERNAL_INFO_MANAGER.save()
    SingletonReportManager().INFO_MANAGER.end_info_report(agbenchmark_config)
    SingletonReportManager().REGRESSION_MANAGER.save()