Mirror of https://github.com/aljazceru/Auto-GPT.git (synced 2025-12-18 22:44:21 +01:00)
Squashed commit of the following:

commit 7d6476d3297860f74c276d571da995d958a8cc1a
Author: Reinier van der Leer <pwuts@agpt.co>
Date:   Tue Jan 9 18:10:45 2024 +0100

    refactor(benchmark/challenge): Set up structure to support more challenge providers

    - Move `Challenge`, `ChallengeData`, `load_challenges` to `challenges/builtin.py`
      and rename to `BuiltinChallenge`, `BuiltinChallengeSpec`, `load_builtin_challenges`
    - Create `BaseChallenge` to serve as interface and base class for different
      challenge implementations
    - Create `ChallengeInfo` model to serve as universal challenge info object
    - Create `get_challenge_from_source_uri` function in `challenges/__init__.py`
    - Replace `ChallengeData` with `ChallengeInfo` everywhere except in `BuiltinChallenge`
    - Add strong typing to the `task_informations` store in app.py
    - Use `call.duration` in `finalize_test_report` and remove the `timer` fixture
    - Update docstring on `challenges/__init__.py:get_unique_categories`
    - Add docstring to `generate_test.py`

commit 5df2aa7939b45d85a2c2b5de9ac0522330d1502a
Author: Reinier van der Leer <pwuts@agpt.co>
Date:   Tue Jan 9 16:58:01 2024 +0100

    refactor(benchmark): Refactor & rename functions in agent_interface.py and agent_api_interface.py

    - `copy_artifacts_into_temp_folder` -> `copy_challenge_artifacts_into_workspace`
    - `copy_agent_artifacts_into_folder` -> `download_agent_artifacts_into_folder`
    - Reorder parameters of `run_api_agent` and `copy_challenge_artifacts_into_workspace`;
      use `Path` instead of `str`

commit 6a256fef4c7950b7ee82fb801e70c83afe6b6f8b
Author: Reinier van der Leer <pwuts@agpt.co>
Date:   Tue Jan 9 16:02:25 2024 +0100

    refactor(benchmark): Refactor & typefix report generation and handling logic

    - Rename functions in reports.py and ReportManager.py to better reflect what they do:
      - `get_previous_test_results` -> `get_and_update_success_history`
      - `generate_single_call_report` -> `initialize_test_report`
      - `finalize_reports` -> `finalize_test_report`
      - `ReportManager.end_info_report` -> `SessionReportManager.finalize_session_report`
    - Modify the `pytest_runtest_makereport` hook in conftest.py to finalize the report
      immediately after the challenge finishes running, instead of after teardown
    - Move result processing logic from `initialize_test_report` to `finalize_test_report`
      in reports.py
    - Use the `Test` and `Report` types from report_types.py where possible instead of
      untyped dicts: reports.py, utils.py, ReportManager.py
    - Differentiate `ReportManager` into `SessionReportManager`, `RegressionTestsTracker`,
      and `SuccessRateTracker`
    - Move filtering of optional challenge categories from challenge.py
      (`Challenge.skip_optional_categories`) to conftest.py (`pytest_collection_modifyitems`)
    - Remove the unused `scores` fixture in conftest.py

commit 370d6dbf5df75d78e3878877968e8cd309d6d7fb
Author: Reinier van der Leer <pwuts@agpt.co>
Date:   Tue Jan 9 15:16:43 2024 +0100

    refactor(benchmark): Simplify models in report_types.py

    - Removed the `ForbidOptionalMeta` and `BaseModelBenchmark` classes
    - Changed model attributes to optional: `Metrics.difficulty`, `Metrics.success`,
      `Metrics.success_percentage`, `Metrics.run_time`, and `Test.reached_cutoff`
    - Added a validator to the `Metrics` model to require the `success` and `run_time`
      fields if `attempted=True` (sketched below)
    - Added default values to all optional model fields
    - Removed duplicate imports
    - Added a condition in process_report.py to prevent null lookups if
      `metrics.difficulty` is not set
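One bullet in the last commit above mentions a new validator on the `Metrics` model in report_types.py. As a rough sketch only — assuming Pydantic v1 (consistent with the `.dict()`/`.json()` calls in the file below), and with any fields beyond `attempted`, `success`, and `run_time` being guesses — such a validator could look like:

    from pydantic import BaseModel, validator

    class Metrics(BaseModel):
        attempted: bool
        success: bool | None = None
        run_time: str | None = None
        success_percentage: float | None = None  # optional per the commit message
        difficulty: str | None = None            # optional per the commit message
        cost: float | None = None

        @validator("success", "run_time", always=True)
        def require_if_attempted(cls, v, values, field):
            # `always=True` makes this run even when the field is left at its
            # default, so an attempted test cannot omit its result or run time.
            if values.get("attempted") and v is None:
                raise ValueError(f"'{field.name}' is required if attempted=True")
            return v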
206 lines
6.4 KiB
Python
import copy
import json
import logging
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.graphs import save_single_radar_chart
from agbenchmark.reports.processing.process_report import get_agent_category
from agbenchmark.reports.processing.report_types import MetricsOverall, Report, Test
from agbenchmark.utils.utils import get_highest_success_difficulty

logger = logging.getLogger(__name__)


class SingletonReportManager:
    instance = None

    INFO_MANAGER: "SessionReportManager"
    REGRESSION_MANAGER: "RegressionTestsTracker"
    SUCCESS_RATE_TRACKER: "SuccessRatesTracker"

    def __new__(cls):
        if not cls.instance:
            cls.instance = super(SingletonReportManager, cls).__new__(cls)

            agent_benchmark_config = AgentBenchmarkConfig.load()
            benchmark_start_time_dt = datetime.now(
                timezone.utc
            )  # or any logic to fetch the datetime

            # Make the Managers class attributes
            cls.INFO_MANAGER = SessionReportManager(
                agent_benchmark_config.get_report_dir(benchmark_start_time_dt)
                / "report.json",
                benchmark_start_time_dt,
            )
            cls.REGRESSION_MANAGER = RegressionTestsTracker(
                agent_benchmark_config.regression_tests_file
            )
            cls.SUCCESS_RATE_TRACKER = SuccessRatesTracker(
                agent_benchmark_config.success_rate_file
            )

        return cls.instance

    @classmethod
    def clear_instance(cls):
        cls.instance = None
        cls.INFO_MANAGER = None
        cls.REGRESSION_MANAGER = None
        cls.SUCCESS_RATE_TRACKER = None


class BaseReportManager:
    """Abstracts interaction with a report file"""

    tests: dict[str, Any]

    def __init__(self, report_file: Path):
        self.report_file = report_file

        self.load()

    def load(self) -> None:
        if not self.report_file.exists():
            self.report_file.parent.mkdir(exist_ok=True)

        try:
            with self.report_file.open("r") as f:
                data = json.load(f)
                self.tests = {k: data[k] for k in sorted(data)}
        except FileNotFoundError:
            self.tests = {}
        except json.decoder.JSONDecodeError as e:
            logger.warning(f"Could not parse {self.report_file}: {e}")
            self.tests = {}
            self.save()

    def save(self) -> None:
        with self.report_file.open("w") as f:
            json.dump(self.tests, f, indent=4)

    def remove_test(self, test_name: str) -> None:
        if test_name in self.tests:
            del self.tests[test_name]
            self.save()

    def reset(self) -> None:
        self.tests = {}
        self.save()


class SessionReportManager(BaseReportManager):
    """Abstracts interaction with the session report file"""

    tests: dict[str, Test] | Report

    def __init__(self, report_file: Path, benchmark_start_time: datetime):
        super().__init__(report_file)

        self.start_time = time.time()
        self.benchmark_start_time = benchmark_start_time

    def save(self) -> None:
        with self.report_file.open("w") as f:
            if isinstance(self.tests, Report):
                f.write(self.tests.json(indent=4))
            else:
                json.dump({k: v.dict() for k, v in self.tests.items()}, f, indent=4)

    def add_test_report(self, test_name: str, test_report: Test) -> None:
        if isinstance(self.tests, Report):
            raise RuntimeError("Session report already finalized")

        if test_name.startswith("Test"):
            test_name = test_name[4:]
        self.tests[test_name] = test_report

        self.save()

    def finalize_session_report(self, config: AgentBenchmarkConfig) -> None:
        command = " ".join(sys.argv)

        if isinstance(self.tests, Report):
            raise RuntimeError("Session report already finalized")

        self.tests = Report(
            command=command.split(os.sep)[-1],
            benchmark_git_commit_sha="---",
            agent_git_commit_sha="---",
            completion_time=datetime.now(timezone.utc).strftime(
                "%Y-%m-%dT%H:%M:%S+00:00"
            ),
            benchmark_start_time=self.benchmark_start_time.strftime(
                "%Y-%m-%dT%H:%M:%S+00:00"
            ),
            metrics=MetricsOverall(
                run_time=str(round(time.time() - self.start_time, 2)) + " seconds",
                highest_difficulty=get_highest_success_difficulty(self.tests),
                total_cost=self.get_total_costs(),
            ),
            tests=copy.copy(self.tests),
            config=config.dict(exclude_none=True),
        )

        agent_categories = get_agent_category(self.tests)
        if len(agent_categories) > 1:
            save_single_radar_chart(
                agent_categories,
                config.get_report_dir(self.benchmark_start_time) / "radar_chart.png",
            )

        self.save()

    def get_total_costs(self):
        if isinstance(self.tests, Report):
            tests = self.tests.tests
        else:
            tests = self.tests

        total_cost = 0
        all_costs_none = True
        for test_data in tests.values():
            # metrics.cost may be None if no cost was recorded for this test;
            # coercing it with `or 0.0` would defeat the None-check below.
            cost = test_data.metrics.cost

            if cost is not None:
                all_costs_none = False
                total_cost += cost
        if all_costs_none:
            total_cost = None
        return total_cost


class RegressionTestsTracker(BaseReportManager):
    """Abstracts interaction with the regression tests file"""

    tests: dict[str, dict]

    def add_test(self, test_name: str, test_details: dict) -> None:
        if test_name.startswith("Test"):
            test_name = test_name[4:]
        self.tests[test_name] = test_details

        self.save()

    def has_regression_test(self, test_name: str) -> bool:
        return self.tests.get(test_name) is not None


class SuccessRatesTracker(BaseReportManager):
    """Abstracts interaction with the success rates file"""

    tests: dict[str, list[bool]]

    def update(self, test_name: str, success_history: list[bool]) -> None:
        if test_name.startswith("Test"):
            test_name = test_name[4:]
        self.tests[test_name] = success_history

        self.save()
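For orientation, a minimal usage sketch of the managers defined above. This is hypothetical driver code, not the real call sites (those live in agbenchmark's conftest.py and reports.py); it assumes the module path `agbenchmark.reports.ReportManager` and a valid benchmark config so that `AgentBenchmarkConfig.load()` succeeds:

    from agbenchmark.reports.ReportManager import SingletonReportManager

    managers = SingletonReportManager()  # first call constructs all three managers

    # Both trackers strip a leading "Test" prefix, so results are keyed "WriteFile".
    managers.SUCCESS_RATE_TRACKER.update("TestWriteFile", [True, True, False])
    managers.REGRESSION_MANAGER.add_test("TestWriteFile", {"difficulty": "basic"})
    assert managers.REGRESSION_MANAGER.has_regression_test("WriteFile")

    SingletonReportManager.clear_instance()  # reset between benchmark sessions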