Auto-GPT/benchmark/agbenchmark/utils/utils.py
Reinier van der Leer 9012ff4db2 refactor(benchmark): Interface & type consolidation, and arch change, to allow adding challenge providers (2024-01-18 15:19:06 +01:00)
Squashed commit of the following:

commit 7d6476d3297860f74c276d571da995d958a8cc1a
Author: Reinier van der Leer <pwuts@agpt.co>
Date:   Tue Jan 9 18:10:45 2024 +0100

    refactor(benchmark/challenge): Set up structure to support more challenge providers

    - Move `Challenge`, `ChallengeData`, `load_challenges` to `challenges/builtin.py` and rename to `BuiltinChallenge`, `BuiltinChallengeSpec`, `load_builtin_challenges`
    - Create `BaseChallenge` to serve as interface and base class for different challenge implementations (see the sketch after this list)
    - Create `ChallengeInfo` model to serve as universal challenge info object
    - Create `get_challenge_from_source_uri` function in `challenges/__init__.py`
    - Replace `ChallengeData` by `ChallengeInfo` everywhere except in `BuiltinChallenge`
    - Add strong typing to `task_informations` store in app.py
    - Use `call.duration` in `finalize_test_report` and remove `timer` fixture
    - Update docstring on `challenges/__init__.py:get_unique_categories`
    - Add docstring to `generate_test.py`
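
    To make the new structure concrete, here is a minimal sketch of how the
    pieces might fit together (field names, defaults, and the provider
    dispatch are illustrative assumptions, not the actual implementation):

        from abc import ABC, abstractmethod

        from pydantic import BaseModel


        class ChallengeInfo(BaseModel):
            """Universal challenge info object, shared by all providers."""

            name: str
            task: str
            category: list[str]
            source_uri: str


        class BaseChallenge(ABC):
            """Interface and base class for challenge implementations."""

            info: ChallengeInfo

            @classmethod
            @abstractmethod
            def from_source_uri(cls, source_uri: str) -> type["BaseChallenge"]:
                """Build a concrete challenge class from its source URI."""


        def get_challenge_from_source_uri(source_uri: str) -> type[BaseChallenge]:
            provider = source_uri.split("/", 1)[0]
            if provider == "agbenchmark":
                from agbenchmark.challenges.builtin import BuiltinChallenge

                return BuiltinChallenge.from_source_uri(source_uri)
            raise ValueError(f"Unknown challenge provider '{provider}'")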

commit 5df2aa7939b45d85a2c2b5de9ac0522330d1502a
Author: Reinier van der Leer <pwuts@agpt.co>
Date:   Tue Jan 9 16:58:01 2024 +0100

    refactor(benchmark): Refactor & rename functions in agent_interface.py and agent_api_interface.py

    - `copy_artifacts_into_temp_folder` -> `copy_challenge_artifacts_into_workspace`
    - `copy_agent_artifacts_into_folder` -> `download_agent_artifacts_into_folder`
    - Reorder parameters of `run_api_agent`, `copy_challenge_artifacts_into_workspace`; use `Path` instead of `str` (rough signature sketch below)
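
    A possible shape for the reordered, `Path`-typed signature (the parameter
    names here are illustrative assumptions):

        from pathlib import Path

        def copy_challenge_artifacts_into_workspace(
            challenge_dir_path: Path, artifact_folder_name: str, workspace: Path
        ) -> None:
            ...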

commit 6a256fef4c7950b7ee82fb801e70c83afe6b6f8b
Author: Reinier van der Leer <pwuts@agpt.co>
Date:   Tue Jan 9 16:02:25 2024 +0100

    refactor(benchmark): Refactor & typefix report generation and handling logic

    - Rename functions in reports.py and ReportManager.py to better reflect what they do
       - `get_previous_test_results` -> `get_and_update_success_history`
       - `generate_single_call_report` -> `initialize_test_report`
       - `finalize_reports` -> `finalize_test_report`
       - `ReportManager.end_info_report` -> `SessionReportManager.finalize_session_report`
    - Modify `pytest_runtest_makereport` hook in conftest.py to finalize the report immediately after the challenge finishes running instead of after teardown (see the hook sketch after this list)
       - Move result processing logic from `initialize_test_report` to `finalize_test_report` in reports.py
    - Use `Test` and `Report` types from report_types.py where possible instead of untyped dicts: reports.py, utils.py, ReportManager.py
    - Differentiate `ReportManager` into `SessionReportManager`, `RegressionTestsTracker`, `SuccessRateTracker`
    - Move filtering of optional challenge categories from challenge.py (`Challenge.skip_optional_categories`) to conftest.py (`pytest_collection_modifyitems`)
    - Remove unused `scores` fixture in conftest.py
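
    A rough sketch of the adjusted hook (the import path and the exact
    `finalize_test_report` signature are assumptions):

        import pytest

        from agbenchmark.reports.reports import finalize_test_report

        @pytest.hookimpl(hookwrapper=True)
        def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo):
            outcome = yield
            if call.when == "call":
                # The challenge body has just finished; finalize the report now
                # instead of waiting for teardown. `call.duration` supplies the
                # run time, replacing the removed `timer` fixture.
                finalize_test_report(item, outcome.get_result(), call)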

commit 370d6dbf5df75d78e3878877968e8cd309d6d7fb
Author: Reinier van der Leer <pwuts@agpt.co>
Date:   Tue Jan 9 15:16:43 2024 +0100

    refactor(benchmark): Simplify models in report_types.py

    - Removed ForbidOptionalMeta and BaseModelBenchmark classes.
    - Changed model attributes to optional: `Metrics.difficulty`, `Metrics.success`, `Metrics.success_percentage`, `Metrics.run_time`, and `Test.reached_cutoff`.
    - Added validator to `Metrics` model to require `success` and `run_time` fields if `attempted=True` (sketched below).
    - Added default values to all optional model fields.
    - Removed duplicate imports.
    - Added condition in process_report.py to prevent null lookups if `metrics.difficulty` is not set.
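
    A minimal sketch of what that validator could look like (pydantic v1
    style; types and defaults are assumptions):

        from typing import Optional

        from pydantic import BaseModel, validator

        class Metrics(BaseModel):
            attempted: bool
            difficulty: Optional[str] = None
            success: Optional[bool] = None
            success_percentage: Optional[float] = None
            run_time: Optional[str] = None

            @validator("success", "run_time", always=True)
            def require_metrics_if_attempted(cls, v, values, field):
                if values.get("attempted") and v is None:
                    raise ValueError(f"'{field.name}' is required if attempted=True")
                return v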

# radio charts, logs, helper functions for tests, anything else relevant.
import json
import logging
import os
import re
from pathlib import Path
from typing import Any, Optional

from dotenv import load_dotenv

from agbenchmark.reports.processing.report_types import Test
from agbenchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel

load_dotenv()

AGENT_NAME = os.getenv("AGENT_NAME")
REPORT_LOCATION = os.getenv("REPORT_LOCATION", None)

logger = logging.getLogger(__name__)


def replace_backslash(value: Any) -> Any:
    if isinstance(value, str):
        return re.sub(
            r"\\+", "/", value
        )  # replace one or more backslashes with a forward slash
    elif isinstance(value, list):
        return [replace_backslash(i) for i in value]
    elif isinstance(value, dict):
        return {k: replace_backslash(v) for k, v in value.items()}
    else:
        return value
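

# Illustrative usage (editorial example, not part of the original module):
#   replace_backslash("agbenchmark\\challenges\\abilities")
#   -> "agbenchmark/challenges/abilities"
# Lists and dicts are traversed recursively; other types pass through unchanged.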


def calculate_success_percentage(results: list[bool]) -> float:
    # Take the last 10 results or all if less than 10
    last_results = results[-10:] if len(results) > 10 else results
    success_count = last_results.count(True)
    total_count = len(last_results)
    if total_count == 0:
        return 0
    success_percentage = (success_count / total_count) * 100  # as a percentage
    return round(success_percentage, 2)
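

# Example (editorial note): calculate_success_percentage([True, False, True, True])
# returns 75.0; only the 10 most recent results are taken into account.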


def get_test_path(json_file: str | Path) -> str:
    if isinstance(json_file, str):
        json_file = Path(json_file)

    # Find the index of "benchmark" in the path parts
    try:
        benchmark_index = json_file.parts.index("benchmark")
    except ValueError:
        raise ValueError("Invalid challenge location.")

    # Create the path from "benchmark" onwards
    challenge_location = Path(*json_file.parts[benchmark_index:])

    formatted_location = replace_backslash(str(challenge_location))
    if isinstance(formatted_location, str):
        return formatted_location
    else:
        return str(challenge_location)
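

# Example (editorial note, with a made-up path):
#   get_test_path("/home/user/AutoGPT/benchmark/agbenchmark/challenges/x/data.json")
#   -> "benchmark/agbenchmark/challenges/x/data.json"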


def get_highest_success_difficulty(
    data: dict[str, Test], just_string: Optional[bool] = None
) -> str:
    highest_difficulty = None
    highest_difficulty_level = 0

    for test_name, test_data in data.items():
        try:
            if test_data.metrics.success:
                difficulty_str = test_data.metrics.difficulty
                if not difficulty_str:
                    continue

                try:
                    difficulty_enum = DifficultyLevel[difficulty_str.lower()]
                    difficulty_level = DIFFICULTY_MAP[difficulty_enum]

                    if difficulty_level > highest_difficulty_level:
                        highest_difficulty = difficulty_enum
                        highest_difficulty_level = difficulty_level
                except KeyError:
                    logger.warning(
                        f"Unexpected difficulty level '{difficulty_str}' "
                        f"in test '{test_name}'"
                    )
                    continue
        except Exception as e:
            logger.warning(
                "An unexpected error [1] occurred while analyzing report [2]. "
                "Please notify a maintainer.\n"
                f"Report data [2]: {data}\n"
                f"Error [1]: {e}"
            )
            logger.warning(
                "Make sure you selected the right test; no reports were generated."
            )
            break

    if highest_difficulty is not None:
        highest_difficulty_str = highest_difficulty.name  # convert enum to string
    else:
        highest_difficulty_str = ""

    if highest_difficulty_level and not just_string:
        return f"{highest_difficulty_str}: {highest_difficulty_level}"
    elif highest_difficulty_str:
        return highest_difficulty_str

    return "No successful tests"


# def get_git_commit_sha(directory: Path) -> Optional[str]:
#     try:
#         repo = git.Repo(directory)
#         remote_url = repo.remotes.origin.url
#         if remote_url.endswith(".git"):
#             remote_url = remote_url[:-4]
#         git_commit_sha = f"{remote_url}/tree/{repo.head.commit.hexsha}"
#         # logger.debug(f"GIT_COMMIT_SHA: {git_commit_sha}")
#         return git_commit_sha
#     except Exception:
#         # logger.error(f"{directory} is not a git repository!")
#         return None


def write_pretty_json(data, json_file):
    sorted_data = deep_sort(data)
    json_graph = json.dumps(sorted_data, indent=4)
    with open(json_file, "w") as f:
        f.write(json_graph)
        f.write("\n")


def deep_sort(obj):
    """
    Recursively sort the keys in a JSON object
    """
    if isinstance(obj, dict):
        return {k: deep_sort(v) for k, v in sorted(obj.items())}
    if isinstance(obj, list):
        return [deep_sort(elem) for elem in obj]
    return obj
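

# Illustrative usage of the JSON helpers (editorial sketch; the path and data
# are made up, not part of the original module):
if __name__ == "__main__":
    example = {"b": [2, 1], "a": {"y": 0, "x": 1}}
    write_pretty_json(example, "/tmp/example.json")  # keys are emitted sorted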