feat(benchmark): Add -N, --attempts option for multiple attempts per challenge

LLMs are probabilistic systems. Reproducibility of completions is not guaranteed. It only makes sense to account for this by running challenges multiple times to obtain a success ratio rather than a boolean success/failure result. Changes: - Add `-N`, `--attempts` option to CLI and `attempts_per_challenge` parameter to `main.py:run_benchmark`. - Add dynamic `i_attempt` fixture through `pytest_generate_tests` hook in conftest.py to achieve multiple runs per challenge. - Modify `pytest_runtest_makereport` hook in conftest.py to handle multiple reporting calls per challenge. - Refactor report_types.py, reports.py, process_report.py to allow multiple results per challenge. - Calculate `success_percentage` from results of the current run, rather than all known results ever. - Add docstrings to a number of models in report_types.py. - Allow `None` as a success value, e.g. for runs that did not render any results before being cut off. - Make SingletonReportManager thread-safe.
2025-12-17 14:04:27 +01:00 · 2024-01-22 14:37:12 +01:00
parent 488f40a20f
commit a0cae78ba3
12 changed files with 181 additions and 141 deletions
--- a/benchmark/agbenchmark/utils/utils.py
+++ b/benchmark/agbenchmark/utils/utils.py
@@ -32,17 +32,6 @@ def replace_backslash(value: Any) -> Any:
        return value


-def calculate_success_percentage(results: list[bool]) -> float:
-    # Take the last 10 results or all if less than 10
-    last_results = results[-10:] if len(results) > 10 else results
-    success_count = last_results.count(True)
-    total_count = len(last_results)
-    if total_count == 0:
-        return 0
-    success_percentage = (success_count / total_count) * 100  # as a percentage
-    return round(success_percentage, 2)
-
-
 def get_test_path(json_file: str | Path) -> str:
    if isinstance(json_file, str):
        json_file = Path(json_file)
@@ -71,8 +60,8 @@ def get_highest_success_difficulty(

    for test_name, test_data in data.items():
        try:
-            if test_data.metrics.success:
-                difficulty_str = test_data.metrics.difficulty
+            if any(r.success for r in test_data.results):
+                difficulty_str = test_data.difficulty
                if not difficulty_str:
                    continue