debug(benchmark): Add more debug code to pinpoint cause of rare crash

Target: https://github.com/Significant-Gravitas/AutoGPT/actions/runs/7941977633/job/21684817491
Reinier van der Leer
2024-02-17 15:48:57 +01:00
parent d5ad719757
commit 4ede773f5a
2 changed files with 23 additions and 15 deletions


@@ -33,7 +33,7 @@ class TestResult(BaseModel):
             logger.error(
                 "Error validating `success ^ fail_reason` on TestResult: "
                 f"success = {repr(values['success'])}; "
-                f"fail_reason = {repr(v)} ({v})"
+                f"fail_reason = {repr(v)}"
             )
         if v:
             success = values["success"]
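
For context: the `success ^ fail_reason` message in the hunk above is emitted by a Pydantic validator on TestResult that enforces "fail_reason is set if and only if the test did not succeed". A minimal sketch of that pattern (Pydantic v1 style, as agbenchmark used at the time; the field list is abbreviated and the branch structure after the log call is an assumption based on the XOR the message describes):

    import logging
    from typing import Any, Optional

    from pydantic import BaseModel, validator

    logger = logging.getLogger(__name__)


    class TestResult(BaseModel):
        # Abbreviated sketch: the real model has more fields (run_time, cost, ...)
        success: Optional[bool] = None
        fail_reason: Optional[str] = None

        @validator("fail_reason")
        def success_xor_fail_reason(cls, v: Optional[str], values: dict[str, Any]):
            # Log the offending values before validating, so a rare failure in CI
            # leaves enough information behind to reproduce it afterwards.
            if bool(v) == bool(values["success"]):
                logger.error(
                    "Error validating `success ^ fail_reason` on TestResult: "
                    f"success = {repr(values['success'])}; "
                    f"fail_reason = {repr(v)}"
                )
            if v:
                success = values["success"]
                assert not success, "fail_reason should only be set on failure"
            else:
                assert values["success"], "fail_reason is required on failure"
            return v

With this shape, something like TestResult(success=False, fail_reason="") fails validation because both values are falsy; an edge case along those lines is exactly what the extra logging is meant to surface.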


@@ -4,6 +4,7 @@ import os
 from pathlib import Path

 import pytest
+from pydantic import ValidationError

 from agbenchmark.challenges import ChallengeInfo
 from agbenchmark.config import AgentBenchmarkConfig
@@ -86,21 +87,28 @@ def add_test_result_to_report(
     else:
         test_report.metrics.attempted = True

-    test_report.results.append(
-        TestResult(
-            success=call.excinfo is None,
-            run_time=f"{str(round(call.duration, 3))} seconds",
-            fail_reason=None if call.excinfo is None else str(call.excinfo.value),
-            reached_cutoff=user_properties.get("timed_out", False),
-            n_steps=user_properties.get("n_steps"),
-            cost=user_properties.get("agent_task_cost"),
+    try:
+        test_report.results.append(
+            TestResult(
+                success=call.excinfo is None,
+                run_time=f"{str(round(call.duration, 3))} seconds",
+                fail_reason=None if call.excinfo is None else str(call.excinfo.value),
+                reached_cutoff=user_properties.get("timed_out", False),
+                n_steps=user_properties.get("n_steps"),
+                cost=user_properties.get("agent_task_cost"),
+            )
         )
-    )
-    test_report.metrics.success_percentage = (
-        sum(r.success or False for r in test_report.results)
-        / len(test_report.results)
-        * 100
-    )
+        test_report.metrics.success_percentage = (
+            sum(r.success or False for r in test_report.results)
+            / len(test_report.results)
+            * 100
+        )
+    except ValidationError:
+        logger.error(
+            "Validation failed on TestResult; "
+            f"call.excinfo = {repr(call.excinfo)} ({call.excinfo})"
+        )
+        raise

     prev_test_results: list[bool | None] = get_and_update_success_history(
         test_name, test_report.results[-1].success
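
The try/except ValidationError wrapper is the substance of the added debug code: if building a TestResult from pytest's call info ever fails validation, the raw call.excinfo is logged in both repr() and str() form before the exception is re-raised, so the CI run linked above records which input triggered the crash. A rough standalone illustration of the same pattern, reusing the TestResult sketch earlier in this section (FakeExcInfo and record_result are hypothetical names standing in for pytest's ExceptionInfo and agbenchmark's reporting code, not actual agbenchmark identifiers):

    import logging

    from pydantic import ValidationError

    # Assumes the TestResult sketch defined earlier in this section is in scope.
    logger = logging.getLogger(__name__)


    class FakeExcInfo:
        """Hypothetical stand-in for pytest's ExceptionInfo (call.excinfo)."""

        def __init__(self, value: BaseException):
            self.value = value

        def __repr__(self) -> str:
            return f"<FakeExcInfo {self.value!r}>"


    def record_result(excinfo: FakeExcInfo | None) -> "TestResult":
        # Same shape as the diff: build the model, and if validation blows up,
        # log the raw excinfo before re-raising so the original error still fails CI.
        try:
            return TestResult(
                success=excinfo is None,
                fail_reason=None if excinfo is None else str(excinfo.value),
            )
        except ValidationError:
            logger.error(
                "Validation failed on TestResult; "
                f"excinfo = {repr(excinfo)} ({excinfo})"
            )
            raise


    # An exception whose str() is empty (e.g. AssertionError()) yields
    # fail_reason == "", which is falsy while success is False, so the
    # validator's XOR check fails -- one plausible way to hit the crash:
    # record_result(FakeExcInfo(AssertionError()))  # logs the repr, then raises

Whether an empty exception message is the actual failure mode is what the linked CI run is meant to confirm; the logging here is the diagnostic, not the fix.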