From 4ede773f5a8639ed41d8b33e3325608f35cebd4f Mon Sep 17 00:00:00 2001
From: Reinier van der Leer
Date: Sat, 17 Feb 2024 15:48:57 +0100
Subject: [PATCH] debug(benchmark): Add more debug code to pinpoint cause of
 rare crash

Target: https://github.com/Significant-Gravitas/AutoGPT/actions/runs/7941977633/job/21684817491
---
 .../reports/processing/report_types.py   |  2 +-
 benchmark/agbenchmark/reports/reports.py | 36 +++++++++++--------
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/benchmark/agbenchmark/reports/processing/report_types.py b/benchmark/agbenchmark/reports/processing/report_types.py
index a3ad8af4..0475455a 100644
--- a/benchmark/agbenchmark/reports/processing/report_types.py
+++ b/benchmark/agbenchmark/reports/processing/report_types.py
@@ -33,7 +33,7 @@ class TestResult(BaseModel):
             logger.error(
                 "Error validating `success ^ fail_reason` on TestResult: "
                 f"success = {repr(values['success'])}; "
-                f"fail_reason = {repr(v)} ({v})"
+                f"fail_reason = {repr(v)}"
             )
         if v:
             success = values["success"]
diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py
index 60accd58..53804602 100644
--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/agbenchmark/reports/reports.py
@@ -4,6 +4,7 @@ import os
 from pathlib import Path
 
 import pytest
+from pydantic import ValidationError
 
 from agbenchmark.challenges import ChallengeInfo
 from agbenchmark.config import AgentBenchmarkConfig
@@ -86,21 +87,28 @@ def add_test_result_to_report(
     else:
         test_report.metrics.attempted = True
 
-    test_report.results.append(
-        TestResult(
-            success=call.excinfo is None,
-            run_time=f"{str(round(call.duration, 3))} seconds",
-            fail_reason=None if call.excinfo is None else str(call.excinfo.value),
-            reached_cutoff=user_properties.get("timed_out", False),
-            n_steps=user_properties.get("n_steps"),
-            cost=user_properties.get("agent_task_cost"),
+    try:
+        test_report.results.append(
+            TestResult(
+                success=call.excinfo is None,
+                run_time=f"{str(round(call.duration, 3))} seconds",
+                fail_reason=None if call.excinfo is None else str(call.excinfo.value),
+                reached_cutoff=user_properties.get("timed_out", False),
+                n_steps=user_properties.get("n_steps"),
+                cost=user_properties.get("agent_task_cost"),
+            )
         )
-    )
-    test_report.metrics.success_percentage = (
-        sum(r.success or False for r in test_report.results)
-        / len(test_report.results)
-        * 100
-    )
+        test_report.metrics.success_percentage = (
+            sum(r.success or False for r in test_report.results)
+            / len(test_report.results)
+            * 100
+        )
+    except ValidationError:
+        logger.error(
+            "Validation failed on TestResult; "
+            f"call.excinfo = {repr(call.excinfo)} ({call.excinfo})"
+        )
+        raise
 
     prev_test_results: list[bool | None] = get_and_update_success_history(
         test_name, test_report.results[-1].success
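
Note (not part of the patch): below is a minimal standalone sketch of the guard
pattern this patch introduces, i.e. constructing the pydantic model inside a
try/except ValidationError block, logging the offending inputs, and re-raising
so the failure still surfaces. The `Result` model and `record()` helper are
hypothetical stand-ins for illustration only, not agbenchmark code.

import logging

from pydantic import BaseModel, ValidationError

logger = logging.getLogger(__name__)


class Result(BaseModel):  # hypothetical stand-in for agbenchmark's TestResult
    success: bool
    fail_reason: str | None = None


def record(success: bool, fail_reason: str | None) -> Result:
    try:
        # Model construction is where pydantic raises ValidationError.
        return Result(success=success, fail_reason=fail_reason)
    except ValidationError:
        # Log the inputs that produced the invalid model, then re-raise so
        # the error still propagates, now with its context on record. This
        # mirrors the patch's goal of pinpointing the rare CI crash.
        logger.error(
            "Validation failed on Result; "
            f"success = {success!r}, fail_reason = {fail_reason!r}"
        )
        raise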