Update agbenchmark to v0.0.9
@@ -1,50 +1,142 @@
#!/usr/bin/env python3

import json
import logging
import re
import sys
from collections import defaultdict
from pathlib import Path

from tabulate import tabulate

info = "-v" in sys.argv
debug = "-vv" in sys.argv
granular = "--granular" in sys.argv

logging.basicConfig(
    level=logging.DEBUG if debug else logging.INFO if info else logging.WARNING
)
logger = logging.getLogger(__name__)

# Get a list of all JSON files in the directory
json_files = [
    f for f in (Path(__file__).parent / "reports").iterdir() if f.name.endswith(".json")
report_files = [
    report_file
    for dir in (Path(__file__).parent / "reports").iterdir()
    if re.match(r"^\d{8}T\d{6}_", dir.name)
    and (report_file := dir / "report.json").is_file()
]

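For clarity, a small illustrative check of the timestamp-prefix filter used above; the folder names here are hypothetical, not taken from the repo:

import re

# Hypothetical names; only the leading YYYYMMDDTHHMMSS_ prefix matters.
assert re.match(r"^\d{8}T\d{6}_", "20230814T201030_full_run")
assert not re.match(r"^\d{8}T\d{6}_", "regression_tests.json")
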
# Create sets to store unique suffixes and test names
labels = list()
test_names = list()
labels = list[str]()
runs_per_label = defaultdict[str, int](lambda: 0)
suite_names = list[str]()
test_names = list[str]()

# Create a dictionary to store grouped success values by suffix and test
grouped_success_values = defaultdict(list)
grouped_success_values = defaultdict[str, list[str]](list[str])

# Loop through each JSON file to collect suffixes and success values
for json_file in sorted(json_files, key=lambda f: f.name.split("_")[1]):
    if len(json_file.name.split("_")) < 3:
        label = json_file.name.split("_")[0]
    else:
        label = json_file.name.split("_", 2)[2].rsplit(".", 1)[0]
for report_file in sorted(report_files):
    with open(report_file) as f:
        logger.info(f"Loading {report_file}...")

        data = json.load(f)
        if "tests" in data:
            test_tree = data["tests"]
            label = data["agent_git_commit_sha"].rsplit("/", 1)[1][:7]  # commit hash
        else:
            # Benchmark run still in progress
            test_tree = data
            label = report_file.parent.name.split("_", 1)[1]
            logger.info(f"Run '{label}' seems to be in progress")

        runs_per_label[label] += 1

        def process_test(test_name: str, test_data: dict):
            result_group = grouped_success_values[f"{label}|{test_name}"]

            if "tests" in test_data:
                logger.debug(f"{test_name} is a test suite")

                # Test suite
                suite_attempted = any(
                    test["metrics"]["attempted"] for test in test_data["tests"].values()
                )
                logger.debug(f"suite_attempted: {suite_attempted}")
                if not suite_attempted:
                    return

                if test_name not in test_names:
                    test_names.append(test_name)

                if test_data["metrics"]["percentage"] == 0:
                    result_indicator = "❌"
                else:
                    highest_difficulty = test_data["metrics"]["highest_difficulty"]
                    result_indicator = {
                        "interface": "🔌",
                        "novice": "🌑",
                        "basic": "🌒",
                        "intermediate": "🌓",
                        "advanced": "🌔",
                        "hard": "🌕",
                    }[highest_difficulty]

                logger.debug(f"result group: {result_group}")
                logger.debug(f"runs_per_label: {runs_per_label[label]}")
                if len(result_group) + 1 < runs_per_label[label]:
                    result_group.extend(
                        ["❔"] * (runs_per_label[label] - len(result_group) - 1)
                    )
                result_group.append(result_indicator)
                logger.debug(f"result group (after): {result_group}")

                if granular:
                    for test_name, test in test_data["tests"].items():
                        process_test(test_name, test)
                return

            test_metrics = test_data["metrics"]
            result_indicator = "❔"

            if not "attempted" in test_metrics:
                return
            elif test_metrics["attempted"]:
                if test_name not in test_names:
                    test_names.append(test_name)

                success_value = test_metrics["success"]
                result_indicator = {True: "✅", False: "❌"}[success_value]

            if len(result_group) + 1 < runs_per_label[label]:
                result_group.extend(
                    [" "] * (runs_per_label[label] - len(result_group) - 1)
                )
            result_group.append(result_indicator)

        for test_name, suite in test_tree.items():
            try:
                process_test(test_name, suite)
            except KeyError as e:
                print(f"{test_name}.metrics: {suite['metrics']}")
                raise

    if label not in labels:
        labels.append(label)

with open(json_file) as f:
|
||||
data = json.load(f)
|
||||
for test_name in data["tests"]:
|
||||
if test_name not in test_names:
|
||||
test_names.append(test_name)
|
||||
success_value = data["tests"][test_name]["metrics"]["success"]
|
||||
grouped_success_values[f"{label}|{test_name}"].append(
|
||||
{True: "✅", False: "❌"}[success_value]
|
||||
)
|
||||
|
||||
# Create headers
headers = ["Test Name"] + list(labels)

# Prepare data for tabulation
table_data = []
table_data = list[list[str]]()
for test_name in test_names:
    row = [test_name]
    for label in labels:
        success_values = grouped_success_values.get(f"{label}|{test_name}", ["❔"])
        row.append(" ".join(success_values))
        results = grouped_success_values.get(f"{label}|{test_name}", ["❔"])
        if len(results) < runs_per_label[label]:
            results.extend(["❔"] * (runs_per_label[label] - len(results)))
        if len(results) > 1 and all(r == "❔" for r in results):
            results.clear()
        row.append(" ".join(results))
    table_data.append(row)

# Print tabulated data
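The hunk ends just after this comment; as a minimal sketch of the step it refers to, the collected rows would be fed to tabulate roughly like this (the tablefmt value is an assumption, not taken from the commit):

# Render the collected rows; "github" table format is assumed here.
print(tabulate(table_data, headers=headers, tablefmt="github"))
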
@@ -1,4 +1 @@
{
  "workspace": "auto_gpt_workspace",
  "entry_path": "agbenchmark.benchmarks"
}
{"workspace": "auto_gpt_workspace", "entry_path": "agbenchmark.benchmarks"}

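The change above only reflows the JSON; a tiny illustrative check (not part of the commit) that both forms decode to the same mapping:

import json

multi_line = '{\n  "workspace": "auto_gpt_workspace",\n  "entry_path": "agbenchmark.benchmarks"\n}'
one_line = '{"workspace": "auto_gpt_workspace", "entry_path": "agbenchmark.benchmarks"}'
assert json.loads(multi_line) == json.loads(one_line)
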
@@ -1,24 +0,0 @@
{
  "TestBasicCodeGeneration": {
    "difficulty": "basic",
    "dependencies": [
      "TestWriteFile"
    ],
    "data_path": "agbenchmark/challenges/code/d3"
  },
  "TestBasicMemory": {
    "difficulty": "basic",
    "data_path": "agbenchmark/challenges/memory/m1"
  },
  "TestReadFile": {
    "difficulty": "basic",
    "dependencies": [
      "TestWriteFile"
    ],
    "data_path": "agbenchmark/challenges/interface/read_file"
  },
  "TestWriteFile": {
    "dependencies": [],
    "data_path": "agbenchmark/challenges/interface/write_file"
  }
}

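The removed file above encodes per-test dependencies; as an aside, a hypothetical sketch (not agbenchmark's own logic) of turning such a map into a valid run order with the standard-library graphlib module:

from graphlib import TopologicalSorter

# Dependency map shaped like the removed JSON; the names are copied from it,
# the ordering logic itself is purely illustrative.
deps = {
    "TestBasicCodeGeneration": ["TestWriteFile"],
    "TestBasicMemory": [],
    "TestReadFile": ["TestWriteFile"],
    "TestWriteFile": [],
}
# TestWriteFile comes out before TestReadFile and TestBasicCodeGeneration.
print(list(TopologicalSorter(deps).static_order()))
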
@@ -31,7 +31,7 @@ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_
prompt_toolkit>=3.0.38
pydantic
inflection
agbenchmark
agbenchmark>=0.0.9
agent-protocol>=0.1.1

# web server

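To confirm locally that the bumped benchmark dependencies resolved, a quick illustrative check with the standard-library importlib.metadata (package names as in the requirements above):

from importlib.metadata import version

print(version("agbenchmark"))     # expect >= 0.0.9 after this change
print(version("agent-protocol"))  # expect >= 0.1.1
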