From a4ef53c55c06d19c722a5197d360aadc09ff5fc6 Mon Sep 17 00:00:00 2001
From: Reinier van der Leer
Date: Thu, 31 Aug 2023 00:14:20 +0200
Subject: [PATCH] Update agbenchmark to v0.0.9

---
 agbenchmark/analyze_reports.py    | 140 +++++++++++++++++++++-----
 agbenchmark/config.json           |   5 +-
 agbenchmark/regression_tests.json |  24 -----
 requirements.txt                  |   2 +-
 4 files changed, 118 insertions(+), 53 deletions(-)
 delete mode 100644 agbenchmark/regression_tests.json

diff --git a/agbenchmark/analyze_reports.py b/agbenchmark/analyze_reports.py
index f46e8aed..f02cbe61 100644
--- a/agbenchmark/analyze_reports.py
+++ b/agbenchmark/analyze_reports.py
@@ -1,50 +1,142 @@
+#!/usr/bin/env python3
+
 import json
+import logging
+import re
+import sys
 from collections import defaultdict
 from pathlib import Path
 
 from tabulate import tabulate
 
+info = "-v" in sys.argv
+debug = "-vv" in sys.argv
+granular = "--granular" in sys.argv
+
+logging.basicConfig(
+    level=logging.DEBUG if debug else logging.INFO if info else logging.WARNING
+)
+logger = logging.getLogger(__name__)
+
 # Get a list of all JSON files in the directory
-json_files = [
-    f for f in (Path(__file__).parent / "reports").iterdir() if f.name.endswith(".json")
+report_files = [
+    report_file
+    for dir in (Path(__file__).parent / "reports").iterdir()
+    if re.match(r"^\d{8}T\d{6}_", dir.name)
+    and (report_file := dir / "report.json").is_file()
 ]
 
-# Create sets to store unique suffixes and test names
-labels = list()
-test_names = list()
+labels = list[str]()
+runs_per_label = defaultdict[str, int](lambda: 0)
+suite_names = list[str]()
+test_names = list[str]()
 
 # Create a dictionary to store grouped success values by suffix and test
-grouped_success_values = defaultdict(list)
+grouped_success_values = defaultdict[str, list[str]](list[str])
 
 # Loop through each JSON file to collect suffixes and success values
-for json_file in sorted(json_files, key=lambda f: f.name.split("_")[1]):
-    if len(json_file.name.split("_")) < 3:
-        label = json_file.name.split("_")[0]
-    else:
-        label = json_file.name.split("_", 2)[2].rsplit(".", 1)[0]
+for report_file in sorted(report_files):
+    with open(report_file) as f:
+        logger.info(f"Loading {report_file}...")
+
+        data = json.load(f)
+        if "tests" in data:
+            test_tree = data["tests"]
+            label = data["agent_git_commit_sha"].rsplit("/", 1)[1][:7]  # commit hash
+        else:
+            # Benchmark run still in progress
+            test_tree = data
+            label = report_file.parent.name.split("_", 1)[1]
+            logger.info(f"Run '{label}' seems to be in progress")
+
+        runs_per_label[label] += 1
+
+        def process_test(test_name: str, test_data: dict):
+            result_group = grouped_success_values[f"{label}|{test_name}"]
+
+            if "tests" in test_data:
+                logger.debug(f"{test_name} is a test suite")
+
+                # Test suite
+                suite_attempted = any(
+                    test["metrics"]["attempted"] for test in test_data["tests"].values()
+                )
+                logger.debug(f"suite_attempted: {suite_attempted}")
+                if not suite_attempted:
+                    return
+
+                if test_name not in test_names:
+                    test_names.append(test_name)
+
+                if test_data["metrics"]["percentage"] == 0:
+                    result_indicator = "❌"
+                else:
+                    highest_difficulty = test_data["metrics"]["highest_difficulty"]
+                    result_indicator = {
+                        "interface": "🔌",
+                        "novice": "🌑",
+                        "basic": "🌒",
+                        "intermediate": "🌓",
+                        "advanced": "🌔",
+                        "hard": "🌕",
+                    }[highest_difficulty]
+
+                logger.debug(f"result group: {result_group}")
+                logger.debug(f"runs_per_label: {runs_per_label[label]}")
+                if len(result_group) + 1 < runs_per_label[label]:
+                    result_group.extend(
+                        ["❔"] * (runs_per_label[label] - len(result_group) - 1)
+                    )
+                result_group.append(result_indicator)
+                logger.debug(f"result group (after): {result_group}")
+
+                if granular:
+                    for test_name, test in test_data["tests"].items():
+                        process_test(test_name, test)
+                return
+
+            test_metrics = test_data["metrics"]
+            result_indicator = "❔"
+
+            if not "attempted" in test_metrics:
+                return
+            elif test_metrics["attempted"]:
+                if test_name not in test_names:
+                    test_names.append(test_name)
+
+                success_value = test_metrics["success"]
+                result_indicator = {True: "✅", False: "❌"}[success_value]
+
+            if len(result_group) + 1 < runs_per_label[label]:
+                result_group.extend(
+                    [" "] * (runs_per_label[label] - len(result_group) - 1)
+                )
+            result_group.append(result_indicator)
+
+        for test_name, suite in test_tree.items():
+            try:
+                process_test(test_name, suite)
+            except KeyError as e:
+                print(f"{test_name}.metrics: {suite['metrics']}")
+                raise
+
     if label not in labels:
         labels.append(label)
 
-    with open(json_file) as f:
-        data = json.load(f)
-        for test_name in data["tests"]:
-            if test_name not in test_names:
-                test_names.append(test_name)
-            success_value = data["tests"][test_name]["metrics"]["success"]
-            grouped_success_values[f"{label}|{test_name}"].append(
-                {True: "✅", False: "❌"}[success_value]
-            )
-
 # Create headers
 headers = ["Test Name"] + list(labels)
 
 # Prepare data for tabulation
-table_data = []
+table_data = list[list[str]]()
 for test_name in test_names:
     row = [test_name]
     for label in labels:
-        success_values = grouped_success_values.get(f"{label}|{test_name}", ["❔"])
-        row.append(" ".join(success_values))
+        results = grouped_success_values.get(f"{label}|{test_name}", ["❔"])
+        if len(results) < runs_per_label[label]:
+            results.extend(["❔"] * (runs_per_label[label] - len(results)))
+        if len(results) > 1 and all(r == "❔" for r in results):
+            results.clear()
+        row.append(" ".join(results))
     table_data.append(row)
 
 # Print tabulated data
diff --git a/agbenchmark/config.json b/agbenchmark/config.json
index 47785864..d3762ac0 100644
--- a/agbenchmark/config.json
+++ b/agbenchmark/config.json
@@ -1,4 +1 @@
-{
-    "workspace": "auto_gpt_workspace",
-    "entry_path": "agbenchmark.benchmarks"
-}
+{"workspace": "auto_gpt_workspace", "entry_path": "agbenchmark.benchmarks"}
\ No newline at end of file
diff --git a/agbenchmark/regression_tests.json b/agbenchmark/regression_tests.json
deleted file mode 100644
index 8d59b1a4..00000000
--- a/agbenchmark/regression_tests.json
+++ /dev/null
@@ -1,24 +0,0 @@
-{
-    "TestBasicCodeGeneration": {
-        "difficulty": "basic",
-        "dependencies": [
-            "TestWriteFile"
-        ],
-        "data_path": "agbenchmark/challenges/code/d3"
-    },
-    "TestBasicMemory": {
-        "difficulty": "basic",
-        "data_path": "agbenchmark/challenges/memory/m1"
-    },
-    "TestReadFile": {
-        "difficulty": "basic",
-        "dependencies": [
-            "TestWriteFile"
-        ],
-        "data_path": "agbenchmark/challenges/interface/read_file"
-    },
-    "TestWriteFile": {
-        "dependencies": [],
-        "data_path": "agbenchmark/challenges/interface/write_file"
-    }
-}
diff --git a/requirements.txt b/requirements.txt
index 5dc87ff6..ba2dc094 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,7 +31,7 @@ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_
 prompt_toolkit>=3.0.38
 pydantic
 inflection
-agbenchmark
+agbenchmark>=0.0.9
 agent-protocol>=0.1.1
 
 # web server