Update agbenchmark to v0.0.9
@@ -1,50 +1,142 @@
#!/usr/bin/env python3

import json
import logging
import re
import sys
from collections import defaultdict
from pathlib import Path

from tabulate import tabulate

info = "-v" in sys.argv
debug = "-vv" in sys.argv
granular = "--granular" in sys.argv

logging.basicConfig(
    level=logging.DEBUG if debug else logging.INFO if info else logging.WARNING
)
logger = logging.getLogger(__name__)

# Get a list of all JSON files in the directory
json_files = [
    f for f in (Path(__file__).parent / "reports").iterdir() if f.name.endswith(".json")
report_files = [
    report_file
    for dir in (Path(__file__).parent / "reports").iterdir()
    if re.match(r"^\d{8}T\d{6}_", dir.name)
    and (report_file := dir / "report.json").is_file()
]

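For clarity, a small illustrative check of the timestamp-prefix filter used above; the folder names here are hypothetical, not taken from the repo:

import re

# Hypothetical names; only the leading YYYYMMDDTHHMMSS_ prefix matters.
assert re.match(r"^\d{8}T\d{6}_", "20230814T201030_full_run")
assert not re.match(r"^\d{8}T\d{6}_", "regression_tests.json")
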
# Create sets to store unique suffixes and test names
labels = list()
test_names = list()
labels = list[str]()
runs_per_label = defaultdict[str, int](lambda: 0)
suite_names = list[str]()
test_names = list[str]()

# Create a dictionary to store grouped success values by suffix and test
grouped_success_values = defaultdict(list)
grouped_success_values = defaultdict[str, list[str]](list[str])

# Loop through each JSON file to collect suffixes and success values
for json_file in sorted(json_files, key=lambda f: f.name.split("_")[1]):
    if len(json_file.name.split("_")) < 3:
        label = json_file.name.split("_")[0]
    else:
        label = json_file.name.split("_", 2)[2].rsplit(".", 1)[0]
for report_file in sorted(report_files):
    with open(report_file) as f:
        logger.info(f"Loading {report_file}...")

        data = json.load(f)
        if "tests" in data:
            test_tree = data["tests"]
            label = data["agent_git_commit_sha"].rsplit("/", 1)[1][:7]  # commit hash
        else:
            # Benchmark run still in progress
            test_tree = data
            label = report_file.parent.name.split("_", 1)[1]
            logger.info(f"Run '{label}' seems to be in progress")

        runs_per_label[label] += 1

        def process_test(test_name: str, test_data: dict):
            result_group = grouped_success_values[f"{label}|{test_name}"]

            if "tests" in test_data:
                logger.debug(f"{test_name} is a test suite")

                # Test suite
                suite_attempted = any(
                    test["metrics"]["attempted"] for test in test_data["tests"].values()
                )
                logger.debug(f"suite_attempted: {suite_attempted}")
                if not suite_attempted:
                    return

                if test_name not in test_names:
                    test_names.append(test_name)

                if test_data["metrics"]["percentage"] == 0:
                    result_indicator = "❌"
                else:
                    highest_difficulty = test_data["metrics"]["highest_difficulty"]
                    result_indicator = {
                        "interface": "🔌",
                        "novice": "🌑",
                        "basic": "🌒",
                        "intermediate": "🌓",
                        "advanced": "🌔",
                        "hard": "🌕",
                    }[highest_difficulty]

                logger.debug(f"result group: {result_group}")
                logger.debug(f"runs_per_label: {runs_per_label[label]}")
                if len(result_group) + 1 < runs_per_label[label]:
                    result_group.extend(
                        ["❔"] * (runs_per_label[label] - len(result_group) - 1)
                    )
                result_group.append(result_indicator)
                logger.debug(f"result group (after): {result_group}")

                if granular:
                    for test_name, test in test_data["tests"].items():
                        process_test(test_name, test)
                return

            test_metrics = test_data["metrics"]
            result_indicator = "❔"

            if not "attempted" in test_metrics:
                return
            elif test_metrics["attempted"]:
                if test_name not in test_names:
                    test_names.append(test_name)

                success_value = test_metrics["success"]
                result_indicator = {True: "✅", False: "❌"}[success_value]

            if len(result_group) + 1 < runs_per_label[label]:
                result_group.extend(
                    [" "] * (runs_per_label[label] - len(result_group) - 1)
                )
            result_group.append(result_indicator)

        for test_name, suite in test_tree.items():
            try:
                process_test(test_name, suite)
            except KeyError as e:
                print(f"{test_name}.metrics: {suite['metrics']}")
                raise

    if label not in labels:
        labels.append(label)

with open(json_file) as f:
|
||||
data = json.load(f)
|
||||
for test_name in data["tests"]:
|
||||
if test_name not in test_names:
|
||||
test_names.append(test_name)
|
||||
success_value = data["tests"][test_name]["metrics"]["success"]
|
||||
grouped_success_values[f"{label}|{test_name}"].append(
|
||||
{True: "✅", False: "❌"}[success_value]
|
||||
)
|
||||
|
||||
# Create headers
headers = ["Test Name"] + list(labels)

# Prepare data for tabulation
table_data = []
table_data = list[list[str]]()
for test_name in test_names:
    row = [test_name]
    for label in labels:
        success_values = grouped_success_values.get(f"{label}|{test_name}", ["❔"])
        row.append(" ".join(success_values))
        results = grouped_success_values.get(f"{label}|{test_name}", ["❔"])
        if len(results) < runs_per_label[label]:
            results.extend(["❔"] * (runs_per_label[label] - len(results)))
        if len(results) > 1 and all(r == "❔" for r in results):
            results.clear()
        row.append(" ".join(results))
    table_data.append(row)

# Print tabulated data
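The hunk ends just after this comment; as a minimal sketch of the step it refers to, the collected rows would be fed to tabulate roughly like this (the tablefmt value is an assumption, not taken from the commit):

# Render the collected rows; "github" table format is assumed here.
print(tabulate(table_data, headers=headers, tablefmt="github"))
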
@@ -1,4 +1 @@
{
  "workspace": "auto_gpt_workspace",
  "entry_path": "agbenchmark.benchmarks"
}
{"workspace": "auto_gpt_workspace", "entry_path": "agbenchmark.benchmarks"}

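The change above only reflows the JSON; a tiny illustrative check (not part of the commit) that both forms decode to the same mapping:

import json

multi_line = '{\n  "workspace": "auto_gpt_workspace",\n  "entry_path": "agbenchmark.benchmarks"\n}'
one_line = '{"workspace": "auto_gpt_workspace", "entry_path": "agbenchmark.benchmarks"}'
assert json.loads(multi_line) == json.loads(one_line)
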
@@ -1,24 +0,0 @@
{
  "TestBasicCodeGeneration": {
    "difficulty": "basic",
    "dependencies": [
      "TestWriteFile"
    ],
    "data_path": "agbenchmark/challenges/code/d3"
  },
  "TestBasicMemory": {
    "difficulty": "basic",
    "data_path": "agbenchmark/challenges/memory/m1"
  },
  "TestReadFile": {
    "difficulty": "basic",
    "dependencies": [
      "TestWriteFile"
    ],
    "data_path": "agbenchmark/challenges/interface/read_file"
  },
  "TestWriteFile": {
    "dependencies": [],
    "data_path": "agbenchmark/challenges/interface/write_file"
  }
}

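The removed file above encodes per-test dependencies; as an aside, a hypothetical sketch (not agbenchmark's own logic) of turning such a map into a valid run order with the standard-library graphlib module:

from graphlib import TopologicalSorter

# Dependency map shaped like the removed JSON; the names are copied from it,
# the ordering logic itself is purely illustrative.
deps = {
    "TestBasicCodeGeneration": ["TestWriteFile"],
    "TestBasicMemory": [],
    "TestReadFile": ["TestWriteFile"],
    "TestWriteFile": [],
}
# TestWriteFile comes out before TestReadFile and TestBasicCodeGeneration.
print(list(TopologicalSorter(deps).static_order()))
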
@@ -31,7 +31,7 @@ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_
prompt_toolkit>=3.0.38
pydantic
inflection
agbenchmark
agbenchmark>=0.0.9
agent-protocol>=0.1.1

# web server

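To confirm locally that the bumped benchmark dependencies resolved, a quick illustrative check with the standard-library importlib.metadata (package names as in the requirements above):

from importlib.metadata import version

print(version("agbenchmark"))     # expect >= 0.0.9 after this change
print(version("agent-protocol"))  # expect >= 0.1.1
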