Update agbenchmark to v0.0.9

Reinier van der Leer
2023-08-31 00:14:20 +02:00
parent d2cc22c698
commit a4ef53c55c
4 changed files with 118 additions and 53 deletions

View File

@@ -1,50 +1,142 @@
 #!/usr/bin/env python3
 import json
+import logging
+import re
+import sys
 from collections import defaultdict
 from pathlib import Path

 from tabulate import tabulate

+info = "-v" in sys.argv
+debug = "-vv" in sys.argv
+granular = "--granular" in sys.argv
+
+logging.basicConfig(
+    level=logging.DEBUG if debug else logging.INFO if info else logging.WARNING
+)
+logger = logging.getLogger(__name__)
+
 # Get a list of all JSON files in the directory
-json_files = [
-    f for f in (Path(__file__).parent / "reports").iterdir() if f.name.endswith(".json")
+report_files = [
+    report_file
+    for dir in (Path(__file__).parent / "reports").iterdir()
+    if re.match(r"^\d{8}T\d{6}_", dir.name)
+    and (report_file := dir / "report.json").is_file()
 ]

 # Create sets to store unique suffixes and test names
-labels = list()
-test_names = list()
+labels = list[str]()
+runs_per_label = defaultdict[str, int](lambda: 0)
+suite_names = list[str]()
+test_names = list[str]()

 # Create a dictionary to store grouped success values by suffix and test
-grouped_success_values = defaultdict(list)
+grouped_success_values = defaultdict[str, list[str]](list[str])

 # Loop through each JSON file to collect suffixes and success values
-for json_file in sorted(json_files, key=lambda f: f.name.split("_")[1]):
-    if len(json_file.name.split("_")) < 3:
-        label = json_file.name.split("_")[0]
-    else:
-        label = json_file.name.split("_", 2)[2].rsplit(".", 1)[0]
+for report_file in sorted(report_files):
+    with open(report_file) as f:
+        logger.info(f"Loading {report_file}...")
+        data = json.load(f)
+        if "tests" in data:
+            test_tree = data["tests"]
+            label = data["agent_git_commit_sha"].rsplit("/", 1)[1][:7]  # commit hash
+        else:
+            # Benchmark run still in progress
+            test_tree = data
+            label = report_file.parent.name.split("_", 1)[1]
+            logger.info(f"Run '{label}' seems to be in progress")
+
+    runs_per_label[label] += 1
+
+    def process_test(test_name: str, test_data: dict):
+        result_group = grouped_success_values[f"{label}|{test_name}"]
+
+        if "tests" in test_data:
+            logger.debug(f"{test_name} is a test suite")
+
+            # Test suite
+            suite_attempted = any(
+                test["metrics"]["attempted"] for test in test_data["tests"].values()
+            )
+            logger.debug(f"suite_attempted: {suite_attempted}")
+            if not suite_attempted:
+                return
+
+            if test_name not in test_names:
+                test_names.append(test_name)
+
+            if test_data["metrics"]["percentage"] == 0:
+                result_indicator = "❌"
+            else:
+                highest_difficulty = test_data["metrics"]["highest_difficulty"]
+                result_indicator = {
+                    "interface": "🔌",
+                    "novice": "🌑",
+                    "basic": "🌒",
+                    "intermediate": "🌓",
+                    "advanced": "🌔",
+                    "hard": "🌕",
+                }[highest_difficulty]
+
+            logger.debug(f"result group: {result_group}")
+            logger.debug(f"runs_per_label: {runs_per_label[label]}")
+            if len(result_group) + 1 < runs_per_label[label]:
+                result_group.extend(
+                    [""] * (runs_per_label[label] - len(result_group) - 1)
+                )
+            result_group.append(result_indicator)
+            logger.debug(f"result group (after): {result_group}")
+
+            if granular:
+                for test_name, test in test_data["tests"].items():
+                    process_test(test_name, test)
+            return
+
+        test_metrics = test_data["metrics"]
+        result_indicator = "❔"
+        if "attempted" not in test_metrics:
+            return
+        elif test_metrics["attempted"]:
+            if test_name not in test_names:
+                test_names.append(test_name)
+            success_value = test_metrics["success"]
+            result_indicator = {True: "✅", False: "❌"}[success_value]
+
+        if len(result_group) + 1 < runs_per_label[label]:
+            result_group.extend(
+                [" "] * (runs_per_label[label] - len(result_group) - 1)
+            )
+        result_group.append(result_indicator)
+
+    for test_name, suite in test_tree.items():
+        try:
+            process_test(test_name, suite)
+        except KeyError as e:
+            print(f"{test_name}.metrics: {suite['metrics']}")
+            raise

     if label not in labels:
         labels.append(label)

-    with open(json_file) as f:
-        data = json.load(f)
-        for test_name in data["tests"]:
-            if test_name not in test_names:
-                test_names.append(test_name)
-            success_value = data["tests"][test_name]["metrics"]["success"]
-            grouped_success_values[f"{label}|{test_name}"].append(
-                {True: "✅", False: "❌"}[success_value]
-            )
-
 # Create headers
 headers = ["Test Name"] + list(labels)

 # Prepare data for tabulation
-table_data = []
+table_data = list[list[str]]()
 for test_name in test_names:
     row = [test_name]
     for label in labels:
-        success_values = grouped_success_values.get(f"{label}|{test_name}", [""])
-        row.append(" ".join(success_values))
+        results = grouped_success_values.get(f"{label}|{test_name}", [""])
+        if len(results) < runs_per_label[label]:
+            results.extend([""] * (runs_per_label[label] - len(results)))
+        if len(results) > 1 and all(r == "" for r in results):
+            results.clear()
+        row.append(" ".join(results))
     table_data.append(row)

 # Print tabulated data
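The rewritten script no longer globs flat *.json files from reports/; it collects one report.json per timestamped run directory, as produced by agbenchmark 0.0.9. A minimal sketch of what the ^\d{8}T\d{6}_ directory filter accepts, using hypothetical directory names for illustration only:

import re

# Same pattern the updated script uses to recognize benchmark run directories.
RUN_DIR_PATTERN = re.compile(r"^\d{8}T\d{6}_")

# Hypothetical names, not taken from any real run:
assert RUN_DIR_PATTERN.match("20230831T001420_somelabel")       # new-style run dir: picked up
assert not RUN_DIR_PATTERN.match("file_to_check_results.json")  # old-style flat report: ignored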

View File

@@ -1,4 +1 @@
-{
-  "workspace": "auto_gpt_workspace",
-  "entry_path": "agbenchmark.benchmarks"
-}
+{"workspace": "auto_gpt_workspace", "entry_path": "agbenchmark.benchmarks"}

View File

@@ -1,24 +0,0 @@
{
"TestBasicCodeGeneration": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"data_path": "agbenchmark/challenges/code/d3"
},
"TestBasicMemory": {
"difficulty": "basic",
"data_path": "agbenchmark/challenges/memory/m1"
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"data_path": "agbenchmark/challenges/interface/read_file"
},
"TestWriteFile": {
"dependencies": [],
"data_path": "agbenchmark/challenges/interface/write_file"
}
}
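The deleted suite definition encoded inter-test dependencies (TestReadFile and TestBasicCodeGeneration both depend on TestWriteFile). As a rough sketch of the run order such a config implies (not agbenchmark's actual scheduler), a topological sort recovers a valid ordering:

from graphlib import TopologicalSorter

# Dependency mapping taken from the deleted suite file above.
suite_dependencies = {
    "TestBasicCodeGeneration": ["TestWriteFile"],
    "TestBasicMemory": [],
    "TestReadFile": ["TestWriteFile"],
    "TestWriteFile": [],
}

# Yields every test after its dependencies, e.g.:
# ['TestBasicMemory', 'TestWriteFile', 'TestBasicCodeGeneration', 'TestReadFile']
print(list(TopologicalSorter(suite_dependencies).static_order()))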

View File

@@ -31,7 +31,7 @@ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_
 prompt_toolkit>=3.0.38
 pydantic
 inflection
-agbenchmark
+agbenchmark>=0.0.9
 agent-protocol>=0.1.1

 # web server
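With the requirement tightened to agbenchmark>=0.0.9, a quick sanity check of the resolved version (a sketch; assumes the package was installed from these requirements):

from importlib.metadata import version

# Should print 0.0.9 or later after reinstalling requirements.
print(version("agbenchmark"))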