Mirror of https://github.com/aljazceru/Auto-GPT.git (synced 2026-01-02 05:44:32 +01:00)
fixing backslashes, adding basic metrics (#89)
@@ -3,7 +3,9 @@ import os
 import sys
 import time
 from datetime import datetime
-from typing import Any, Dict, Union
+from typing import Any, Dict

+from agbenchmark.utils import get_highest_success_difficulty


 class ReportManager:

@@ -23,7 +25,6 @@ class ReportManager:
             if file_content:  # if file is not empty, load the json
                 data = json.loads(file_content)
                 self.tests = {k: data[k] for k in sorted(data)}
-                data = self.replace_backslash(data)
             else:  # if file is empty, assign an empty dictionary
                 self.tests = {}
         except FileNotFoundError:

@@ -36,8 +37,9 @@ class ReportManager:
         with open(self.filename, "w") as f:
             json.dump(self.tests, f, indent=4)

-    def add_test(self, test_name: str, test_details: dict) -> None:
+    def add_test(self, test_name: str, test_details: dict | list) -> None:
         self.tests[test_name] = test_details

         self.save()

     def remove_test(self, test_name: str) -> None:

@@ -50,19 +52,12 @@ class ReportManager:
         self.tests = {
             "command": command.split(os.sep)[-1],
             "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
-            "time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds",
+            "metrics": {
+                "run_time": str(round(time.time() - self.start_time, 2)) + " seconds",
+                "highest_difficulty": get_highest_success_difficulty(self.tests),
+            },
             "tests": self.tests,
             "config": config,
         }

         self.save()

-    def replace_backslash(self, value: str) -> Union[str, list[str], dict]:
-        if isinstance(value, str):
-            return value.replace("\\\\", "/")  # escape \ with \\
-        elif isinstance(value, list):
-            return [self.replace_backslash(i) for i in value]
-        elif isinstance(value, dict):
-            return {k: self.replace_backslash(v) for k, v in value.items()}
-        else:
-            return value
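Note: with this change the report header nests timing under a top-level "metrics" key in place of the old "time_elapsed" field, and records the hardest challenge the agent cleared. The new dict | list annotation on add_test uses PEP 604 union syntax, so it requires Python 3.10 or newer. A sketch of the resulting header, with values copied from the sample run later in this diff:

{
    "command": "agbenchmark start --mock",
    "completion_time": "2023-07-11-21:09",
    "metrics": {
        "run_time": "0.96 seconds",
        "highest_difficulty": "advanced: 5"
    },
    ...
}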
@@ -23,26 +23,10 @@ def run_agent(
     """Calling to get a response"""

     if MOCK_FLAG:
         print("ITS A MOCK TEST", challenge_location)
         copy_artifacts_into_workspace(
             config["workspace"], "artifacts_out", challenge_location
         )
     else:
-        timeout = config["cutoff"]
-        print(
-            f"Running Python function '{config['entry_path']}' with timeout {timeout}"
-        )
         command = [sys.executable, "-m", config["entry_path"], str(task)]
         process = subprocess.Popen(
             command,
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             universal_newlines=True,
             cwd=os.getcwd(),
         )

         start_time = time.time()

+        print(
+            f"Running Python function '{config['entry_path']}' with timeout {config['cutoff']}"
+        )
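Note: the command uses python -m, which matches the config change at the bottom of this diff ("entry_path" goes from the file path "agbenchmark/benchmarks.py" to the dotted module path "agbenchmark.benchmarks"). A minimal sketch of the equivalent call; the task string here is a made-up placeholder:

import subprocess
import sys

# Runs the benchmark entry point as a module, so it resolves via sys.path
# rather than via a file path relative to the current directory.
subprocess.run([sys.executable, "-m", "agbenchmark.benchmarks", "Write 'hello' to a file"])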
@@ -13,6 +13,6 @@
     "info": {
         "difficulty": "basic",
         "description": "Tests ability for the agent to debug python code with a simple typo in it.",
-        "side_effects": ["tests if there is in fact an LLM attached"]
+        "side_effects": []
     }
 }

@@ -11,8 +11,8 @@
         "type": "execute_python_code"
     },
     "info": {
-        "difficulty": "medium",
+        "difficulty": "novice",
         "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
-        "side_effects": ["tests if there is in fact an LLM attached"]
+        "side_effects": []
     }
 }

@@ -11,7 +11,7 @@
         "type": "custom_python"
     },
     "info": {
-        "difficulty": "medium",
+        "difficulty": "advanced",
         "description": "Tests ability for the agent to build a simple web server locally",
         "side_effects": []
     }
@@ -1,15 +1,52 @@
 import json
+from enum import Enum
 from pathlib import Path
 from typing import List, Optional

-from pydantic import BaseModel
+from pydantic import BaseModel, validator


+class DifficultyLevel(Enum):
+    interface = "interface"
+    basic = "basic"
+    novice = "novice"
+    intermediate = "intermediate"
+    advanced = "advanced"
+    expert = "expert"
+    human = "human"


+# map from enum to difficulty level (numeric)
+DIFFICULTY_MAP = {
+    DifficultyLevel.interface: 1,
+    DifficultyLevel.basic: 2,
+    DifficultyLevel.novice: 3,
+    DifficultyLevel.intermediate: 4,
+    DifficultyLevel.advanced: 5,
+    DifficultyLevel.expert: 6,
+    DifficultyLevel.human: 7,
+}


 class Info(BaseModel):
-    difficulty: str
+    difficulty: DifficultyLevel
     description: str
     side_effects: List[str]

+    @validator("difficulty", pre=True)
+    def difficulty_to_enum(cls: "Info", v: str | DifficultyLevel) -> DifficultyLevel:
+        """Convert a string to an instance of DifficultyLevel."""
+        if isinstance(v, DifficultyLevel):
+            return v
+
+        if isinstance(v, str):
+            try:
+                return DifficultyLevel(v.lower())
+            except ValueError:
+                pass
+
+        raise ValueError(f"Cannot convert {v} to DifficultyLevel.")


 class Ground(BaseModel):
     answer: str
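Note: because the validator is registered with pre=True, it runs before pydantic's own field validation, so the challenge data.json files can keep plain strings. A minimal usage sketch (pydantic v1 style; the values are hypothetical):

from agbenchmark.challenges.define_task_types import DifficultyLevel, Info

info = Info(
    difficulty="Novice",  # lower-cased and coerced by the validator
    description="example challenge",
    side_effects=[],
)
assert info.difficulty is DifficultyLevel.novice

# "medium" is no longer a valid level (see the data.json renames above),
# so this would raise a pydantic ValidationError wrapping the ValueError:
# Info(difficulty="medium", description="x", side_effects=[])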
@@ -11,7 +11,7 @@
     },
     "info": {
         "description": "This reads the file quickly",
-        "difficulty": "basic",
+        "difficulty": "interface",
         "side_effects": [""]
     }
 }

@@ -11,7 +11,7 @@
         "type": "file"
     },
     "info": {
-        "difficulty": "basic",
+        "difficulty": "interface",
         "description": "Tests if an llm can search",
         "side_effects": [""]
     }

@@ -11,7 +11,7 @@
         "type": "file"
     },
     "info": {
-        "difficulty": "basic",
+        "difficulty": "interface",
         "description": "Tests the writing to file",
         "side_effects": ["tests if there is in fact an LLM attached"]
     }
@@ -10,13 +10,9 @@
         "files": [".txt"],
         "type": "file"
     },
-    "mock": {
-        "mock_func": "basic_memory_mock",
-        "mock_task": "Follow the instructions in the instructions_1.txt file"
-    },
     "info": {
         "difficulty": "basic",
         "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
-        "side_effects": ["tests if there is in fact an LLM attached"]
+        "side_effects": []
     }
 }

@@ -11,8 +11,8 @@
         "type": "file"
     },
     "info": {
-        "difficulty": "basic",
+        "difficulty": "novice",
         "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.",
-        "side_effects": ["tests if there is in fact an LLM attached"]
+        "side_effects": []
     }
 }

@@ -11,8 +11,8 @@
         "type": "file"
     },
     "info": {
-        "difficulty": "medium",
+        "difficulty": "intermediate",
         "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
-        "side_effects": ["tests if there is in fact an LLM attached"]
+        "side_effects": []
     }
 }

@@ -16,8 +16,8 @@
         "type": "file"
     },
     "info": {
-        "difficulty": "medium",
+        "difficulty": "advanced",
         "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
-        "side_effects": ["tests if there is in fact an LLM attached"]
+        "side_effects": []
     }
 }

@@ -13,6 +13,6 @@
     "info": {
         "difficulty": "basic",
         "description": "Tests ability to retrieve information from a website.",
-        "side_effects": ["tests if there is in fact an LLM attached"]
+        "side_effects": []
     }
 }

@@ -11,7 +11,7 @@
         "type": "file"
     },
     "info": {
-        "difficulty": "basic",
+        "difficulty": "novice",
         "description": "Tests ability to retrieve information.",
         "side_effects": ["tests if there is in fact an LLM attached"]
     }

@@ -27,7 +27,7 @@
         "type": "file"
     },
     "info": {
-        "difficulty": "basic",
+        "difficulty": "intermediate",
         "description": "Tests ability to retrieve information.",
         "side_effects": ["tests if there is in fact an LLM attached"]
     }
@@ -9,15 +9,10 @@ from pathlib import Path
 from typing import Any, Dict

 import pytest
-from dotenv import load_dotenv

 from agbenchmark.challenge import Challenge
 from agbenchmark.start_benchmark import CURRENT_DIRECTORY

-load_dotenv()
-
-IMPROVE = os.getenv("IMPROVE", "False")
-
+from agbenchmark.utils import replace_backslash

 json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)

@@ -36,7 +31,11 @@ def get_test_path(json_file: str) -> str:
     # Create the path from "agbenchmark" onwards
     challenge_location = Path(*path.parts[agbenchmark_index:])

-    return str(challenge_location)
+    formatted_location = replace_backslash(str(challenge_location))
+    if isinstance(formatted_location, str):
+        return formatted_location
+    else:
+        return str(challenge_location)


 def generate_tests() -> None:

@@ -68,7 +67,7 @@ def generate_tests() -> None:
     )
     sys.path.append(str(custom_python_location))

-    for (module_loader, name, ispkg) in pkgutil.iter_modules(
+    for module_loader, name, ispkg in pkgutil.iter_modules(
         [str(custom_python_location)]
     ):
         module = importlib.import_module(name)
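Note: routing the challenge location through replace_backslash keeps report keys identical across operating systems, since Path renders with backslashes on Windows. A small illustration against the implementation added in utils.py below:

from agbenchmark.utils import replace_backslash

replace_backslash("agbenchmark\\challenges\\memory\\m1")
# -> "agbenchmark/challenges/memory/m1"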
@@ -1,6 +1,8 @@
 import json
 import os
 import shutil
+import sys
+import time
 from pathlib import Path  # noqa
 from typing import Any, Dict, Generator

@@ -13,6 +15,7 @@ from agbenchmark.start_benchmark import (
     REGRESSION_TESTS_PATH,
     get_regression_data,
 )
+from agbenchmark.utils import calculate_success_percentage


 def resolve_workspace(config: Dict[str, Any]) -> str:

@@ -107,9 +110,29 @@ def challenge_data(request: Any) -> None:
     return request.param


 @pytest.fixture(autouse=True, scope="session")
 def mock(request: Any) -> None:
     return request.config.getoption("--mock")


+@pytest.fixture(autouse=True, scope="function")
+def timer(request: Any) -> Any:
+    start_time = time.time()
+    yield
+    run_time = time.time() - start_time
+    request.node.user_properties.append(("run_time", run_time))


 # tests that consistently pass are considered regression tests
 regression_manager = ReportManager(REGRESSION_TESTS_PATH)

 # user facing reporting information
 info_manager = ReportManager(INFO_TESTS_PATH)

+INTERNAL_LOGS = Path(__file__).resolve().parent  # agbenchmark/conftest.py
+# internal db step in replacement track pass/fail rate
+internal_info = ReportManager(str(INTERNAL_LOGS / "internal_info.json"))


 def pytest_runtest_makereport(item: Any, call: Any) -> None:
     if call.when == "call":

@@ -122,23 +145,66 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
         )
         # Extract the challenge_location from the class
         challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
+        test_name = item.nodeid.split("::")[1]
+        item.test_name = test_name

         test_details = {
             "difficulty": difficulty,
             "dependencies": dependencies,
-            "test": challenge_location,
+            "data_path": challenge_location,
         }

         print("pytest_runtest_makereport", test_details)
-        if call.excinfo is None:
-            regression_manager.add_test(item.nodeid.split("::")[1], test_details)
-            test_details["success"] = True
-        else:
-            regression_manager.remove_test(item.nodeid.split("::")[1])
-            test_details["success"] = False
-            test_details["fail_reason"] = str(call.excinfo.value)
-        info_manager.add_test(item.nodeid.split("::")[1], test_details)
+        info_details: Any = {
+            "data_path": challenge_location,
+            "is_regression": False,
+            "metrics": {
+                "difficulty": difficulty,
+                "success": False,
+            },
+        }
+
+        mock = "--mock" in sys.argv  # Check if --mock is in sys.argv
+
+        if call.excinfo is None:
+            info_details["metrics"]["success"] = True
+        else:
+            if not mock:  # don't remove if it's a mock test
+                regression_manager.remove_test(test_name)
+            info_details["metrics"]["fail_reason"] = str(call.excinfo.value)
+
+        prev_test_results: list[bool] = []
+
+        if not mock:
+            # only add if it's an actual test
+            prev_test_results = internal_info.tests.get(test_name, [])
+            prev_test_results.append(info_details["metrics"]["success"])
+            internal_info.add_test(test_name, prev_test_results)
+
+        # can calculate success rate regardless of mock
+        info_details["metrics"]["success_%"] = calculate_success_percentage(
+            prev_test_results
+        )
+
+        if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]:
+            # if the last 3 tests were successful, add to the regression tests
+            info_details["is_regression"] = True
+            regression_manager.add_test(test_name, test_details)
+
+        # user facing reporting
+        item.info_details = info_details
+    if call.when == "teardown":
+        run_time = dict(item.user_properties).get("run_time")
+
+        info_details = getattr(item, "info_details", {})
+        test_name = getattr(item, "test_name", "")
+
+        if info_details and test_name:
+            if run_time:
+                info_details["metrics"][
+                    "run_time"
+                ] = f"{str(round(run_time, 3))} seconds"
+
+            info_manager.add_test(test_name, info_details)

@@ -146,6 +212,7 @@ def pytest_sessionfinish(session: Any) -> None:
     with open(CONFIG_PATH, "r") as f:
         config = json.load(f)

+    internal_info.save()
     info_manager.end_info_report(config)
     regression_manager.save()
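Note: the timer fixture stores run_time on request.node.user_properties, and the teardown branch of pytest_runtest_makereport reads it back with dict(item.user_properties).get("run_time"). A test is only promoted into the regression suite after three consecutive non-mock successes recorded in internal_info.json; a small sketch of that check:

prev_test_results = [False, True, True, True]  # hypothetical history from internal_info.json
if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]:
    print("promoted to regression tests")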
agbenchmark/internal_info.json (new file, 67 lines)
@@ -0,0 +1,67 @@
{
    "TestBasicMemory": [
        true,
        true,
        true
    ],
    "TestBasicRetrieval": [
        true,
        true,
        true
    ],
    "TestCreateSimpleWebServer": [
        false,
        false,
        false
    ],
    "TestDebugSimpleTypoWithGuidance": [
        false,
        false,
        false
    ],
    "TestDebugSimpleTypoWithoutGuidance": [
        false,
        false,
        false
    ],
    "TestReadFile": [
        true,
        true,
        true
    ],
    "TestRememberMultipleIds": [
        true,
        true,
        true
    ],
    "TestRememberMultipleIdsWithNoise": [
        true,
        true,
        true
    ],
    "TestRememberMultiplePhrasesWithNoise": [
        true,
        true,
        true
    ],
    "TestRetrieval2": [
        true,
        true,
        true
    ],
    "TestRetrieval3": [
        true,
        true,
        true
    ],
    "TestSearch": [
        true,
        true,
        true
    ],
    "TestWriteFile": [
        true,
        true,
        true
    ]
}
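Note: this history file is the reason add_test now accepts dict | list: each key maps to a plain list of per-run pass/fail booleans that the makereport hook appends to, e.g.:

internal_info.add_test("TestWriteFile", [True, True, True])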
@@ -1,20 +1,11 @@
 {
-    "TestReadFile": {
-        "difficulty": "basic",
-        "dependencies": [
-            "TestWriteFile"
-        ],
-        "test": "agbenchmark/challenges/interface/read_file",
-        "success": true
-    },
     "TestBasicMemory": {
         "difficulty": "basic",
         "dependencies": [
             "TestReadFile",
             "TestWriteFile"
         ],
-        "test": "agbenchmark/challenges/memory/m1",
-        "success": true
+        "data_path": "agbenchmark/challenges/memory/m1"
     },
     "TestBasicRetrieval": {
         "difficulty": "basic",

@@ -22,78 +13,60 @@
             "TestWriteFile",
             "TestSearch"
         ],
-        "test": "agbenchmark/challenges/retrieval/r1",
-        "success": true
+        "data_path": "agbenchmark/challenges/retrieval/r1"
     },
+    "TestReadFile": {
+        "difficulty": "basic",
+        "dependencies": [
+            "TestWriteFile"
+        ],
+        "data_path": "agbenchmark/challenges/interface/read_file"
+    },
     "TestRememberMultipleIds": {
         "difficulty": "basic",
         "dependencies": [
             "TestBasicMemory"
         ],
-        "test": "agbenchmark/challenges/memory/m2",
-        "success": true
-    },
-    "TestRetrieval2": {
-        "difficulty": "basic",
-        "dependencies": [
-            "TestBasicRetrieval"
-        ],
-        "test": "agbenchmark/challenges/retrieval/r2",
-        "success": true
+        "data_path": "agbenchmark/challenges/memory/m2"
     },
     "TestRememberMultipleIdsWithNoise": {
         "difficulty": "medium",
         "dependencies": [
             "TestRememberMultipleIds"
         ],
-        "test": "agbenchmark/challenges/memory/m3",
-        "success": true
-    },
-    "TestRetrieval3": {
-        "difficulty": "basic",
-        "dependencies": [
-            "TestRetrieval2"
-        ],
-        "test": "agbenchmark/challenges/retrieval/r3",
-        "success": true
+        "data_path": "agbenchmark/challenges/memory/m3"
     },
     "TestRememberMultiplePhrasesWithNoise": {
         "difficulty": "medium",
         "dependencies": [
             "TestRememberMultipleIdsWithNoise"
         ],
-        "test": "agbenchmark/challenges/memory/m4",
-        "success": true
+        "data_path": "agbenchmark/challenges/memory/m4"
     },
+    "TestRetrieval2": {
+        "difficulty": "basic",
+        "dependencies": [
+            "TestBasicRetrieval"
+        ],
+        "data_path": "agbenchmark/challenges/retrieval/r2"
+    },
+    "TestRetrieval3": {
+        "difficulty": "basic",
+        "dependencies": [
+            "TestRetrieval2"
+        ],
+        "data_path": "agbenchmark/challenges/retrieval/r3"
+    },
     "TestSearch": {
         "difficulty": "basic",
         "dependencies": [
             "TestWriteFile"
         ],
-        "test": "agbenchmark/challenges/interface/search",
-        "success": true
+        "data_path": "agbenchmark/challenges/interface/search"
     },
     "TestWriteFile": {
         "difficulty": "basic",
         "dependencies": [],
-        "test": "agbenchmark/challenges/interface/write_file",
-        "success": true
-    },
-    "TestDebugSimpleTypoWithGuidance": {
-        "difficulty": "basic",
-        "dependencies": [
-            "TestReadFile",
-            "TestWriteFile"
-        ],
-        "test": "agbenchmark/challenges/code/d1",
-        "success": true
-    },
-    "TestDebugSimpleTypoWithoutGuidance": {
-        "difficulty": "medium",
-        "dependencies": [
-            "TestDebugSimpleTypoWithGuidance"
-        ],
-        "test": "agbenchmark/challenges/code/d2",
-        "success": true
+        "data_path": "agbenchmark/challenges/interface/write_file"
     }
 }
@@ -1,109 +1,148 @@
 {
     "command": "agbenchmark start --mock",
-    "completion_time": "2023-07-10-21:19",
-    "time_elapsed": "8.75 seconds",
+    "completion_time": "2023-07-11-21:09",
+    "metrics": {
+        "run_time": "0.96 seconds",
+        "highest_difficulty": "advanced: 5"
+    },
     "tests": {
         "TestWriteFile": {
-            "difficulty": "basic",
-            "dependencies": [],
-            "test": "agbenchmark/challenges/interface/write_file",
-            "success": true
+            "data_path": "agbenchmark/challenges/interface/write_file",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "interface",
+                "success": true,
+                "success_%": 0,
+                "run_time": "0.008 seconds"
+            }
         },
         "TestReadFile": {
-            "difficulty": "basic",
-            "dependencies": [
-                "TestWriteFile"
-            ],
-            "test": "agbenchmark/challenges/interface/read_file",
-            "success": true
+            "data_path": "agbenchmark/challenges/interface/read_file",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "interface",
+                "success": true,
+                "success_%": 0,
+                "run_time": "0.005 seconds"
+            }
         },
         "TestSearch": {
-            "difficulty": "basic",
-            "dependencies": [
-                "TestWriteFile"
-            ],
-            "test": "agbenchmark/challenges/interface/search",
-            "success": true
+            "data_path": "agbenchmark/challenges/interface/search",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "interface",
+                "success": true,
+                "success_%": 0,
+                "run_time": "0.006 seconds"
+            }
         },
         "TestDebugSimpleTypoWithGuidance": {
-            "difficulty": "basic",
-            "dependencies": [
-                "TestReadFile",
-                "TestWriteFile"
-            ],
-            "test": "agbenchmark/challenges/code/d1",
-            "success": true
+            "data_path": "agbenchmark/challenges/code/d1",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "basic",
+                "success": false,
+                "fail_reason": "assert 1 in [0.0]",
+                "success_%": 0,
+                "run_time": "0.489 seconds"
+            }
         },
         "TestBasicMemory": {
-            "difficulty": "basic",
-            "dependencies": [
-                "TestReadFile",
-                "TestWriteFile"
-            ],
-            "test": "agbenchmark/challenges/memory/m1",
-            "success": true
+            "data_path": "agbenchmark/challenges/memory/m1",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "basic",
+                "success": true,
+                "success_%": 0,
+                "run_time": "0.02 seconds"
+            }
         },
         "TestBasicRetrieval": {
-            "difficulty": "basic",
-            "dependencies": [
-                "TestWriteFile",
-                "TestSearch"
-            ],
-            "test": "agbenchmark/challenges/retrieval/r1",
-            "success": true
+            "data_path": "agbenchmark/challenges/retrieval/r1",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "basic",
+                "success": true,
+                "success_%": 0,
+                "run_time": "0.01 seconds"
+            }
         },
         "TestDebugSimpleTypoWithoutGuidance": {
-            "difficulty": "medium",
-            "dependencies": [
-                "TestDebugSimpleTypoWithGuidance"
-            ],
-            "test": "agbenchmark/challenges/code/d2",
-            "success": true
+            "data_path": "agbenchmark/challenges/code/d2",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "novice",
+                "success": false,
+                "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
+                "success_%": 0,
+                "run_time": "0.001 seconds"
+            }
         },
+        "TestCreateSimpleWebServer": {
+            "data_path": "agbenchmark/challenges/code/d3",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "advanced",
+                "success": false,
+                "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
+                "success_%": 0,
+                "run_time": "0.001 seconds"
+            }
+        },
         "TestRememberMultipleIds": {
-            "difficulty": "basic",
-            "dependencies": [
-                "TestBasicMemory"
-            ],
-            "test": "agbenchmark/challenges/memory/m2",
-            "success": true
+            "data_path": "agbenchmark/challenges/memory/m2",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "novice",
+                "success": true,
+                "success_%": 0,
+                "run_time": "0.018 seconds"
+            }
         },
         "TestRetrieval2": {
-            "difficulty": "basic",
-            "dependencies": [
-                "TestBasicRetrieval"
-            ],
-            "test": "agbenchmark/challenges/retrieval/r2",
-            "success": true
+            "data_path": "agbenchmark/challenges/retrieval/r2",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "novice",
+                "success": true,
+                "success_%": 0,
+                "run_time": "0.009 seconds"
+            }
         },
         "TestRememberMultipleIdsWithNoise": {
-            "difficulty": "medium",
-            "dependencies": [
-                "TestRememberMultipleIds"
-            ],
-            "test": "agbenchmark/challenges/memory/m3",
-            "success": true
+            "data_path": "agbenchmark/challenges/memory/m3",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "intermediate",
+                "success": true,
+                "success_%": 0,
+                "run_time": "0.022 seconds"
+            }
         },
         "TestRetrieval3": {
-            "difficulty": "basic",
-            "dependencies": [
-                "TestRetrieval2"
-            ],
-            "test": "agbenchmark/challenges/retrieval/r3",
-            "success": true
+            "data_path": "agbenchmark/challenges/retrieval/r3",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "intermediate",
+                "success": true,
+                "success_%": 0,
+                "run_time": "0.01 seconds"
+            }
         },
         "TestRememberMultiplePhrasesWithNoise": {
-            "difficulty": "medium",
-            "dependencies": [
-                "TestRememberMultipleIdsWithNoise"
-            ],
-            "test": "agbenchmark/challenges/memory/m4",
-            "success": true
+            "data_path": "agbenchmark/challenges/memory/m4",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "advanced",
+                "success": true,
+                "success_%": 0,
+                "run_time": "0.021 seconds"
+            }
         }
     },
     "config": {
         "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-        "entry_path": "agbenchmark/benchmarks.py",
+        "entry_path": "agbenchmark.benchmarks",
         "cutoff": 60
     }
 }
@@ -1,6 +1,10 @@
 # radio charts, logs, helper functions for tests, anything else relevant.
 import glob
+import re
 from pathlib import Path
+from typing import Any

+from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyLevel


 def calculate_info_test_path(benchmarks_folder_path: Path) -> str:

@@ -15,3 +19,54 @@ def calculate_info_test_path(benchmarks_folder_path: Path) -> str:
     run_name = f"{file_count + 1}.json"
     new_file_path = INFO_TESTS_PATH / run_name
     return str(new_file_path)


+def replace_backslash(value: Any) -> Any:
+    if isinstance(value, str):
+        return re.sub(
+            r"\\+", "/", value
+        )  # replace one or more backslashes with a forward slash
+    elif isinstance(value, list):
+        return [replace_backslash(i) for i in value]
+    elif isinstance(value, dict):
+        return {k: replace_backslash(v) for k, v in value.items()}
+    else:
+        return value


+def calculate_success_percentage(results: list[bool]) -> float:
+    success_count = results.count(True)
+    total_count = len(results)
+    if total_count == 0:
+        return 0
+    success_percentage = (success_count / total_count) * 100  # as a percentage
+    return round(success_percentage, 2)


+def get_highest_success_difficulty(data: dict) -> str:
+    highest_difficulty = None
+    highest_difficulty_level = -1
+
+    for test_name, test_data in data.items():
+        if test_data["metrics"]["success"]:
+            # Replace 'medium' with 'intermediate' for this example
+            difficulty_str = test_data["metrics"]["difficulty"]
+
+            try:
+                difficulty_enum = DifficultyLevel[difficulty_str.lower()]
+                difficulty_level = DIFFICULTY_MAP[difficulty_enum]
+
+                if difficulty_level > highest_difficulty_level:
+                    highest_difficulty = difficulty_enum
+                    highest_difficulty_level = difficulty_level
+            except KeyError:
+                print(
+                    f"Unexpected difficulty level '{difficulty_str}' in test '{test_name}'"
+                )
+
+    if highest_difficulty is not None:
+        highest_difficulty_str = highest_difficulty.name  # convert enum to string
+    else:
+        highest_difficulty_str = ""
+
+    return f"{highest_difficulty_str}: {highest_difficulty_level}"
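Note: a quick sketch of the two new metric helpers against the data shapes used above (the inputs are hypothetical):

from agbenchmark.utils import calculate_success_percentage, get_highest_success_difficulty

calculate_success_percentage([True, True, False])  # -> 66.67
calculate_success_percentage([])                   # -> 0 (no recorded runs yet)

data = {"TestWriteFile": {"metrics": {"success": True, "difficulty": "interface"}}}
get_highest_success_difficulty(data)  # -> "interface: 1"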
Submodule agent/SuperAGI updated: 9280512910...bd4b3def65
Submodule agent/gpt-engineer updated: 42400fd679...cde9be3e73
Submodule agent/mini-agi updated: 6a1d08880c...08764876d9
Submodule agent/smol-developer updated: a0e9f4f39e...c52b14b1d5