Refactoring for TDD (#222)
.github/PULL_REQUEST_TEMPLATE.md (vendored, 3 lines changed)
````diff
@@ -13,5 +13,6 @@
 black . --exclude test.py
 isort .
 mypy .
-autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring --in-place agbenchmark
+autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring agbenchmark"
+agbenchmark/start_benchmark.py
 ```
````
.gitmodules (vendored, 18 lines changed; the removed and added lines below render identically, so the change appears to be whitespace-only)
```diff
@@ -1,7 +1,7 @@
 [submodule "agent/Auto-GPT"]
-	path = agent/Auto-GPT
-	url = https://github.com/Significant-Gravitas/Auto-GPT
-	branch = master
+	path = agent/Auto-GPT
+	url = https://github.com/Significant-Gravitas/Auto-GPT
+	branch = master
 [submodule "agent/gpt-engineer"]
 	path = agent/gpt-engineer
 	url = https://github.com/merwanehamadi/gpt-engineer.git
@@ -23,10 +23,10 @@
 	url = https://github.com/SilenNaihin/babyagi.git
 	branch = benchmark-integration
 [submodule "agent/beebot"]
-	path = agent/beebot
-	url = https://github.com/AutoPackAI/beebot.git
-	branch = main
+	path = agent/beebot
+	url = https://github.com/AutoPackAI/beebot.git
+	branch = main
 [submodule "agbenchmark/challenges"]
-	path = agbenchmark/challenges
-	url = https://github.com/SilenNaihin/agbenchmark_challenges.git
-	branch = main
+	path = agbenchmark/challenges
+	url = https://github.com/SilenNaihin/agbenchmark_challenges.git
+	branch = main
```
```diff
@@ -48,11 +48,13 @@ def run_agent(
     start_time = time.time()
 
     while True:
-        # This checks if there's data to be read from stdout without blocking.
-        if process.stdout and select.select([process.stdout], [], [], 0)[0]:
-            output = process.stdout.readline()
-            print(output.strip())
+        try:
+            # This checks if there's data to be read from stdout without blocking.
+            if process.stdout and select.select([process.stdout], [], [], 0)[0]:
+                output = process.stdout.readline()
+                print(output.strip())
+        except Exception as e:
+            print("Error reading stdout", e)
 
         # Check if process has ended, has no more output, or exceeded timeout
         if process.poll() is not None or (time.time() - start_time > timeout):
```
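For context, the polling pattern this hunk wraps in try/except can be exercised standalone. A minimal sketch, assuming a POSIX system (where `select()` accepts pipe file objects) and an arbitrary long-running command standing in for the real agent process:

```python
import select
import subprocess
import time

# Illustrative stand-ins for the benchmark's agent command and timeout.
process = subprocess.Popen(
    ["ping", "-c", "3", "127.0.0.1"], stdout=subprocess.PIPE, text=True
)
timeout = 10.0
start_time = time.time()

while True:
    try:
        # select() with a 0-second timeout returns immediately; a non-empty
        # ready list means readline() will not block.
        if process.stdout and select.select([process.stdout], [], [], 0)[0]:
            output = process.stdout.readline()
            if output:
                print(output.strip())
    except Exception as e:  # a pipe torn down mid-read should not kill the loop
        print("Error reading stdout", e)

    # Stop once the process has exited or the timeout has elapsed.
    if process.poll() is not None or (time.time() - start_time > timeout):
        break
```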
```diff
@@ -8,7 +8,7 @@ from typing import Any, Dict, Optional
 
 import pytest
 
-from agbenchmark.start_benchmark import CURRENT_DIRECTORY, get_regression_data
+from agbenchmark.start_benchmark import CHALLENGES_PATH, get_regression_data
 from agbenchmark.utils.challenge import Challenge
 from agbenchmark.utils.data_types import ChallengeData, SuiteConfig
 from agbenchmark.utils.utils import get_test_path
```
```diff
@@ -158,7 +158,7 @@ def create_challenge(
 def generate_tests() -> None:  # sourcery skip: invert-any-all
     print("Generating tests...")
 
-    json_files = deque(glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True))
+    json_files = deque(glob.glob(f"{CHALLENGES_PATH}/**/data.json", recursive=True))
     regression_tests = get_regression_data()
 
     # for suites to know if the file has already been used to generate the tests
```
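A runnable sketch of the discovery loop that `generate_tests` builds on: recursively glob every challenge's `data.json` under the challenges root and queue the files. The directory literal here is a placeholder, not the resolved `CHALLENGES_PATH`:

```python
import glob
from collections import deque

CHALLENGES_PATH = "agbenchmark/challenges"  # placeholder root for illustration

# deque gives cheap popleft() while suite files may be skipped or revisited.
json_files = deque(glob.glob(f"{CHALLENGES_PATH}/**/data.json", recursive=True))
while json_files:
    json_file = json_files.popleft()
    print("would create a challenge from", json_file)
```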
```diff
@@ -4,7 +4,7 @@ import sys
 import time
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any, Dict
 
 from agbenchmark.reports.processing.graphs import save_single_radar_chart
 from agbenchmark.reports.processing.process_report import get_agent_category
```
```diff
@@ -42,18 +42,8 @@ class ReportManager:
         with open(self.filename, "w") as f:
             json.dump(self.tests, f, indent=4)
 
-    def add_test(
-        self,
-        test_name: str,
-        test_details: dict | list,
-        agent_name: Optional[str] = None,
-    ) -> None:
-        if agent_name:
-            if agent_name not in self.tests:
-                self.tests[agent_name] = {}
-            self.tests[agent_name][test_name] = test_details
-        else:
-            self.tests[test_name] = test_details
+    def add_test(self, test_name: str, test_details: dict | list) -> None:
+        self.tests[test_name] = test_details
 
         self.save()
```
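The net effect of this hunk is that the report structure loses its optional per-agent nesting. A minimal self-contained sketch (a hypothetical stand-in class, not the real ReportManager) of the flattened behavior:

```python
import json


class FlatReportSketch:
    """Stores test results keyed directly by test name and persists on write."""

    def __init__(self, filename: str) -> None:
        self.filename = filename
        self.tests: dict = {}

    def add_test(self, test_name: str, test_details: dict | list) -> None:
        self.tests[test_name] = test_details
        self.save()

    def save(self) -> None:
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)


manager = FlatReportSketch("success_rate.json")
manager.add_test("TestWriteFile", [True, False, True])  # {"TestWriteFile": [...]}
```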
Deleted file (@@ -1,200 +0,0 @@), apparently the reports/internal_info.json pass/fail log superseded by success_rate.json:

```json
{
    "BabyAGI": {
        "TestWriteFile": [false, false]
    },
    "gpt-engineer": {
        "TestWriteFile": [true, false]
    },
    "mini-agi": {
        "TestBasicMemory": [true, true, true, true, true, false, false, true, false],
        "TestBasicRetrieval": [true, true, true, true, true, true],
        "TestReadFile": [true, true, true, true, true, true],
        "TestSearch": [true, true, true, true, true, true],
        "TestWriteFile": [true, true, true, true, true],
        "TestRetrieval2.2": [false, false, false, false],
        "TestRetrieval2.1": [false, false, false, false, false, false],
        "TestRetrieval2.0": [true, false],
        "TestRememberMultipleIds": [false, false, true, false],
        "TestRememberMultipleIdsWithNoise": [false],
        "TestRememberMultipleWithNoise": [false, true, false],
        "TestRememberMultiplePhrasesWithNoise": [false, false, false, false, false, false, false],
        "TestDebugSimpleTypoWithGuidance": [true, true, true, true, true, true],
        "TestCodeBasic": [false, true, false, false],
        "TestRevenueRetrieval_1.0": [true, true, true, true, true, true],
        "TestRevenueRetrieval_1.1": [false, false, false, false],
        "TestRevenueRetrieval_1.2": [false, false, false, false],
        "TestReturnCode_Simple": [false, false],
        "TestReturnCode_Write": [false, false],
        "TestReturnCode_Modify": [false, false],
        "TestReturnCode_Tests": [false, false],
        "TestPlanCreation": [true],
        "TestGoalDivergence": [false],
        "TestBasicContentGen": [true],
        "TestAdaptSimpleTypoWithGuidance": [false],
        "TestDebugSimpleTypoWithoutGuidance": [true],
        "TestCreateSimpleWebServer": [true],
        "TestGoalLoss_Hard": [false],
        "TestGoalLoss_advanced": [false],
        "TestGoalLoss_Medium": [false],
        "TestGoalLoss_Simple": [false],
        "TestInstructionFollowing": [false],
        "TestAdaptLink": [true],
        "TestFunctionCodeGeneration": [false],
        "TestDebugMultipleTypo": [true],
        "TestThreeSum": [false],
        "TestAdaptTeslaRevenue": [false],
        "TestRetrieval3": [false]
    }
}
```
```diff
@@ -8,11 +8,15 @@ import pytest
 
 from agbenchmark.agent_interface import MOCK_FLAG
 from agbenchmark.reports.ReportManager import ReportManager
-from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH
+from agbenchmark.start_benchmark import (
+    CONFIG_PATH,
+    REGRESSION_TESTS_PATH,
+    REPORTS_PATH,
+    SUCCESS_RATE_PATH,
+)
 from agbenchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig
 from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone
 from agbenchmark.utils.utils import (
     AGENT_NAME,
     calculate_success_percentage,
     get_highest_success_difficulty,
     get_test_path,
```
```diff
@@ -25,10 +29,8 @@ regression_manager = ReportManager(REGRESSION_TESTS_PATH)
 # user facing reporting information
 info_manager = ReportManager(str(Path(REPORTS_PATH) / "report.json"))
 
-INTERNAL_LOGS_PATH = Path(__file__).resolve().parent
-
 # internal db step in replacement track pass/fail rate
-internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json"))
+internal_info = ReportManager(SUCCESS_RATE_PATH)
 
 
 def generate_combined_suite_report(
```
```diff
@@ -112,19 +114,12 @@ def get_previous_test_results(
     agent_tests: dict[str, list[bool]] = {}
     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv
 
-    # if the structure is nested inside of the agent name
-    if AGENT_NAME:
-        agent_tests = internal_info.tests.get(AGENT_NAME, {})
-
-    if agent_tests:
-        prev_test_results = agent_tests.get(test_name, [])
-    else:
-        prev_test_results = internal_info.tests.get(test_name, [])
+    prev_test_results = internal_info.tests.get(test_name, [])
 
     if not mock:
         # only add if it's an actual test
         prev_test_results.append(info_details["metrics"]["success"])
-        internal_info.add_test(test_name, prev_test_results, AGENT_NAME)
+        internal_info.add_test(test_name, prev_test_results)
 
     # can calculate success rate regardless of mock
     info_details["metrics"]["success_%"] = calculate_success_percentage(
```
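`calculate_success_percentage` is imported from `agbenchmark.utils.utils` and not shown in this diff; a hedged sketch assuming the obvious definition (share of True entries, as a percentage) illustrates the bookkeeping around this hunk:

```python
def calculate_success_percentage(results: list[bool]) -> float:
    # Assumed definition for illustration; the real helper lives in
    # agbenchmark.utils.utils and may differ (rounding, windowing, etc.).
    if not results:
        return 0.0
    return round(sum(results) / len(results) * 100, 2)


prev_test_results = [True, True, False]  # e.g. loaded from success_rate.json
prev_test_results.append(True)           # outcome of the current, non-mock run
print(calculate_success_percentage(prev_test_results))  # 75.0
```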
```diff
@@ -21,6 +21,8 @@ HeliconeLockManager.write_custom_property("benchmark_start_time", BENCHMARK_STAR
     CONFIG_PATH,
     REGRESSION_TESTS_PATH,
     REPORTS_PATH,
+    SUCCESS_RATE_PATH,
+    CHALLENGES_PATH,
 ) = calculate_dynamic_paths()
```
```diff
@@ -101,16 +103,8 @@ def start(
     for key, value in config.items():
         print(f"{key}: {value}")
 
-    if not os.path.exists(REGRESSION_TESTS_PATH):
-        with open(REGRESSION_TESTS_PATH, "w"):
-            pass
-
     os.environ["MOCK_TEST"] = "True" if mock else "False"
 
-    if not os.path.exists(Path(REPORTS_PATH) / "report.json"):
-        with open(Path(REPORTS_PATH) / "report.json", "w"):
-            pass
-
     pytest_args = ["-vs"]
     if test:
         print("Running specific test:", test)
```
```diff
@@ -58,9 +58,6 @@ query ExampleQuery($properties: [PropertyFilter!]){
         )
         response.raise_for_status()  # Raises a HTTPError if the response was an unsuccessful status code
 
-        print(f"Response status code: {response.status_code}")
-        print(f"Response text: {response.text}")
-
         data = response.json()
     except requests.HTTPError as http_err:
         print(f"HTTP error occurred: {http_err}")
```
```diff
@@ -72,11 +69,7 @@ query ExampleQuery($properties: [PropertyFilter!]){
         print(f"Other error occurred: {err}")
         raise
 
-    print("this is the data!", data)
-    try:
-        return (
-            data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None)
-        )
-    except Exception as err:
-        print(f"Error occurred: {err}")
-        raise
+    if data is None or data.get("data") is None:
+        raise ValueError("Invalid response received from server: no data")
+
+    return data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None)
```
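The replacement swaps a broad try/except around the return for a single guard clause. A self-contained sketch of the same pattern, with a placeholder endpoint and query (not the real Helicone request):

```python
import requests


def get_cost_sketch() -> float | None:
    # Placeholder endpoint/query; only the validation pattern matters here.
    response = requests.post(
        "https://example.com/graphql", json={"query": "{ cost }"}, timeout=10
    )
    response.raise_for_status()
    data = response.json()

    # Fail loudly on a malformed payload instead of swallowing the error.
    if data is None or data.get("data") is None:
        raise ValueError("Invalid response received from server: no data")

    return data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None)
```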
```diff
@@ -180,21 +180,39 @@ def get_highest_success_difficulty(
     return "No successful tests"
 
 
-def assign_paths(folder_path: Path) -> tuple[str, str, str]:
+def assign_paths(folder_path: Path) -> tuple[str, str, str, str, str]:
     CONFIG_PATH = str(folder_path / "config.json")
-    REGRESSION_TESTS_PATH = str(folder_path / "regression_tests.json")
 
     reports_location = folder_path / "reports"
 
+    # if the user has a locally defined challenges path that they've added tests to
+    CHALLENGES_PATH = str(folder_path / "challenges")
+    if not os.path.exists(CHALLENGES_PATH):
+        Path(__file__).parent / "challenges"
+
     if not os.path.exists(reports_location):
         os.makedirs(reports_location)
 
     # from the ci
     if REPORT_LOCATION:
         reports_location = Path.cwd() / REPORT_LOCATION
 
     REPORTS_PATH = calculate_info_test_path(reports_location)
 
-    return CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH
+    REGRESSION_TESTS_PATH = str(reports_location / "regression_tests.json")
+
+    SUCCESS_RATE_PATH = str(reports_location / "success_rate.json")
+
+    return (
+        CONFIG_PATH,
+        REGRESSION_TESTS_PATH,
+        REPORTS_PATH,
+        SUCCESS_RATE_PATH,
+        CHALLENGES_PATH,
+    )
 
 
-def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
+def calculate_dynamic_paths() -> tuple[Path, str, str, str, str, str]:
     # the default home is where you're running from
     HOME_DIRECTORY = Path(os.getcwd())
     benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark"
```
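One line in the new `assign_paths` looks like a leftover: `Path(__file__).parent / "challenges"` builds a path but discards the result, so `CHALLENGES_PATH` still points at the missing local folder. If a fallback to the packaged challenges was intended, a hypothetical fix (not part of this commit) would assign the result:

```python
import os
from pathlib import Path


def resolve_challenges_path(folder_path: Path) -> str:
    # Prefer a locally defined challenges folder the user has added tests to.
    challenges_path = str(folder_path / "challenges")
    if not os.path.exists(challenges_path):
        # Hypothetical fallback: the challenges shipped next to this module.
        challenges_path = str(Path(__file__).parent / "challenges")
    return challenges_path
```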
```diff
@@ -207,22 +225,47 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
         HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME
         benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark"
 
-        CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH = assign_paths(
-            benchmarks_folder_path
-        )
+        (
+            CONFIG_PATH,
+            REGRESSION_TESTS_PATH,
+            REPORTS_PATH,
+            SUCCESS_RATE_PATH,
+            CHALLENGES_PATH,
+        ) = assign_paths(benchmarks_folder_path)
     else:
         # otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo)
         # used when its just a pip install
-        CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH = assign_paths(
-            benchmarks_folder_path
-        )
+        (
+            CONFIG_PATH,
+            REGRESSION_TESTS_PATH,
+            REPORTS_PATH,
+            SUCCESS_RATE_PATH,
+            CHALLENGES_PATH,
+        ) = assign_paths(benchmarks_folder_path)
 
     if not benchmarks_folder_path.exists():
         benchmarks_folder_path.mkdir(exist_ok=True)
 
+    if not os.path.exists(benchmarks_folder_path / "reports"):
+        os.makedirs(benchmarks_folder_path / "reports")
+
+    if not os.path.exists(REGRESSION_TESTS_PATH):
+        with open(REGRESSION_TESTS_PATH, "w"):
+            pass
+
+    if not os.path.exists(SUCCESS_RATE_PATH):
+        with open(SUCCESS_RATE_PATH, "w"):
+            pass
+
+    if not os.path.exists(Path(REPORTS_PATH) / "report.json"):
+        with open(Path(REPORTS_PATH) / "report.json", "w"):
+            pass
+
     return (
         HOME_DIRECTORY,
         CONFIG_PATH,
         REGRESSION_TESTS_PATH,
         REPORTS_PATH,
+        SUCCESS_RATE_PATH,
+        CHALLENGES_PATH,
     )
```
Submodule agent/Auto-GPT updated: b7f1df3e1d...410a1496ba
Submodule agent/BabyAGI updated: abeae86c8a...16f1b9519f
Submodule agent/SuperAGI updated: ae3b89a325...646f33a761
Submodule agent/gpt-engineer updated: 9bb81041ac...47bc50b71c
Submodule agent/mini-agi updated: 3e83765fa5...2fc70aa003
Submodule agent/smol-developer updated: a23d01369c...2bdb7f24a8