Refactoring for TDD (#222)

Silen Naihin
2023-07-31 21:59:47 +01:00
committed by GitHub
parent 1a7079f6c5
commit f9fea473f5
16 changed files with 97 additions and 279 deletions

View File

@@ -13,5 +13,6 @@
 black . --exclude test.py
 isort .
 mypy .
-autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring --in-place agbenchmark
+autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring agbenchmark"
+agbenchmark/start_benchmark.py
```

.gitmodules (vendored), 18 lines changed
View File

@@ -1,7 +1,7 @@
[submodule "agent/Auto-GPT"]
path = agent/Auto-GPT
url = https://github.com/Significant-Gravitas/Auto-GPT
branch = master
path = agent/Auto-GPT
url = https://github.com/Significant-Gravitas/Auto-GPT
branch = master
[submodule "agent/gpt-engineer"]
path = agent/gpt-engineer
url = https://github.com/merwanehamadi/gpt-engineer.git
@@ -23,10 +23,10 @@
url = https://github.com/SilenNaihin/babyagi.git
branch = benchmark-integration
[submodule "agent/beebot"]
path = agent/beebot
url = https://github.com/AutoPackAI/beebot.git
branch = main
path = agent/beebot
url = https://github.com/AutoPackAI/beebot.git
branch = main
[submodule "agbenchmark/challenges"]
path = agbenchmark/challenges
url = https://github.com/SilenNaihin/agbenchmark_challenges.git
branch = main
path = agbenchmark/challenges
url = https://github.com/SilenNaihin/agbenchmark_challenges.git
branch = main

View File

@@ -48,11 +48,13 @@ def run_agent(
     start_time = time.time()

     while True:
-        # This checks if there's data to be read from stdout without blocking.
-        if process.stdout and select.select([process.stdout], [], [], 0)[0]:
-            output = process.stdout.readline()
-            print(output.strip())
+        try:
+            # This checks if there's data to be read from stdout without blocking.
+            if process.stdout and select.select([process.stdout], [], [], 0)[0]:
+                output = process.stdout.readline()
+                print(output.strip())
+        except Exception as e:
+            print("Error reading stdout", e)

         # Check if process has ended, has no more output, or exceeded timeout
         if process.poll() is not None or (time.time() - start_time > timeout):
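
The only change here is wrapping the existing non-blocking read in a try/except so a closed pipe does not crash the runner. For context, a minimal self-contained sketch of the same select-based pattern; the command and timeout below are stand-ins for illustration, not values from this repo:

```
import select
import subprocess
import time

# Stand-in for the agent command launched by run_agent.
process = subprocess.Popen(
    ["python", "-u", "-c", "import time\nfor i in range(5):\n    print(i)\n    time.sleep(0.2)"],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
)
timeout = 10  # seconds; hypothetical value
start_time = time.time()

while True:
    try:
        # select() with a 0 timeout reports whether stdout has data without blocking.
        if process.stdout and select.select([process.stdout], [], [], 0)[0]:
            output = process.stdout.readline()
            print(output.strip())
    except Exception as e:
        # readline()/select() can fail if the pipe is closed mid-read.
        print("Error reading stdout", e)

    # Stop once the process has exited or the timeout has elapsed.
    if process.poll() is not None or (time.time() - start_time > timeout):
        process.kill()
        break
```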

View File

@@ -8,7 +8,7 @@ from typing import Any, Dict, Optional
 import pytest
-from agbenchmark.start_benchmark import CURRENT_DIRECTORY, get_regression_data
+from agbenchmark.start_benchmark import CHALLENGES_PATH, get_regression_data
 from agbenchmark.utils.challenge import Challenge
 from agbenchmark.utils.data_types import ChallengeData, SuiteConfig
 from agbenchmark.utils.utils import get_test_path
@@ -158,7 +158,7 @@ def create_challenge(
 def generate_tests() -> None:  # sourcery skip: invert-any-all
     print("Generating tests...")
-    json_files = deque(glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True))
+    json_files = deque(glob.glob(f"{CHALLENGES_PATH}/**/data.json", recursive=True))
     regression_tests = get_regression_data()
     # for suites to know if the file has already been used to generate the tests
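
Only the root of the glob changes: challenge definitions are now discovered under CHALLENGES_PATH rather than CURRENT_DIRECTORY. A minimal sketch of that discovery step, assuming CHALLENGES_PATH resolves to a directory of challenge folders (in the benchmark it comes from calculate_dynamic_paths, shown later in this commit):

```
import glob
from collections import deque
from pathlib import Path

# Assumption: hard-coded here purely for illustration; the benchmark resolves
# this via calculate_dynamic_paths().
CHALLENGES_PATH = str(Path.cwd() / "agbenchmark" / "challenges")

# Every challenge (or suite of challenges) is described by a data.json file.
json_files = deque(glob.glob(f"{CHALLENGES_PATH}/**/data.json", recursive=True))

while json_files:
    json_file = json_files.popleft()
    print(f"would generate a test from {json_file}")
```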

View File

@@ -4,7 +4,7 @@ import sys
 import time
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any, Dict

 from agbenchmark.reports.processing.graphs import save_single_radar_chart
 from agbenchmark.reports.processing.process_report import get_agent_category
@@ -42,18 +42,8 @@ class ReportManager:
         with open(self.filename, "w") as f:
             json.dump(self.tests, f, indent=4)

-    def add_test(
-        self,
-        test_name: str,
-        test_details: dict | list,
-        agent_name: Optional[str] = None,
-    ) -> None:
-        if agent_name:
-            if agent_name not in self.tests:
-                self.tests[agent_name] = {}
-            self.tests[agent_name][test_name] = test_details
-        else:
-            self.tests[test_name] = test_details
+    def add_test(self, test_name: str, test_details: dict | list) -> None:
+        self.tests[test_name] = test_details

         self.save()
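
add_test no longer branches on an agent name; each ReportManager instance is now scoped to a single JSON file, so results are keyed by test name only. A stripped-down sketch of the resulting shape (the real class also loads any existing file on construction and tracks timing; the class name and file name below are only examples):

```
import json

class FlatReportManager:
    """Minimal stand-in for the trimmed-down ReportManager."""

    def __init__(self, filename: str) -> None:
        self.filename = filename
        self.tests: dict = {}

    def save(self) -> None:
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)

    def add_test(self, test_name: str, test_details: dict | list) -> None:
        # Flat structure: no per-agent nesting anymore.
        self.tests[test_name] = test_details
        self.save()

# Example usage: one file per concern, keyed only by test name.
internal_info = FlatReportManager("success_rate.json")
internal_info.add_test("TestWriteFile", [True, False, True])
```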

View File

@@ -1,200 +0,0 @@
{
"BabyAGI": {
"TestWriteFile": [
false,
false
]
},
"gpt-engineer": {
"TestWriteFile": [
true,
false
]
},
"mini-agi": {
"TestBasicMemory": [
true,
true,
true,
true,
true,
false,
false,
true,
false
],
"TestBasicRetrieval": [
true,
true,
true,
true,
true,
true
],
"TestReadFile": [
true,
true,
true,
true,
true,
true
],
"TestSearch": [
true,
true,
true,
true,
true,
true
],
"TestWriteFile": [
true,
true,
true,
true,
true
],
"TestRetrieval2.2": [
false,
false,
false,
false
],
"TestRetrieval2.1": [
false,
false,
false,
false,
false,
false
],
"TestRetrieval2.0": [
true,
false
],
"TestRememberMultipleIds": [
false,
false,
true,
false
],
"TestRememberMultipleIdsWithNoise": [
false
],
"TestRememberMultipleWithNoise": [
false,
true,
false
],
"TestRememberMultiplePhrasesWithNoise": [
false,
false,
false,
false,
false,
false,
false
],
"TestDebugSimpleTypoWithGuidance": [
true,
true,
true,
true,
true,
true
],
"TestCodeBasic": [
false,
true,
false,
false
],
"TestRevenueRetrieval_1.0": [
true,
true,
true,
true,
true,
true
],
"TestRevenueRetrieval_1.1": [
false,
false,
false,
false
],
"TestRevenueRetrieval_1.2": [
false,
false,
false,
false
],
"TestReturnCode_Simple": [
false,
false
],
"TestReturnCode_Write": [
false,
false
],
"TestReturnCode_Modify": [
false,
false
],
"TestReturnCode_Tests": [
false,
false
],
"TestPlanCreation": [
true
],
"TestGoalDivergence": [
false
],
"TestBasicContentGen": [
true
],
"TestAdaptSimpleTypoWithGuidance": [
false
],
"TestDebugSimpleTypoWithoutGuidance": [
true
],
"TestCreateSimpleWebServer": [
true
],
"TestGoalLoss_Hard": [
false
],
"TestGoalLoss_advanced": [
false
],
"TestGoalLoss_Medium": [
false
],
"TestGoalLoss_Simple": [
false
],
"TestInstructionFollowing": [
false
],
"TestAdaptLink": [
true
],
"TestFunctionCodeGeneration": [
false
],
"TestDebugMultipleTypo": [
true
],
"TestThreeSum": [
false
],
"TestAdaptTeslaRevenue": [
false
],
"TestRetrieval3": [
false
]
}
}
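
The deleted internal_info.json above held pass/fail histories for every agent in one shared file; after this refactor the history lives in a per-setup success_rate.json (see SUCCESS_RATE_PATH below), and the reports module turns such boolean lists into a percentage via calculate_success_percentage from agbenchmark.utils.utils. That helper's implementation is not part of this diff, so the sketch below is only a plausible reading of what it computes:

```
# Hypothetical helper in the spirit of calculate_success_percentage; the real
# implementation is not shown in this commit.
def success_percentage(results: list[bool], window: int = 10) -> float:
    recent = results[-window:]  # only consider the most recent runs
    if not recent:
        return 0.0
    return round(100 * sum(recent) / len(recent), 2)

# Example with the TestWriteFile history recorded above for gpt-engineer.
print(success_percentage([True, False]))  # 50.0
```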

View File

@@ -8,11 +8,15 @@ import pytest
 from agbenchmark.agent_interface import MOCK_FLAG
 from agbenchmark.reports.ReportManager import ReportManager
-from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH
+from agbenchmark.start_benchmark import (
+    CONFIG_PATH,
+    REGRESSION_TESTS_PATH,
+    REPORTS_PATH,
+    SUCCESS_RATE_PATH,
+)
 from agbenchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig
 from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone
 from agbenchmark.utils.utils import (
-    AGENT_NAME,
     calculate_success_percentage,
     get_highest_success_difficulty,
     get_test_path,
@@ -25,10 +29,8 @@ regression_manager = ReportManager(REGRESSION_TESTS_PATH)
 # user facing reporting information
 info_manager = ReportManager(str(Path(REPORTS_PATH) / "report.json"))

-INTERNAL_LOGS_PATH = Path(__file__).resolve().parent
 # internal db step in replacement track pass/fail rate
-internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json"))
+internal_info = ReportManager(SUCCESS_RATE_PATH)
def generate_combined_suite_report(
@@ -112,19 +114,12 @@ def get_previous_test_results(
     agent_tests: dict[str, list[bool]] = {}
     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv

-    # if the structure is nested inside of the agent name
-    if AGENT_NAME:
-        agent_tests = internal_info.tests.get(AGENT_NAME, {})
-    if agent_tests:
-        prev_test_results = agent_tests.get(test_name, [])
-    else:
-        prev_test_results = internal_info.tests.get(test_name, [])
+    prev_test_results = internal_info.tests.get(test_name, [])

     if not mock:
         # only add if it's an actual test
         prev_test_results.append(info_details["metrics"]["success"])
-        internal_info.add_test(test_name, prev_test_results, AGENT_NAME)
+        internal_info.add_test(test_name, prev_test_results)

     # can calculate success rate regardless of mock
     info_details["metrics"]["success_%"] = calculate_success_percentage(
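
With the per-agent nesting gone from internal_info, the bookkeeping reduces to: look up the history by test name, append the new outcome on real (non --mock) runs, and persist it. A minimal sketch of that flow, assuming internal_info behaves like the flat manager sketched earlier; the function name is a stand-in, not the real helper:

```
# info_details is the metrics dict the benchmark builds for the current run.
def record_result(internal_info, test_name: str, info_details: dict, mock: bool) -> list:
    prev_test_results: list = internal_info.tests.get(test_name, [])

    if not mock:
        # only real runs extend the recorded history
        prev_test_results.append(info_details["metrics"]["success"])
        internal_info.add_test(test_name, prev_test_results)

    return prev_test_results
```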

View File

@@ -21,6 +21,8 @@ HeliconeLockManager.write_custom_property("benchmark_start_time", BENCHMARK_STAR
     CONFIG_PATH,
     REGRESSION_TESTS_PATH,
     REPORTS_PATH,
+    SUCCESS_RATE_PATH,
+    CHALLENGES_PATH,
 ) = calculate_dynamic_paths()
@@ -101,16 +103,8 @@ def start(
     for key, value in config.items():
         print(f"{key}: {value}")

-    if not os.path.exists(REGRESSION_TESTS_PATH):
-        with open(REGRESSION_TESTS_PATH, "w"):
-            pass

     os.environ["MOCK_TEST"] = "True" if mock else "False"

-    if not os.path.exists(Path(REPORTS_PATH) / "report.json"):
-        with open(Path(REPORTS_PATH) / "report.json", "w"):
-            pass

     pytest_args = ["-vs"]
     if test:
         print("Running specific test:", test)

View File

@@ -58,9 +58,6 @@ query ExampleQuery($properties: [PropertyFilter!]){
         )
         response.raise_for_status()  # Raises a HTTPError if the response was an unsuccessful status code

-        print(f"Response status code: {response.status_code}")
-        print(f"Response text: {response.text}")

         data = response.json()
     except requests.HTTPError as http_err:
         print(f"HTTP error occurred: {http_err}")
@@ -72,11 +69,7 @@ query ExampleQuery($properties: [PropertyFilter!]){
print(f"Other error occurred: {err}")
raise
print("this is the data!", data)
try:
return (
data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None)
)
except Exception as err:
print(f"Error occurred: {err}")
raise
if data is None or data.get("data") is None:
raise ValueError("Invalid response received from server: no data")
return data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None)
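
Instead of printing the raw response and wrapping the dictionary lookups in a broad try/except, the helper now fails fast when the GraphQL payload has no data envelope. The same guard in isolation, exercised with hypothetical payloads:

```
from typing import Any, Optional

def extract_cost(data: Optional[dict]) -> Any:
    # Fail loudly if the response is missing its "data" envelope rather than
    # letting a later lookup raise a confusing AttributeError.
    if data is None or data.get("data") is None:
        raise ValueError("Invalid response received from server: no data")
    return data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None)

print(extract_cost({"data": {"aggregatedHeliconeRequest": {"cost": 0.42}}}))  # 0.42
extract_cost({})  # raises ValueError
```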

View File

@@ -180,21 +180,39 @@ def get_highest_success_difficulty(
return "No successful tests"
-def assign_paths(folder_path: Path) -> tuple[str, str, str]:
+def assign_paths(folder_path: Path) -> tuple[str, str, str, str, str]:
     CONFIG_PATH = str(folder_path / "config.json")
     REGRESSION_TESTS_PATH = str(folder_path / "regression_tests.json")
     reports_location = folder_path / "reports"

+    # if the user has a locally defined challenges path that they've added tests to
+    CHALLENGES_PATH = str(folder_path / "challenges")
+    if not os.path.exists(CHALLENGES_PATH):
+        Path(__file__).parent / "challenges"

     if not os.path.exists(reports_location):
         os.makedirs(reports_location)

     # from the ci
     if REPORT_LOCATION:
         reports_location = Path.cwd() / REPORT_LOCATION
     REPORTS_PATH = calculate_info_test_path(reports_location)

-    return CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH
+    REGRESSION_TESTS_PATH = str(reports_location / "regression_tests.json")
+    SUCCESS_RATE_PATH = str(reports_location / "success_rate.json")
+
+    return (
+        CONFIG_PATH,
+        REGRESSION_TESTS_PATH,
+        REPORTS_PATH,
+        SUCCESS_RATE_PATH,
+        CHALLENGES_PATH,
+    )

-def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
+def calculate_dynamic_paths() -> tuple[Path, str, str, str, str, str]:
     # the default home is where you're running from
     HOME_DIRECTORY = Path(os.getcwd())
     benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark"
@@ -207,22 +225,47 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
         HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME
         benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark"

-        CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH = assign_paths(
-            benchmarks_folder_path
-        )
+        (
+            CONFIG_PATH,
+            REGRESSION_TESTS_PATH,
+            REPORTS_PATH,
+            SUCCESS_RATE_PATH,
+            CHALLENGES_PATH,
+        ) = assign_paths(benchmarks_folder_path)
     else:
         # otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo)
         # used when its just a pip install
-        CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH = assign_paths(
-            benchmarks_folder_path
-        )
+        (
+            CONFIG_PATH,
+            REGRESSION_TESTS_PATH,
+            REPORTS_PATH,
+            SUCCESS_RATE_PATH,
+            CHALLENGES_PATH,
+        ) = assign_paths(benchmarks_folder_path)

     if not benchmarks_folder_path.exists():
         benchmarks_folder_path.mkdir(exist_ok=True)

+    if not os.path.exists(benchmarks_folder_path / "reports"):
+        os.makedirs(benchmarks_folder_path / "reports")
+
+    if not os.path.exists(REGRESSION_TESTS_PATH):
+        with open(REGRESSION_TESTS_PATH, "w"):
+            pass
+
+    if not os.path.exists(SUCCESS_RATE_PATH):
+        with open(SUCCESS_RATE_PATH, "w"):
+            pass
+
+    if not os.path.exists(Path(REPORTS_PATH) / "report.json"):
+        with open(Path(REPORTS_PATH) / "report.json", "w"):
+            pass

     return (
         HOME_DIRECTORY,
         CONFIG_PATH,
         REGRESSION_TESTS_PATH,
         REPORTS_PATH,
+        SUCCESS_RATE_PATH,
+        CHALLENGES_PATH,
     )
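
calculate_dynamic_paths now returns a six-element tuple, and both path resolution and report-file creation happen here rather than in start(). A short usage sketch mirroring the unpacking shown in the start_benchmark hunk above, assuming the agbenchmark package is importable:

```
from agbenchmark.utils.utils import calculate_dynamic_paths

(
    HOME_DIRECTORY,
    CONFIG_PATH,
    REGRESSION_TESTS_PATH,
    REPORTS_PATH,
    SUCCESS_RATE_PATH,
    CHALLENGES_PATH,
) = calculate_dynamic_paths()

print(f"reports directory:        {REPORTS_PATH}")
print(f"per-test success history: {SUCCESS_RATE_PATH}")
print(f"challenge definitions:    {CHALLENGES_PATH}")
```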