From f9fea473f5f6f392a04e4496b0b528353040c6dd Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 31 Jul 2023 21:59:47 +0100 Subject: [PATCH] Refactoring for TDD (#222) --- .github/PULL_REQUEST_TEMPLATE.md | 3 +- .gitmodules | 18 +- agbenchmark/agent_interface.py | 12 +- agbenchmark/generate_test.py | 4 +- agbenchmark/reports/ReportManager.py | 16 +- agbenchmark/reports/internal_info.json | 200 -------------------- agbenchmark/reports/reports.py | 23 +-- agbenchmark/start_benchmark.py | 10 +- agbenchmark/utils/get_data_from_helicone.py | 15 +- agbenchmark/utils/utils.py | 63 +++++- agent/Auto-GPT | 2 +- agent/BabyAGI | 2 +- agent/SuperAGI | 2 +- agent/gpt-engineer | 2 +- agent/mini-agi | 2 +- agent/smol-developer | 2 +- 16 files changed, 97 insertions(+), 279 deletions(-) delete mode 100644 agbenchmark/reports/internal_info.json diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 5144742f..22e4f1b5 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -13,5 +13,6 @@ black . --exclude test.py isort . mypy . - autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring --in-place agbenchmark + autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring agbenchmark" +agbenchmark/start_benchmark.py ``` diff --git a/.gitmodules b/.gitmodules index 7d817ec3..cb83ef33 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,7 @@ [submodule "agent/Auto-GPT"] - path = agent/Auto-GPT - url = https://github.com/Significant-Gravitas/Auto-GPT - branch = master + path = agent/Auto-GPT + url = https://github.com/Significant-Gravitas/Auto-GPT + branch = master [submodule "agent/gpt-engineer"] path = agent/gpt-engineer url = https://github.com/merwanehamadi/gpt-engineer.git @@ -23,10 +23,10 @@ url = https://github.com/SilenNaihin/babyagi.git branch = benchmark-integration [submodule "agent/beebot"] - path = agent/beebot - url = https://github.com/AutoPackAI/beebot.git - branch = main + path = agent/beebot + url = https://github.com/AutoPackAI/beebot.git + branch = main [submodule "agbenchmark/challenges"] - path = agbenchmark/challenges - url = https://github.com/SilenNaihin/agbenchmark_challenges.git - branch = main + path = agbenchmark/challenges + url = https://github.com/SilenNaihin/agbenchmark_challenges.git + branch = main diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 4087151a..adcd2944 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -48,11 +48,13 @@ def run_agent( start_time = time.time() while True: - - # This checks if there's data to be read from stdout without blocking. - if process.stdout and select.select([process.stdout], [], [], 0)[0]: - output = process.stdout.readline() - print(output.strip()) + try: + # This checks if there's data to be read from stdout without blocking. + if process.stdout and select.select([process.stdout], [], [], 0)[0]: + output = process.stdout.readline() + print(output.strip()) + except Exception as e: + print("Error reading stdout", e) # Check if process has ended, has no more output, or exceeded timeout if process.poll() is not None or (time.time() - start_time > timeout): diff --git a/agbenchmark/generate_test.py b/agbenchmark/generate_test.py index 92875183..180893f7 100644 --- a/agbenchmark/generate_test.py +++ b/agbenchmark/generate_test.py @@ -8,7 +8,7 @@ from typing import Any, Dict, Optional import pytest -from agbenchmark.start_benchmark import CURRENT_DIRECTORY, get_regression_data +from agbenchmark.start_benchmark import CHALLENGES_PATH, get_regression_data from agbenchmark.utils.challenge import Challenge from agbenchmark.utils.data_types import ChallengeData, SuiteConfig from agbenchmark.utils.utils import get_test_path @@ -158,7 +158,7 @@ def create_challenge( def generate_tests() -> None: # sourcery skip: invert-any-all print("Generating tests...") - json_files = deque(glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)) + json_files = deque(glob.glob(f"{CHALLENGES_PATH}/**/data.json", recursive=True)) regression_tests = get_regression_data() # for suites to know if the file has already been used to generate the tests diff --git a/agbenchmark/reports/ReportManager.py b/agbenchmark/reports/ReportManager.py index def8946a..d0669dff 100644 --- a/agbenchmark/reports/ReportManager.py +++ b/agbenchmark/reports/ReportManager.py @@ -4,7 +4,7 @@ import sys import time from datetime import datetime from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict from agbenchmark.reports.processing.graphs import save_single_radar_chart from agbenchmark.reports.processing.process_report import get_agent_category @@ -42,18 +42,8 @@ class ReportManager: with open(self.filename, "w") as f: json.dump(self.tests, f, indent=4) - def add_test( - self, - test_name: str, - test_details: dict | list, - agent_name: Optional[str] = None, - ) -> None: - if agent_name: - if agent_name not in self.tests: - self.tests[agent_name] = {} - self.tests[agent_name][test_name] = test_details - else: - self.tests[test_name] = test_details + def add_test(self, test_name: str, test_details: dict | list) -> None: + self.tests[test_name] = test_details self.save() diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json deleted file mode 100644 index a3d50b49..00000000 --- a/agbenchmark/reports/internal_info.json +++ /dev/null @@ -1,200 +0,0 @@ -{ - "BabyAGI": { - "TestWriteFile": [ - false, - false - ] - }, - "gpt-engineer": { - "TestWriteFile": [ - true, - false - ] - }, - "mini-agi": { - "TestBasicMemory": [ - true, - true, - true, - true, - true, - false, - false, - true, - false - ], - "TestBasicRetrieval": [ - true, - true, - true, - true, - true, - true - ], - "TestReadFile": [ - true, - true, - true, - true, - true, - true - ], - "TestSearch": [ - true, - true, - true, - true, - true, - true - ], - "TestWriteFile": [ - true, - true, - true, - true, - true - ], - "TestRetrieval2.2": [ - false, - false, - false, - false - ], - "TestRetrieval2.1": [ - false, - false, - false, - false, - false, - false - ], - "TestRetrieval2.0": [ - true, - false - ], - "TestRememberMultipleIds": [ - false, - false, - true, - false - ], - "TestRememberMultipleIdsWithNoise": [ - false - ], - "TestRememberMultipleWithNoise": [ - false, - true, - false - ], - "TestRememberMultiplePhrasesWithNoise": [ - false, - false, - false, - false, - false, - false, - false - ], - "TestDebugSimpleTypoWithGuidance": [ - true, - true, - true, - true, - true, - true - ], - "TestCodeBasic": [ - false, - true, - false, - false - ], - "TestRevenueRetrieval_1.0": [ - true, - true, - true, - true, - true, - true - ], - "TestRevenueRetrieval_1.1": [ - false, - false, - false, - false - ], - "TestRevenueRetrieval_1.2": [ - false, - false, - false, - false - ], - "TestReturnCode_Simple": [ - false, - false - ], - "TestReturnCode_Write": [ - false, - false - ], - "TestReturnCode_Modify": [ - false, - false - ], - "TestReturnCode_Tests": [ - false, - false - ], - "TestPlanCreation": [ - true - ], - "TestGoalDivergence": [ - false - ], - "TestBasicContentGen": [ - true - ], - "TestAdaptSimpleTypoWithGuidance": [ - false - ], - "TestDebugSimpleTypoWithoutGuidance": [ - true - ], - "TestCreateSimpleWebServer": [ - true - ], - "TestGoalLoss_Hard": [ - false - ], - "TestGoalLoss_advanced": [ - false - ], - "TestGoalLoss_Medium": [ - false - ], - "TestGoalLoss_Simple": [ - false - ], - "TestInstructionFollowing": [ - false - ], - "TestAdaptLink": [ - true - ], - "TestFunctionCodeGeneration": [ - false - ], - "TestDebugMultipleTypo": [ - true - ], - "TestThreeSum": [ - false - ], - "TestAdaptTeslaRevenue": [ - false - ], - "TestRetrieval3": [ - false - ] - } -} \ No newline at end of file diff --git a/agbenchmark/reports/reports.py b/agbenchmark/reports/reports.py index 0cce471d..53af7de8 100644 --- a/agbenchmark/reports/reports.py +++ b/agbenchmark/reports/reports.py @@ -8,11 +8,15 @@ import pytest from agbenchmark.agent_interface import MOCK_FLAG from agbenchmark.reports.ReportManager import ReportManager -from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH +from agbenchmark.start_benchmark import ( + CONFIG_PATH, + REGRESSION_TESTS_PATH, + REPORTS_PATH, + SUCCESS_RATE_PATH, +) from agbenchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone from agbenchmark.utils.utils import ( - AGENT_NAME, calculate_success_percentage, get_highest_success_difficulty, get_test_path, @@ -25,10 +29,8 @@ regression_manager = ReportManager(REGRESSION_TESTS_PATH) # user facing reporting information info_manager = ReportManager(str(Path(REPORTS_PATH) / "report.json")) -INTERNAL_LOGS_PATH = Path(__file__).resolve().parent - # internal db step in replacement track pass/fail rate -internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json")) +internal_info = ReportManager(SUCCESS_RATE_PATH) def generate_combined_suite_report( @@ -112,19 +114,12 @@ def get_previous_test_results( agent_tests: dict[str, list[bool]] = {} mock = "--mock" in sys.argv # Check if --mock is in sys.argv - # if the structure is nested inside of the agent name - if AGENT_NAME: - agent_tests = internal_info.tests.get(AGENT_NAME, {}) - - if agent_tests: - prev_test_results = agent_tests.get(test_name, []) - else: - prev_test_results = internal_info.tests.get(test_name, []) + prev_test_results = internal_info.tests.get(test_name, []) if not mock: # only add if it's an actual test prev_test_results.append(info_details["metrics"]["success"]) - internal_info.add_test(test_name, prev_test_results, AGENT_NAME) + internal_info.add_test(test_name, prev_test_results) # can calculate success rate regardless of mock info_details["metrics"]["success_%"] = calculate_success_percentage( diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 6d77d125..26856d5e 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -21,6 +21,8 @@ HeliconeLockManager.write_custom_property("benchmark_start_time", BENCHMARK_STAR CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH, + SUCCESS_RATE_PATH, + CHALLENGES_PATH, ) = calculate_dynamic_paths() @@ -101,16 +103,8 @@ def start( for key, value in config.items(): print(f"{key}: {value}") - if not os.path.exists(REGRESSION_TESTS_PATH): - with open(REGRESSION_TESTS_PATH, "w"): - pass - os.environ["MOCK_TEST"] = "True" if mock else "False" - if not os.path.exists(Path(REPORTS_PATH) / "report.json"): - with open(Path(REPORTS_PATH) / "report.json", "w"): - pass - pytest_args = ["-vs"] if test: print("Running specific test:", test) diff --git a/agbenchmark/utils/get_data_from_helicone.py b/agbenchmark/utils/get_data_from_helicone.py index b7ac78a0..bae27a6d 100644 --- a/agbenchmark/utils/get_data_from_helicone.py +++ b/agbenchmark/utils/get_data_from_helicone.py @@ -58,9 +58,6 @@ query ExampleQuery($properties: [PropertyFilter!]){ ) response.raise_for_status() # Raises a HTTPError if the response was an unsuccessful status code - print(f"Response status code: {response.status_code}") - print(f"Response text: {response.text}") - data = response.json() except requests.HTTPError as http_err: print(f"HTTP error occurred: {http_err}") @@ -72,11 +69,7 @@ query ExampleQuery($properties: [PropertyFilter!]){ print(f"Other error occurred: {err}") raise - print("this is the data!", data) - try: - return ( - data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None) - ) - except Exception as err: - print(f"Error occurred: {err}") - raise + if data is None or data.get("data") is None: + raise ValueError("Invalid response received from server: no data") + + return data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None) diff --git a/agbenchmark/utils/utils.py b/agbenchmark/utils/utils.py index a1e3bbe4..88fdc889 100644 --- a/agbenchmark/utils/utils.py +++ b/agbenchmark/utils/utils.py @@ -180,21 +180,39 @@ def get_highest_success_difficulty( return "No successful tests" -def assign_paths(folder_path: Path) -> tuple[str, str, str]: +def assign_paths(folder_path: Path) -> tuple[str, str, str, str, str]: CONFIG_PATH = str(folder_path / "config.json") - REGRESSION_TESTS_PATH = str(folder_path / "regression_tests.json") reports_location = folder_path / "reports" + + # if the user has a locally defined challenges path that they've added tests to + CHALLENGES_PATH = str(folder_path / "challenges") + if not os.path.exists(CHALLENGES_PATH): + Path(__file__).parent / "challenges" + + if not os.path.exists(reports_location): + os.makedirs(reports_location) + # from the ci if REPORT_LOCATION: reports_location = Path.cwd() / REPORT_LOCATION REPORTS_PATH = calculate_info_test_path(reports_location) - return CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH + REGRESSION_TESTS_PATH = str(reports_location / "regression_tests.json") + + SUCCESS_RATE_PATH = str(reports_location / "success_rate.json") + + return ( + CONFIG_PATH, + REGRESSION_TESTS_PATH, + REPORTS_PATH, + SUCCESS_RATE_PATH, + CHALLENGES_PATH, + ) -def calculate_dynamic_paths() -> tuple[Path, str, str, str]: +def calculate_dynamic_paths() -> tuple[Path, str, str, str, str, str]: # the default home is where you're running from HOME_DIRECTORY = Path(os.getcwd()) benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark" @@ -207,22 +225,47 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]: HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark" - CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH = assign_paths( - benchmarks_folder_path - ) + ( + CONFIG_PATH, + REGRESSION_TESTS_PATH, + REPORTS_PATH, + SUCCESS_RATE_PATH, + CHALLENGES_PATH, + ) = assign_paths(benchmarks_folder_path) else: # otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo) # used when its just a pip install - CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH = assign_paths( - benchmarks_folder_path - ) + ( + CONFIG_PATH, + REGRESSION_TESTS_PATH, + REPORTS_PATH, + SUCCESS_RATE_PATH, + CHALLENGES_PATH, + ) = assign_paths(benchmarks_folder_path) if not benchmarks_folder_path.exists(): benchmarks_folder_path.mkdir(exist_ok=True) + if not os.path.exists(benchmarks_folder_path / "reports"): + os.makedirs(benchmarks_folder_path / "reports") + + if not os.path.exists(REGRESSION_TESTS_PATH): + with open(REGRESSION_TESTS_PATH, "w"): + pass + + if not os.path.exists(SUCCESS_RATE_PATH): + with open(SUCCESS_RATE_PATH, "w"): + pass + + if not os.path.exists(Path(REPORTS_PATH) / "report.json"): + with open(Path(REPORTS_PATH) / "report.json", "w"): + pass + return ( HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH, + SUCCESS_RATE_PATH, + CHALLENGES_PATH, ) diff --git a/agent/Auto-GPT b/agent/Auto-GPT index b7f1df3e..410a1496 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit b7f1df3e1d397edb4f3a7168a929dc762280f597 +Subproject commit 410a1496bae94a2dddd2d4eac1308e34b03d9f39 diff --git a/agent/BabyAGI b/agent/BabyAGI index abeae86c..16f1b951 160000 --- a/agent/BabyAGI +++ b/agent/BabyAGI @@ -1 +1 @@ -Subproject commit abeae86c8a0d9ae802a9bf4243a4c950a319e8f3 +Subproject commit 16f1b9519fea5543695203be0262a1b41c77cbba diff --git a/agent/SuperAGI b/agent/SuperAGI index ae3b89a3..646f33a7 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit ae3b89a325994c9dda74b5de39d6f7c48010270f +Subproject commit 646f33a761d8332821aeb4a5dc167b619d907c50 diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 9bb81041..47bc50b7 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 9bb81041ace9f09e8ea0e34e29f2e46bb9d46a36 +Subproject commit 47bc50b71c0465349a6489e0170792c0018472f3 diff --git a/agent/mini-agi b/agent/mini-agi index 3e83765f..2fc70aa0 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 3e83765fa54d240c80d0f9578083d5b11fe34ce4 +Subproject commit 2fc70aa0032eec986dfb1020854a1b3b8aaf6780 diff --git a/agent/smol-developer b/agent/smol-developer index a23d0136..2bdb7f24 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit a23d01369cea976e80b7889fdbf1096619471301 +Subproject commit 2bdb7f24a8d28c2e8eac402cfd4fbda7dbc1ba8f