Refactoring for TDD (#222)
.github/PULL_REQUEST_TEMPLATE.md (vendored, 3 lines changed)
````diff
@@ -13,5 +13,6 @@
 black . --exclude test.py
 isort .
 mypy .
-autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring --in-place agbenchmark
+autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring agbenchmark"
+agbenchmark/start_benchmark.py
 ```
````
.gitmodules (vendored, 18 lines changed; the removed and added lines below render identically, so the change appears to be whitespace-only)
```diff
@@ -1,7 +1,7 @@
 [submodule "agent/Auto-GPT"]
-	path = agent/Auto-GPT
-	url = https://github.com/Significant-Gravitas/Auto-GPT
-	branch = master
+	path = agent/Auto-GPT
+	url = https://github.com/Significant-Gravitas/Auto-GPT
+	branch = master
 [submodule "agent/gpt-engineer"]
 	path = agent/gpt-engineer
 	url = https://github.com/merwanehamadi/gpt-engineer.git
@@ -23,10 +23,10 @@
 	url = https://github.com/SilenNaihin/babyagi.git
 	branch = benchmark-integration
 [submodule "agent/beebot"]
-	path = agent/beebot
-	url = https://github.com/AutoPackAI/beebot.git
-	branch = main
+	path = agent/beebot
+	url = https://github.com/AutoPackAI/beebot.git
+	branch = main
 [submodule "agbenchmark/challenges"]
-	path = agbenchmark/challenges
-	url = https://github.com/SilenNaihin/agbenchmark_challenges.git
-	branch = main
+	path = agbenchmark/challenges
+	url = https://github.com/SilenNaihin/agbenchmark_challenges.git
+	branch = main
```
```diff
@@ -48,11 +48,13 @@ def run_agent(
     start_time = time.time()
 
     while True:
-        # This checks if there's data to be read from stdout without blocking.
-        if process.stdout and select.select([process.stdout], [], [], 0)[0]:
-            output = process.stdout.readline()
-            print(output.strip())
+        try:
+            # This checks if there's data to be read from stdout without blocking.
+            if process.stdout and select.select([process.stdout], [], [], 0)[0]:
+                output = process.stdout.readline()
+                print(output.strip())
+        except Exception as e:
+            print("Error reading stdout", e)
 
         # Check if process has ended, has no more output, or exceeded timeout
         if process.poll() is not None or (time.time() - start_time > timeout):
```
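For context, the polling pattern this hunk wraps in try/except can be exercised standalone. A minimal sketch, assuming a POSIX system (where `select()` accepts pipe file objects) and an arbitrary long-running command standing in for the real agent process:

```python
import select
import subprocess
import time

# Illustrative stand-ins for the benchmark's agent command and timeout.
process = subprocess.Popen(
    ["ping", "-c", "3", "127.0.0.1"], stdout=subprocess.PIPE, text=True
)
timeout = 10.0
start_time = time.time()

while True:
    try:
        # select() with a 0-second timeout returns immediately; a non-empty
        # ready list means readline() will not block.
        if process.stdout and select.select([process.stdout], [], [], 0)[0]:
            output = process.stdout.readline()
            if output:
                print(output.strip())
    except Exception as e:  # a pipe torn down mid-read should not kill the loop
        print("Error reading stdout", e)

    # Stop once the process has exited or the timeout has elapsed.
    if process.poll() is not None or (time.time() - start_time > timeout):
        break
```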
```diff
@@ -8,7 +8,7 @@ from typing import Any, Dict, Optional
 
 import pytest
 
-from agbenchmark.start_benchmark import CURRENT_DIRECTORY, get_regression_data
+from agbenchmark.start_benchmark import CHALLENGES_PATH, get_regression_data
 from agbenchmark.utils.challenge import Challenge
 from agbenchmark.utils.data_types import ChallengeData, SuiteConfig
 from agbenchmark.utils.utils import get_test_path
```
```diff
@@ -158,7 +158,7 @@ def create_challenge(
 def generate_tests() -> None:  # sourcery skip: invert-any-all
     print("Generating tests...")
 
-    json_files = deque(glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True))
+    json_files = deque(glob.glob(f"{CHALLENGES_PATH}/**/data.json", recursive=True))
     regression_tests = get_regression_data()
 
     # for suites to know if the file has already been used to generate the tests
```
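A runnable sketch of the discovery loop that `generate_tests` builds on: recursively glob every challenge's `data.json` under the challenges root and queue the files. The directory literal here is a placeholder, not the resolved `CHALLENGES_PATH`:

```python
import glob
from collections import deque

CHALLENGES_PATH = "agbenchmark/challenges"  # placeholder root for illustration

# deque gives cheap popleft() while suite files may be skipped or revisited.
json_files = deque(glob.glob(f"{CHALLENGES_PATH}/**/data.json", recursive=True))
while json_files:
    json_file = json_files.popleft()
    print("would create a challenge from", json_file)
```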
```diff
@@ -4,7 +4,7 @@ import sys
 import time
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any, Dict
 
 from agbenchmark.reports.processing.graphs import save_single_radar_chart
 from agbenchmark.reports.processing.process_report import get_agent_category
```
```diff
@@ -42,18 +42,8 @@ class ReportManager:
         with open(self.filename, "w") as f:
             json.dump(self.tests, f, indent=4)
 
-    def add_test(
-        self,
-        test_name: str,
-        test_details: dict | list,
-        agent_name: Optional[str] = None,
-    ) -> None:
-        if agent_name:
-            if agent_name not in self.tests:
-                self.tests[agent_name] = {}
-            self.tests[agent_name][test_name] = test_details
-        else:
-            self.tests[test_name] = test_details
+    def add_test(self, test_name: str, test_details: dict | list) -> None:
+        self.tests[test_name] = test_details
 
         self.save()
```
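The net effect of this hunk is that the report structure loses its optional per-agent nesting. A minimal self-contained sketch (a hypothetical stand-in class, not the real ReportManager) of the flattened behavior:

```python
import json


class FlatReportSketch:
    """Stores test results keyed directly by test name and persists on write."""

    def __init__(self, filename: str) -> None:
        self.filename = filename
        self.tests: dict = {}

    def add_test(self, test_name: str, test_details: dict | list) -> None:
        self.tests[test_name] = test_details
        self.save()

    def save(self) -> None:
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)


manager = FlatReportSketch("success_rate.json")
manager.add_test("TestWriteFile", [True, False, True])  # {"TestWriteFile": [...]}
```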
Deleted file (@@ -1,200 +0,0 @@), apparently the reports/internal_info.json pass/fail log superseded by success_rate.json:

```json
{
    "BabyAGI": {
        "TestWriteFile": [false, false]
    },
    "gpt-engineer": {
        "TestWriteFile": [true, false]
    },
    "mini-agi": {
        "TestBasicMemory": [true, true, true, true, true, false, false, true, false],
        "TestBasicRetrieval": [true, true, true, true, true, true],
        "TestReadFile": [true, true, true, true, true, true],
        "TestSearch": [true, true, true, true, true, true],
        "TestWriteFile": [true, true, true, true, true],
        "TestRetrieval2.2": [false, false, false, false],
        "TestRetrieval2.1": [false, false, false, false, false, false],
        "TestRetrieval2.0": [true, false],
        "TestRememberMultipleIds": [false, false, true, false],
        "TestRememberMultipleIdsWithNoise": [false],
        "TestRememberMultipleWithNoise": [false, true, false],
        "TestRememberMultiplePhrasesWithNoise": [false, false, false, false, false, false, false],
        "TestDebugSimpleTypoWithGuidance": [true, true, true, true, true, true],
        "TestCodeBasic": [false, true, false, false],
        "TestRevenueRetrieval_1.0": [true, true, true, true, true, true],
        "TestRevenueRetrieval_1.1": [false, false, false, false],
        "TestRevenueRetrieval_1.2": [false, false, false, false],
        "TestReturnCode_Simple": [false, false],
        "TestReturnCode_Write": [false, false],
        "TestReturnCode_Modify": [false, false],
        "TestReturnCode_Tests": [false, false],
        "TestPlanCreation": [true],
        "TestGoalDivergence": [false],
        "TestBasicContentGen": [true],
        "TestAdaptSimpleTypoWithGuidance": [false],
        "TestDebugSimpleTypoWithoutGuidance": [true],
        "TestCreateSimpleWebServer": [true],
        "TestGoalLoss_Hard": [false],
        "TestGoalLoss_advanced": [false],
        "TestGoalLoss_Medium": [false],
        "TestGoalLoss_Simple": [false],
        "TestInstructionFollowing": [false],
        "TestAdaptLink": [true],
        "TestFunctionCodeGeneration": [false],
        "TestDebugMultipleTypo": [true],
        "TestThreeSum": [false],
        "TestAdaptTeslaRevenue": [false],
        "TestRetrieval3": [false]
    }
}
```
```diff
@@ -8,11 +8,15 @@ import pytest
 
 from agbenchmark.agent_interface import MOCK_FLAG
 from agbenchmark.reports.ReportManager import ReportManager
-from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH
+from agbenchmark.start_benchmark import (
+    CONFIG_PATH,
+    REGRESSION_TESTS_PATH,
+    REPORTS_PATH,
+    SUCCESS_RATE_PATH,
+)
 from agbenchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig
 from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone
 from agbenchmark.utils.utils import (
     AGENT_NAME,
     calculate_success_percentage,
     get_highest_success_difficulty,
     get_test_path,
```
```diff
@@ -25,10 +29,8 @@ regression_manager = ReportManager(REGRESSION_TESTS_PATH)
 # user facing reporting information
 info_manager = ReportManager(str(Path(REPORTS_PATH) / "report.json"))
 
-INTERNAL_LOGS_PATH = Path(__file__).resolve().parent
-
 # internal db step in replacement track pass/fail rate
-internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json"))
+internal_info = ReportManager(SUCCESS_RATE_PATH)
 
 
 def generate_combined_suite_report(
```
```diff
@@ -112,19 +114,12 @@ def get_previous_test_results(
     agent_tests: dict[str, list[bool]] = {}
     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv
 
-    # if the structure is nested inside of the agent name
-    if AGENT_NAME:
-        agent_tests = internal_info.tests.get(AGENT_NAME, {})
-
-    if agent_tests:
-        prev_test_results = agent_tests.get(test_name, [])
-    else:
-        prev_test_results = internal_info.tests.get(test_name, [])
+    prev_test_results = internal_info.tests.get(test_name, [])
 
     if not mock:
         # only add if it's an actual test
         prev_test_results.append(info_details["metrics"]["success"])
-        internal_info.add_test(test_name, prev_test_results, AGENT_NAME)
+        internal_info.add_test(test_name, prev_test_results)
 
     # can calculate success rate regardless of mock
     info_details["metrics"]["success_%"] = calculate_success_percentage(
```
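`calculate_success_percentage` is imported from `agbenchmark.utils.utils` and not shown in this diff; a hedged sketch assuming the obvious definition (share of True entries, as a percentage) illustrates the bookkeeping around this hunk:

```python
def calculate_success_percentage(results: list[bool]) -> float:
    # Assumed definition for illustration; the real helper lives in
    # agbenchmark.utils.utils and may differ (rounding, windowing, etc.).
    if not results:
        return 0.0
    return round(sum(results) / len(results) * 100, 2)


prev_test_results = [True, True, False]  # e.g. loaded from success_rate.json
prev_test_results.append(True)           # outcome of the current, non-mock run
print(calculate_success_percentage(prev_test_results))  # 75.0
```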
```diff
@@ -21,6 +21,8 @@ HeliconeLockManager.write_custom_property("benchmark_start_time", BENCHMARK_STAR
     CONFIG_PATH,
     REGRESSION_TESTS_PATH,
     REPORTS_PATH,
+    SUCCESS_RATE_PATH,
+    CHALLENGES_PATH,
 ) = calculate_dynamic_paths()
```
```diff
@@ -101,16 +103,8 @@ def start(
     for key, value in config.items():
         print(f"{key}: {value}")
 
-    if not os.path.exists(REGRESSION_TESTS_PATH):
-        with open(REGRESSION_TESTS_PATH, "w"):
-            pass
-
     os.environ["MOCK_TEST"] = "True" if mock else "False"
 
-    if not os.path.exists(Path(REPORTS_PATH) / "report.json"):
-        with open(Path(REPORTS_PATH) / "report.json", "w"):
-            pass
-
     pytest_args = ["-vs"]
     if test:
         print("Running specific test:", test)
```
```diff
@@ -58,9 +58,6 @@ query ExampleQuery($properties: [PropertyFilter!]){
         )
         response.raise_for_status()  # Raises a HTTPError if the response was an unsuccessful status code
 
-        print(f"Response status code: {response.status_code}")
-        print(f"Response text: {response.text}")
-
         data = response.json()
     except requests.HTTPError as http_err:
         print(f"HTTP error occurred: {http_err}")
```
```diff
@@ -72,11 +69,7 @@ query ExampleQuery($properties: [PropertyFilter!]){
         print(f"Other error occurred: {err}")
         raise
 
-    print("this is the data!", data)
-    try:
-        return (
-            data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None)
-        )
-    except Exception as err:
-        print(f"Error occurred: {err}")
-        raise
+    if data is None or data.get("data") is None:
+        raise ValueError("Invalid response received from server: no data")
+
+    return data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None)
```
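The replacement swaps a broad try/except around the return for a single guard clause. A self-contained sketch of the same pattern, with a placeholder endpoint and query (not the real Helicone request):

```python
import requests


def get_cost_sketch() -> float | None:
    # Placeholder endpoint/query; only the validation pattern matters here.
    response = requests.post(
        "https://example.com/graphql", json={"query": "{ cost }"}, timeout=10
    )
    response.raise_for_status()
    data = response.json()

    # Fail loudly on a malformed payload instead of swallowing the error.
    if data is None or data.get("data") is None:
        raise ValueError("Invalid response received from server: no data")

    return data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None)
```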
```diff
@@ -180,21 +180,39 @@ def get_highest_success_difficulty(
     return "No successful tests"
 
 
-def assign_paths(folder_path: Path) -> tuple[str, str, str]:
+def assign_paths(folder_path: Path) -> tuple[str, str, str, str, str]:
     CONFIG_PATH = str(folder_path / "config.json")
-    REGRESSION_TESTS_PATH = str(folder_path / "regression_tests.json")
 
     reports_location = folder_path / "reports"
 
+    # if the user has a locally defined challenges path that they've added tests to
+    CHALLENGES_PATH = str(folder_path / "challenges")
+    if not os.path.exists(CHALLENGES_PATH):
+        Path(__file__).parent / "challenges"
+
     if not os.path.exists(reports_location):
         os.makedirs(reports_location)
 
     # from the ci
     if REPORT_LOCATION:
         reports_location = Path.cwd() / REPORT_LOCATION
 
     REPORTS_PATH = calculate_info_test_path(reports_location)
 
-    return CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH
+    REGRESSION_TESTS_PATH = str(reports_location / "regression_tests.json")
+
+    SUCCESS_RATE_PATH = str(reports_location / "success_rate.json")
+
+    return (
+        CONFIG_PATH,
+        REGRESSION_TESTS_PATH,
+        REPORTS_PATH,
+        SUCCESS_RATE_PATH,
+        CHALLENGES_PATH,
+    )
 
 
-def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
+def calculate_dynamic_paths() -> tuple[Path, str, str, str, str, str]:
     # the default home is where you're running from
     HOME_DIRECTORY = Path(os.getcwd())
     benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark"
```
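One line in the new `assign_paths` looks like a leftover: `Path(__file__).parent / "challenges"` builds a path but discards the result, so `CHALLENGES_PATH` still points at the missing local folder. If a fallback to the packaged challenges was intended, a hypothetical fix (not part of this commit) would assign the result:

```python
import os
from pathlib import Path


def resolve_challenges_path(folder_path: Path) -> str:
    # Prefer a locally defined challenges folder the user has added tests to.
    challenges_path = str(folder_path / "challenges")
    if not os.path.exists(challenges_path):
        # Hypothetical fallback: the challenges shipped next to this module.
        challenges_path = str(Path(__file__).parent / "challenges")
    return challenges_path
```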
```diff
@@ -207,22 +225,47 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
         HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME
         benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark"
 
-        CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH = assign_paths(
-            benchmarks_folder_path
-        )
+        (
+            CONFIG_PATH,
+            REGRESSION_TESTS_PATH,
+            REPORTS_PATH,
+            SUCCESS_RATE_PATH,
+            CHALLENGES_PATH,
+        ) = assign_paths(benchmarks_folder_path)
     else:
         # otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo)
         # used when its just a pip install
-        CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH = assign_paths(
-            benchmarks_folder_path
-        )
+        (
+            CONFIG_PATH,
+            REGRESSION_TESTS_PATH,
+            REPORTS_PATH,
+            SUCCESS_RATE_PATH,
+            CHALLENGES_PATH,
+        ) = assign_paths(benchmarks_folder_path)
 
     if not benchmarks_folder_path.exists():
         benchmarks_folder_path.mkdir(exist_ok=True)
 
+    if not os.path.exists(benchmarks_folder_path / "reports"):
+        os.makedirs(benchmarks_folder_path / "reports")
+
+    if not os.path.exists(REGRESSION_TESTS_PATH):
+        with open(REGRESSION_TESTS_PATH, "w"):
+            pass
+
+    if not os.path.exists(SUCCESS_RATE_PATH):
+        with open(SUCCESS_RATE_PATH, "w"):
+            pass
+
+    if not os.path.exists(Path(REPORTS_PATH) / "report.json"):
+        with open(Path(REPORTS_PATH) / "report.json", "w"):
+            pass
+
     return (
         HOME_DIRECTORY,
         CONFIG_PATH,
         REGRESSION_TESTS_PATH,
         REPORTS_PATH,
+        SUCCESS_RATE_PATH,
+        CHALLENGES_PATH,
     )
```
Submodule agent/Auto-GPT updated: b7f1df3e1d...410a1496ba
Submodule agent/BabyAGI updated: abeae86c8a...16f1b9519f
Submodule agent/SuperAGI updated: ae3b89a325...646f33a761
Submodule agent/gpt-engineer updated: 9bb81041ac...47bc50b71c
Submodule agent/mini-agi updated: 3e83765fa5...2fc70aa003
Submodule agent/smol-developer updated: a23d01369c...2bdb7f24a8