From f9fea473f5f6f392a04e4496b0b528353040c6dd Mon Sep 17 00:00:00 2001
From: Silen Naihin <silen.naihin@gmail.com>
Date: Mon, 31 Jul 2023 21:59:47 +0100
Subject: [PATCH] Refactoring for TDD (#222)

---
 .github/PULL_REQUEST_TEMPLATE.md            |   3 +-
 .gitmodules                                 |  18 +-
 agbenchmark/agent_interface.py              |  12 +-
 agbenchmark/generate_test.py                |   4 +-
 agbenchmark/reports/ReportManager.py        |  16 +-
 agbenchmark/reports/internal_info.json      | 200 --------------------
 agbenchmark/reports/reports.py              |  23 +--
 agbenchmark/start_benchmark.py              |  10 +-
 agbenchmark/utils/get_data_from_helicone.py |  15 +-
 agbenchmark/utils/utils.py                  |  63 +++++-
 agent/Auto-GPT                              |   2 +-
 agent/BabyAGI                               |   2 +-
 agent/SuperAGI                              |   2 +-
 agent/gpt-engineer                          |   2 +-
 agent/mini-agi                              |   2 +-
 agent/smol-developer                        |   2 +-
 16 files changed, 97 insertions(+), 279 deletions(-)
 delete mode 100644 agbenchmark/reports/internal_info.json

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 5144742f..22e4f1b5 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -13,5 +13,6 @@
   black . --exclude test.py
   isort .
   mypy .
-  autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring --in-place agbenchmark
+  autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring agbenchmark
+agbenchmark/start_benchmark.py
   ```
diff --git a/.gitmodules b/.gitmodules
index 7d817ec3..cb83ef33 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,7 @@
 [submodule "agent/Auto-GPT"]
-	path = agent/Auto-GPT
-	url = https://github.com/Significant-Gravitas/Auto-GPT
-	branch = master
+	path = agent/Auto-GPT
+	url = https://github.com/Significant-Gravitas/Auto-GPT
+	branch = master
 [submodule "agent/gpt-engineer"]
 	path = agent/gpt-engineer
 	url = https://github.com/merwanehamadi/gpt-engineer.git
@@ -23,10 +23,10 @@
 	url = https://github.com/SilenNaihin/babyagi.git
 	branch = benchmark-integration
 [submodule "agent/beebot"]
-	path = agent/beebot
-	url = https://github.com/AutoPackAI/beebot.git
-	branch = main
+	path = agent/beebot
+	url = https://github.com/AutoPackAI/beebot.git
+	branch = main
 [submodule "agbenchmark/challenges"]
-	path = agbenchmark/challenges
-	url = https://github.com/SilenNaihin/agbenchmark_challenges.git
-	branch = main
+	path = agbenchmark/challenges
+	url = https://github.com/SilenNaihin/agbenchmark_challenges.git
+	branch = main
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index 4087151a..adcd2944 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -48,11 +48,13 @@ def run_agent(
         start_time = time.time()
 
         while True:
-
-            # This checks if there's data to be read from stdout without blocking.
-            if process.stdout and select.select([process.stdout], [], [], 0)[0]:
-                output = process.stdout.readline()
-                print(output.strip())
+            try:
+                # This checks if there's data to be read from stdout without blocking.
+                if process.stdout and select.select([process.stdout], [], [], 0)[0]:
+                    output = process.stdout.readline()
+                    print(output.strip())
+            except Exception as e:
+                print("Error reading stdout", e)
 
             # Check if process has ended, has no more output, or exceeded timeout
             if process.poll() is not None or (time.time() - start_time > timeout):
diff --git a/agbenchmark/generate_test.py b/agbenchmark/generate_test.py
index 92875183..180893f7 100644
--- a/agbenchmark/generate_test.py
+++ b/agbenchmark/generate_test.py
@@ -8,7 +8,7 @@ from typing import Any, Dict, Optional
 
 import pytest
 
-from agbenchmark.start_benchmark import CURRENT_DIRECTORY, get_regression_data
+from agbenchmark.start_benchmark import CHALLENGES_PATH, get_regression_data
 from agbenchmark.utils.challenge import Challenge
 from agbenchmark.utils.data_types import ChallengeData, SuiteConfig
 from agbenchmark.utils.utils import get_test_path
@@ -158,7 +158,7 @@ def create_challenge(
 def generate_tests() -> None:  # sourcery skip: invert-any-all
     print("Generating tests...")
 
-    json_files = deque(glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True))
+    json_files = deque(glob.glob(f"{CHALLENGES_PATH}/**/data.json", recursive=True))
     regression_tests = get_regression_data()
 
     # for suites to know if the file has already been used to generate the tests
diff --git a/agbenchmark/reports/ReportManager.py b/agbenchmark/reports/ReportManager.py
index def8946a..d0669dff 100644
--- a/agbenchmark/reports/ReportManager.py
+++ b/agbenchmark/reports/ReportManager.py
@@ -4,7 +4,7 @@ import sys
 import time
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any, Dict
 
 from agbenchmark.reports.processing.graphs import save_single_radar_chart
 from agbenchmark.reports.processing.process_report import get_agent_category
@@ -42,18 +42,8 @@ class ReportManager:
         with open(self.filename, "w") as f:
             json.dump(self.tests, f, indent=4)
 
-    def add_test(
-        self,
-        test_name: str,
-        test_details: dict | list,
-        agent_name: Optional[str] = None,
-    ) -> None:
-        if agent_name:
-            if agent_name not in self.tests:
-                self.tests[agent_name] = {}
-            self.tests[agent_name][test_name] = test_details
-        else:
-            self.tests[test_name] = test_details
+    def add_test(self, test_name: str, test_details: dict | list) -> None:
+        self.tests[test_name] = test_details
 
         self.save()
 
diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json
deleted file mode 100644
index a3d50b49..00000000
--- a/agbenchmark/reports/internal_info.json
+++ /dev/null
@@ -1,200 +0,0 @@
-{
-    "BabyAGI": {
-        "TestWriteFile": [
-            false,
-            false
-        ]
-    },
-    "gpt-engineer": {
-        "TestWriteFile": [
-            true,
-            false
-        ]
-    },
-    "mini-agi": {
-        "TestBasicMemory": [
-            true,
-            true,
-            true,
-            true,
-            true,
-            false,
-            false,
-            true,
-            false
-        ],
-        "TestBasicRetrieval": [
-            true,
-            true,
-            true,
-            true,
-            true,
-            true
-        ],
-        "TestReadFile": [
-            true,
-            true,
-            true,
-            true,
-            true,
-            true
-        ],
-        "TestSearch": [
-            true,
-            true,
-            true,
-            true,
-            true,
-            true
-        ],
-        "TestWriteFile": [
-            true,
-            true,
-            true,
-            true,
-            true
-        ],
-        "TestRetrieval2.2": [
-            false,
-            false,
-            false,
-            false
-        ],
-        "TestRetrieval2.1": [
-            false,
-            false,
-            false,
-            false,
-            false,
-            false
-        ],
-        "TestRetrieval2.0": [
-            true,
-            false
-        ],
-        "TestRememberMultipleIds": [
-            false,
-            false,
-            true,
-            false
-        ],
-        "TestRememberMultipleIdsWithNoise": [
-            false
-        ],
-        "TestRememberMultipleWithNoise": [
-            false,
-            true,
-            false
-        ],
-        "TestRememberMultiplePhrasesWithNoise": [
-            false,
-            false,
-            false,
-            false,
-            false,
-            false,
-            false
-        ],
-        "TestDebugSimpleTypoWithGuidance": [
-            true,
-            true,
-            true,
-            true,
-            true,
-            true
-        ],
-        "TestCodeBasic": [
-            false,
-            true,
-            false,
-            false
-        ],
-        "TestRevenueRetrieval_1.0": [
-            true,
-            true,
-            true,
-            true,
-            true,
-            true
-        ],
-        "TestRevenueRetrieval_1.1": [
-            false,
-            false,
-            false,
-            false
-        ],
-        "TestRevenueRetrieval_1.2": [
-            false,
-            false,
-            false,
-            false
-        ],
-        "TestReturnCode_Simple": [
-            false,
-            false
-        ],
-        "TestReturnCode_Write": [
-            false,
-            false
-        ],
-        "TestReturnCode_Modify": [
-            false,
-            false
-        ],
-        "TestReturnCode_Tests": [
-            false,
-            false
-        ],
-        "TestPlanCreation": [
-            true
-        ],
-        "TestGoalDivergence": [
-            false
-        ],
-        "TestBasicContentGen": [
-            true
-        ],
-        "TestAdaptSimpleTypoWithGuidance": [
-            false
-        ],
-        "TestDebugSimpleTypoWithoutGuidance": [
-            true
-        ],
-        "TestCreateSimpleWebServer": [
-            true
-        ],
-        "TestGoalLoss_Hard": [
-            false
-        ],
-        "TestGoalLoss_advanced": [
-            false
-        ],
-        "TestGoalLoss_Medium": [
-            false
-        ],
-        "TestGoalLoss_Simple": [
-            false
-        ],
-        "TestInstructionFollowing": [
-            false
-        ],
-        "TestAdaptLink": [
-            true
-        ],
-        "TestFunctionCodeGeneration": [
-            false
-        ],
-        "TestDebugMultipleTypo": [
-            true
-        ],
-        "TestThreeSum": [
-            false
-        ],
-        "TestAdaptTeslaRevenue": [
-            false
-        ],
-        "TestRetrieval3": [
-            false
-        ]
-    }
-}
\ No newline at end of file
diff --git a/agbenchmark/reports/reports.py b/agbenchmark/reports/reports.py
index 0cce471d..53af7de8 100644
--- a/agbenchmark/reports/reports.py
+++ b/agbenchmark/reports/reports.py
@@ -8,11 +8,15 @@ import pytest
 
 from agbenchmark.agent_interface import MOCK_FLAG
 from agbenchmark.reports.ReportManager import ReportManager
-from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH
+from agbenchmark.start_benchmark import (
+    CONFIG_PATH,
+    REGRESSION_TESTS_PATH,
+    REPORTS_PATH,
+    SUCCESS_RATE_PATH,
+)
 from agbenchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig
 from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone
 from agbenchmark.utils.utils import (
-    AGENT_NAME,
     calculate_success_percentage,
     get_highest_success_difficulty,
     get_test_path,
@@ -25,10 +29,8 @@ regression_manager = ReportManager(REGRESSION_TESTS_PATH)
 # user facing reporting information
 info_manager = ReportManager(str(Path(REPORTS_PATH) / "report.json"))
 
-INTERNAL_LOGS_PATH = Path(__file__).resolve().parent
-
 # internal db step in replacement track pass/fail rate
-internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json"))
+internal_info = ReportManager(SUCCESS_RATE_PATH)
 
 
 def generate_combined_suite_report(
@@ -112,19 +114,12 @@ def get_previous_test_results(
     agent_tests: dict[str, list[bool]] = {}
     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv
 
-    # if the structure is nested inside of the agent name
-    if AGENT_NAME:
-        agent_tests = internal_info.tests.get(AGENT_NAME, {})
-
-    if agent_tests:
-        prev_test_results = agent_tests.get(test_name, [])
-    else:
-        prev_test_results = internal_info.tests.get(test_name, [])
+    prev_test_results = internal_info.tests.get(test_name, [])
 
     if not mock:
         # only add if it's an actual test
         prev_test_results.append(info_details["metrics"]["success"])
-        internal_info.add_test(test_name, prev_test_results, AGENT_NAME)
+        internal_info.add_test(test_name, prev_test_results)
 
     # can calculate success rate regardless of mock
     info_details["metrics"]["success_%"] = calculate_success_percentage(
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 6d77d125..26856d5e 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -21,6 +21,8 @@ HeliconeLockManager.write_custom_property("benchmark_start_time", BENCHMARK_STAR
     CONFIG_PATH,
     REGRESSION_TESTS_PATH,
     REPORTS_PATH,
+    SUCCESS_RATE_PATH,
+    CHALLENGES_PATH,
 ) = calculate_dynamic_paths()
 
 
@@ -101,16 +103,8 @@ def start(
     for key, value in config.items():
         print(f"{key}: {value}")
 
-    if not os.path.exists(REGRESSION_TESTS_PATH):
-        with open(REGRESSION_TESTS_PATH, "w"):
-            pass
-
     os.environ["MOCK_TEST"] = "True" if mock else "False"
 
-    if not os.path.exists(Path(REPORTS_PATH) / "report.json"):
-        with open(Path(REPORTS_PATH) / "report.json", "w"):
-            pass
-
     pytest_args = ["-vs"]
     if test:
         print("Running specific test:", test)
diff --git a/agbenchmark/utils/get_data_from_helicone.py b/agbenchmark/utils/get_data_from_helicone.py
index b7ac78a0..bae27a6d 100644
--- a/agbenchmark/utils/get_data_from_helicone.py
+++ b/agbenchmark/utils/get_data_from_helicone.py
@@ -58,9 +58,6 @@ query ExampleQuery($properties: [PropertyFilter!]){
         )
         response.raise_for_status()  # Raises a HTTPError if the response was an unsuccessful status code
 
-        print(f"Response status code: {response.status_code}")
-        print(f"Response text: {response.text}")
-
         data = response.json()
     except requests.HTTPError as http_err:
         print(f"HTTP error occurred: {http_err}")
@@ -72,11 +69,7 @@ query ExampleQuery($properties: [PropertyFilter!]){
         print(f"Other error occurred: {err}")
         raise
 
-    print("this is the data!", data)
-    try:
-        return (
-            data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None)
-        )
-    except Exception as err:
-        print(f"Error occurred: {err}")
-        raise
+    if data is None or data.get("data") is None:
+        raise ValueError("Invalid response received from server: no data")
+
+    return data.get("data", {}).get("aggregatedHeliconeRequest", {}).get("cost", None)
diff --git a/agbenchmark/utils/utils.py b/agbenchmark/utils/utils.py
index a1e3bbe4..88fdc889 100644
--- a/agbenchmark/utils/utils.py
+++ b/agbenchmark/utils/utils.py
@@ -180,21 +180,39 @@ def get_highest_success_difficulty(
     return "No successful tests"
 
 
-def assign_paths(folder_path: Path) -> tuple[str, str, str]:
+def assign_paths(folder_path: Path) -> tuple[str, str, str, str, str]:
     CONFIG_PATH = str(folder_path / "config.json")
-    REGRESSION_TESTS_PATH = str(folder_path / "regression_tests.json")
 
     reports_location = folder_path / "reports"
+
+    # use the user's local challenges folder if present, else the packaged one
+    CHALLENGES_PATH = str(folder_path / "challenges")
+    if not os.path.exists(CHALLENGES_PATH):
+        CHALLENGES_PATH = str(Path(__file__).parent.parent / "challenges")
+
+    if not os.path.exists(reports_location):
+        os.makedirs(reports_location)
+
     # from the ci
     if REPORT_LOCATION:
         reports_location = Path.cwd() / REPORT_LOCATION
 
     REPORTS_PATH = calculate_info_test_path(reports_location)
 
-    return CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH
+    REGRESSION_TESTS_PATH = str(reports_location / "regression_tests.json")
+
+    SUCCESS_RATE_PATH = str(reports_location / "success_rate.json")
+
+    return (
+        CONFIG_PATH,
+        REGRESSION_TESTS_PATH,
+        REPORTS_PATH,
+        SUCCESS_RATE_PATH,
+        CHALLENGES_PATH,
+    )
 
 
-def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
+def calculate_dynamic_paths() -> tuple[Path, str, str, str, str, str]:
     # the default home is where you're running from
     HOME_DIRECTORY = Path(os.getcwd())
     benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark"
@@ -207,22 +225,47 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
         HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME
         benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark"
 
-        CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH = assign_paths(
-            benchmarks_folder_path
-        )
+        (
+            CONFIG_PATH,
+            REGRESSION_TESTS_PATH,
+            REPORTS_PATH,
+            SUCCESS_RATE_PATH,
+            CHALLENGES_PATH,
+        ) = assign_paths(benchmarks_folder_path)
     else:
         # otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo)
         # used when its just a pip install
-        CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH = assign_paths(
-            benchmarks_folder_path
-        )
+        (
+            CONFIG_PATH,
+            REGRESSION_TESTS_PATH,
+            REPORTS_PATH,
+            SUCCESS_RATE_PATH,
+            CHALLENGES_PATH,
+        ) = assign_paths(benchmarks_folder_path)
 
     if not benchmarks_folder_path.exists():
         benchmarks_folder_path.mkdir(exist_ok=True)
 
+    if not os.path.exists(benchmarks_folder_path / "reports"):
+        os.makedirs(benchmarks_folder_path / "reports")
+
+    if not os.path.exists(REGRESSION_TESTS_PATH):
+        with open(REGRESSION_TESTS_PATH, "w"):
+            pass
+
+    if not os.path.exists(SUCCESS_RATE_PATH):
+        with open(SUCCESS_RATE_PATH, "w"):
+            pass
+
+    if not os.path.exists(Path(REPORTS_PATH) / "report.json"):
+        with open(Path(REPORTS_PATH) / "report.json", "w"):
+            pass
+
     return (
         HOME_DIRECTORY,
         CONFIG_PATH,
         REGRESSION_TESTS_PATH,
         REPORTS_PATH,
+        SUCCESS_RATE_PATH,
+        CHALLENGES_PATH,
     )
diff --git a/agent/Auto-GPT b/agent/Auto-GPT
index b7f1df3e..410a1496 160000
--- a/agent/Auto-GPT
+++ b/agent/Auto-GPT
@@ -1 +1 @@
-Subproject commit b7f1df3e1d397edb4f3a7168a929dc762280f597
+Subproject commit 410a1496bae94a2dddd2d4eac1308e34b03d9f39
diff --git a/agent/BabyAGI b/agent/BabyAGI
index abeae86c..16f1b951 160000
--- a/agent/BabyAGI
+++ b/agent/BabyAGI
@@ -1 +1 @@
-Subproject commit abeae86c8a0d9ae802a9bf4243a4c950a319e8f3
+Subproject commit 16f1b9519fea5543695203be0262a1b41c77cbba
diff --git a/agent/SuperAGI b/agent/SuperAGI
index ae3b89a3..646f33a7 160000
--- a/agent/SuperAGI
+++ b/agent/SuperAGI
@@ -1 +1 @@
-Subproject commit ae3b89a325994c9dda74b5de39d6f7c48010270f
+Subproject commit 646f33a761d8332821aeb4a5dc167b619d907c50
diff --git a/agent/gpt-engineer b/agent/gpt-engineer
index 9bb81041..47bc50b7 160000
--- a/agent/gpt-engineer
+++ b/agent/gpt-engineer
@@ -1 +1 @@
-Subproject commit 9bb81041ace9f09e8ea0e34e29f2e46bb9d46a36
+Subproject commit 47bc50b71c0465349a6489e0170792c0018472f3
diff --git a/agent/mini-agi b/agent/mini-agi
index 3e83765f..2fc70aa0 160000
--- a/agent/mini-agi
+++ b/agent/mini-agi
@@ -1 +1 @@
-Subproject commit 3e83765fa54d240c80d0f9578083d5b11fe34ce4
+Subproject commit 2fc70aa0032eec986dfb1020854a1b3b8aaf6780
diff --git a/agent/smol-developer b/agent/smol-developer
index a23d0136..2bdb7f24 160000
--- a/agent/smol-developer
+++ b/agent/smol-developer
@@ -1 +1 @@
-Subproject commit a23d01369cea976e80b7889fdbf1096619471301
+Subproject commit 2bdb7f24a8d28c2e8eac402cfd4fbda7dbc1ba8f