From 8df82909b2938424d387cdaa817821adcbee1dac Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Mon, 10 Jul 2023 22:25:19 -0400
Subject: [PATCH] Added --test, consolidate files, reports working (#83)

---
 ...{RegressionManager.py => ReportManager.py} |  21 ++-
 agbenchmark/agent_interface.py                |  14 +-
 agbenchmark/challenges/define_task_types.py   |   6 +
 .../challenges/interface/search/data.json     |   2 +-
 agbenchmark/challenges/test_all.py            |   4 +-
 config.json => agbenchmark/config.json        |   3 +-
 agbenchmark/conftest.py                       |  19 ++-
 .../regression_tests.json                     | 125 ++++++++++--------
 agbenchmark/reports/1.json                    | 109 +++++++++++++++
 agbenchmark/start_benchmark.py                |  57 +++++---
 agbenchmark/utils.py                          |  16 +++
 agent/Auto-GPT                                |   2 +-
 agent/SuperAGI                                |   2 +-
 agent/config_example.json                     |   3 +-
 agent/gpt-engineer                            |   2 +-
 agent/mini-agi                                |   2 +-
 agent/smol-developer                          |   2 +-
 mypy.ini                                      |   2 +-
 18 files changed, 289 insertions(+), 102 deletions(-)
 rename agbenchmark/{RegressionManager.py => ReportManager.py} (75%)
 rename config.json => agbenchmark/config.json (53%)
 rename regression_tests.json => agbenchmark/regression_tests.json (62%)
 create mode 100644 agbenchmark/reports/1.json

diff --git a/agbenchmark/RegressionManager.py b/agbenchmark/ReportManager.py
similarity index 75%
rename from agbenchmark/RegressionManager.py
rename to agbenchmark/ReportManager.py
index ac9efc69..e6d8f62f 100644
--- a/agbenchmark/RegressionManager.py
+++ b/agbenchmark/ReportManager.py
@@ -1,12 +1,17 @@
 import json
-from typing import Union
+import os
+import sys
+import time
+from datetime import datetime
+from typing import Any, Dict, Union
 
 
-class RegressionManager:
+class ReportManager:
     """Abstracts interaction with the regression tests file"""
 
     def __init__(self, filename: str):
         self.filename = filename
+        self.start_time = time.time()
         self.load()
 
     def load(self) -> None:
@@ -40,6 +45,18 @@ class RegressionManager:
         del self.tests[test_name]
         self.save()
 
+    def end_info_report(self, config: Dict[str, Any]) -> None:
+        command = " ".join(sys.argv)
+        self.tests = {
+            "command": command.split(os.sep)[-1],
+            "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
+            "time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds",
+            "tests": self.tests,
+            "config": config,
+        }
+
+        self.save()
+
     def replace_backslash(self, value: str) -> Union[str, list[str], dict]:
         if isinstance(value, str):
             return value.replace("\\\\", "/")  # escape \ with \\
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index 1d43577c..d058ad4c 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -3,6 +3,7 @@ import shutil
 import subprocess
 import sys
 import time
+from pathlib import Path
 from typing import Any, Dict
 
 from dotenv import load_dotenv
@@ -21,6 +22,7 @@ def run_agent(
     """Calling to get a response"""
 
     if MOCK_FLAG:
+        print("IT'S A MOCK TEST", challenge_location)
         copy_artifacts_into_workspace(
             config["workspace"], "artifacts_out", challenge_location
         )
@@ -30,19 +32,13 @@ def run_agent(
         f"Running Python function '{config['entry_path']}' with timeout {timeout}"
     )
 
-    # Get the current working directory
-    cwd = os.path.join(os.getcwd(), config["home_path"])
-
-    # Add current directory to Python's import path
-    sys.path.append(cwd)
-
     command = [sys.executable, config["entry_path"], str(task)]
     process = subprocess.Popen(
         command,
         stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
         universal_newlines=True,
-        cwd=cwd,
+        cwd=os.getcwd(),
     )
 
     start_time = time.time()
@@ -79,7 +75,9 @@ def run_agent(
 def copy_artifacts_into_workspace(
     workspace: str, artifact_folder_name: str, challenge_dir_path: str
 ) -> None:
-    source_dir = os.path.join(challenge_dir_path, artifact_folder_name)
+    # this file is at agbenchmark\agent_interface.py
+    script_dir = Path(__file__).resolve().parent.parent
+    source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name)
 
     # Check if source_dir exists, if not then return immediately.
     if not os.path.exists(source_dir):
diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py
index 94cba5b7..f4e3f222 100644
--- a/agbenchmark/challenges/define_task_types.py
+++ b/agbenchmark/challenges/define_task_types.py
@@ -1,4 +1,5 @@
 import json
+from pathlib import Path
 from typing import List, Optional
 
 from pydantic import BaseModel
@@ -32,7 +33,12 @@ class ChallengeData(BaseModel):
 
     @staticmethod
     def deserialize(path: str) -> "ChallengeData":
+        # this script is in root/agbenchmark/challenges/define_task_types.py
+        script_dir = Path(__file__).resolve().parent.parent.parent
+        path = str(script_dir / path)
+
+        print("Deserializing", path)
         with open(path, "r") as file:
             data = json.load(file)
         return ChallengeData(**data)
diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json
index 17ee1ac1..f59b2dc9 100644
--- a/agbenchmark/challenges/interface/search/data.json
+++ b/agbenchmark/challenges/interface/search/data.json
@@ -2,7 +2,7 @@
   "name": "TestSearch",
   "category": ["interface"],
   "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file",
-  "dependencies": [],
+  "dependencies": ["TestWriteFile"],
   "ground": {
     "answer": "This is a Heading\nThis is a paragraph.",
     "should_contain": ["Heading", "paragraph"],
diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py
index 7dee0b2a..f8bb2347 100644
--- a/agbenchmark/challenges/test_all.py
+++ b/agbenchmark/challenges/test_all.py
@@ -19,7 +19,7 @@ load_dotenv()
 
 IMPROVE = os.getenv("IMPROVE", "False")
 
-json_files = glob.glob(f"{CURRENT_DIRECTORY}/challenges/**/data.json", recursive=True)
+json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)
 
 
 def get_test_path(json_file: str) -> str:
@@ -55,7 +55,7 @@ def generate_tests() -> None:
         )
         sys.path.append(str(custom_python_location))
 
-        for (module_loader, name, ispkg) in pkgutil.iter_modules(
+        for module_loader, name, ispkg in pkgutil.iter_modules(
             [str(custom_python_location)]
         ):
             module = importlib.import_module(name)
diff --git a/config.json b/agbenchmark/config.json
similarity index 53%
rename from config.json
rename to agbenchmark/config.json
index 8bbcebdb..9dd8b16a 100644
--- a/config.json
+++ b/agbenchmark/config.json
@@ -1,6 +1,5 @@
 {
   "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-  "entry_path": "benchmarks.py",
-  "home_path": "agent/mini-agi",
+  "entry_path": "agbenchmark/benchmarks.py",
   "cutoff": 60
 }
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index e321f5a2..87fdc9c1 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -6,9 +6,10 @@ from typing import Any, Dict, Generator
 
 import pytest
 
-from agbenchmark.RegressionManager import RegressionManager
+from agbenchmark.ReportManager import ReportManager
 from agbenchmark.start_benchmark import (
     CONFIG_PATH,
+    INFO_TESTS_PATH,
     REGRESSION_TESTS_PATH,
     get_regression_data,
 )
@@ -106,7 +107,8 @@ def challenge_data(request: Any) -> None:
     return request.param
 
 
-regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
+regression_manager = ReportManager(REGRESSION_TESTS_PATH)
+info_manager = ReportManager(INFO_TESTS_PATH)
 
 
 def pytest_runtest_makereport(item: Any, call: Any) -> None:
@@ -130,12 +132,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
     print("pytest_runtest_makereport", test_details)
     if call.excinfo is None:
         regression_manager.add_test(item.nodeid.split("::")[1], test_details)
+        test_details["success"] = True
     else:
         regression_manager.remove_test(item.nodeid.split("::")[1])
+        test_details["success"] = False
+        test_details["fail_reason"] = str(call.excinfo.value)
+
+    info_manager.add_test(item.nodeid.split("::")[1], test_details)
 
 
-def pytest_sessionfinish() -> None:
-    """Called at the end of the session to save regression tests"""
+def pytest_sessionfinish(session: Any) -> None:
+    """Called at the end of the session to save regression tests and info"""
+    with open(CONFIG_PATH, "r") as f:
+        config = json.load(f)
+
+    info_manager.end_info_report(config)
     regression_manager.save()
diff --git a/regression_tests.json b/agbenchmark/regression_tests.json
similarity index 62%
rename from regression_tests.json
rename to agbenchmark/regression_tests.json
index 0cf2d5f3..68632a12 100644
--- a/regression_tests.json
+++ b/agbenchmark/regression_tests.json
@@ -1,11 +1,20 @@
 {
+  "TestReadFile": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestWriteFile"
+    ],
+    "test": "agbenchmark/challenges/interface/read_file",
+    "success": true
+  },
   "TestBasicMemory": {
     "difficulty": "basic",
     "dependencies": [
       "TestReadFile",
       "TestWriteFile"
     ],
-    "test": "agbenchmark/challenges/memory/m1"
+    "test": "agbenchmark/challenges/memory/m1",
+    "success": true
   },
   "TestBasicRetrieval": {
     "difficulty": "basic",
@@ -13,12 +22,62 @@
       "TestWriteFile",
       "TestSearch"
     ],
-    "test": "agbenchmark/challenges/retrieval/r1"
+    "test": "agbenchmark/challenges/retrieval/r1",
+    "success": true
   },
-  "TestCreateSimpleWebServer": {
+  "TestRememberMultipleIds": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestBasicMemory"
+    ],
+    "test": "agbenchmark/challenges/memory/m2",
+    "success": true
+  },
+  "TestRetrieval2": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestBasicRetrieval"
+    ],
+    "test": "agbenchmark/challenges/retrieval/r2",
+    "success": true
+  },
+  "TestRememberMultipleIdsWithNoise": {
+    "difficulty": "medium",
+    "dependencies": [
+      "TestRememberMultipleIds"
+    ],
+    "test": "agbenchmark/challenges/memory/m3",
+    "success": true
+  },
+  "TestRetrieval3": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestRetrieval2"
+    ],
+    "test": "agbenchmark/challenges/retrieval/r3",
+    "success": true
+  },
+  "TestRememberMultiplePhrasesWithNoise": {
+    "difficulty": "medium",
+    "dependencies": [
+      "TestRememberMultipleIdsWithNoise"
+    ],
+    "test": "agbenchmark/challenges/memory/m4",
+    "success": true
+  },
+  "TestSearch": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestWriteFile"
+    ],
+    "test": "agbenchmark/challenges/interface/search",
+    "success": true
+  },
+  "TestWriteFile": {
     "difficulty": "basic",
     "dependencies": [],
-    "test": "agbenchmark/challenges/code/d3"
+    "test": "agbenchmark/challenges/interface/write_file",
+    "success": true
   },
   "TestDebugSimpleTypoWithGuidance": {
     "difficulty": "basic",
@@ -26,65 +85,15 @@
       "TestReadFile",
       "TestWriteFile"
     ],
-    "test": "agbenchmark/challenges/code/d1"
+    "test": "agbenchmark/challenges/code/d1",
+    "success": true
   },
   "TestDebugSimpleTypoWithoutGuidance": {
     "difficulty": "medium",
     "dependencies": [
       "TestDebugSimpleTypoWithGuidance"
     ],
-    "test": "agbenchmark/challenges/code/d2"
-  },
-  "TestReadFile": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestWriteFile"
-    ],
-    "test": "agbenchmark/challenges/interface/read_file"
-  },
-  "TestRememberMultipleIds": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestBasicMemory"
-    ],
-    "test": "agbenchmark/challenges/memory/m2"
-  },
-  "TestRememberMultipleIdsWithNoise": {
-    "difficulty": "medium",
-    "dependencies": [
-      "TestRememberMultipleIds"
-    ],
-    "test": "agbenchmark/challenges/memory/m3"
-  },
-  "TestRememberMultiplePhrasesWithNoise": {
-    "difficulty": "medium",
-    "dependencies": [
-      "TestRememberMultipleIdsWithNoise"
-    ],
-    "test": "agbenchmark/challenges/memory/m4"
-  },
-  "TestRetrieval2": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestBasicRetrieval"
-    ],
-    "test": "agbenchmark/challenges/retrieval/r2"
-  },
-  "TestRetrieval3": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestRetrieval2"
-    ],
-    "test": "agbenchmark/challenges/retrieval/r3"
-  },
-  "TestSearch": {
-    "difficulty": "basic",
-    "dependencies": [],
-    "test": "agbenchmark/challenges/interface/search"
-  },
-  "TestWriteFile": {
-    "difficulty": "basic",
-    "dependencies": [],
-    "test": "agbenchmark/challenges/interface/write_file"
+    "test": "agbenchmark/challenges/code/d2",
+    "success": true
   }
 }
\ No newline at end of file
diff --git a/agbenchmark/reports/1.json b/agbenchmark/reports/1.json
new file mode 100644
index 00000000..df07fb87
--- /dev/null
+++ b/agbenchmark/reports/1.json
@@ -0,0 +1,109 @@
+{
+  "command": "agbenchmark start --mock",
+  "completion_time": "2023-07-10-21:19",
+  "time_elapsed": "8.75 seconds",
+  "tests": {
+    "TestWriteFile": {
+      "difficulty": "basic",
+      "dependencies": [],
+      "test": "agbenchmark/challenges/interface/write_file",
+      "success": true
+    },
+    "TestReadFile": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestWriteFile"
+      ],
+      "test": "agbenchmark/challenges/interface/read_file",
+      "success": true
+    },
+    "TestSearch": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestWriteFile"
+      ],
+      "test": "agbenchmark/challenges/interface/search",
+      "success": true
+    },
+    "TestDebugSimpleTypoWithGuidance": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestReadFile",
+        "TestWriteFile"
+      ],
+      "test": "agbenchmark/challenges/code/d1",
+      "success": true
+    },
+    "TestBasicMemory": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestReadFile",
+        "TestWriteFile"
+      ],
+      "test": "agbenchmark/challenges/memory/m1",
+      "success": true
+    },
+    "TestBasicRetrieval": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestWriteFile",
+        "TestSearch"
+      ],
+      "test": "agbenchmark/challenges/retrieval/r1",
+      "success": true
+    },
+    "TestDebugSimpleTypoWithoutGuidance": {
+      "difficulty": "medium",
+      "dependencies": [
+        "TestDebugSimpleTypoWithGuidance"
+      ],
+      "test": "agbenchmark/challenges/code/d2",
+      "success": true
+    },
+    "TestRememberMultipleIds": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestBasicMemory"
+      ],
+      "test": "agbenchmark/challenges/memory/m2",
+      "success": true
+    },
+    "TestRetrieval2": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestBasicRetrieval"
+      ],
+      "test": "agbenchmark/challenges/retrieval/r2",
+      "success": true
+    },
+    "TestRememberMultipleIdsWithNoise": {
+      "difficulty": "medium",
+      "dependencies": [
+        "TestRememberMultipleIds"
+      ],
+      "test": "agbenchmark/challenges/memory/m3",
+      "success": true
+    },
+    "TestRetrieval3": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestRetrieval2"
+      ],
+      "test": "agbenchmark/challenges/retrieval/r3",
+      "success": true
+    },
+    "TestRememberMultiplePhrasesWithNoise": {
+      "difficulty": "medium",
+      "dependencies": [
+        "TestRememberMultipleIdsWithNoise"
+      ],
+      "test": "agbenchmark/challenges/memory/m4",
+      "success": true
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+    "entry_path": "agbenchmark/benchmarks.py",
+    "cutoff": 60
+  }
+}
\ No newline at end of file
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 68c7932b..917cd4e8 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -10,12 +10,16 @@ from dotenv import load_dotenv
 
 load_dotenv()
 
+from agbenchmark.utils import calculate_info_test_path
+
 CURRENT_DIRECTORY = Path(__file__).resolve().parent
+benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark"
 
-CONFIG_PATH = str(Path(os.getcwd()) / "config.json")
+CONFIG_PATH = str(benchmarks_folder_path / "config.json")
+REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json")
 
-REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json")
+INFO_TESTS_PATH = calculate_info_test_path(benchmarks_folder_path)
 
 
 @click.group()
 def cli() -> None:
@@ -25,10 +29,11 @@ def cli() -> None:
 
 @cli.command()
 @click.option("--category", default=None, help="Specific category to run")
+@click.option("--test", default=None, help="Specific test to run")
 @click.option("--maintain", is_flag=True, help="Runs only regression tests")
 @click.option("--improve", is_flag=True, help="Run only non-regression tests")
 @click.option("--mock", is_flag=True, help="Run with mock")
-def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
+def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) -> int:
     """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
     # Check if configuration file exists and is not empty
     if maintain and improve:
@@ -37,6 +42,16 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
         )
         return 1
 
+    if test and (category or maintain or improve):
+        print(
+            "Error: If you're running a specific test, make sure no other options are selected. Please pass only --test."
+        )
+        return 1
+
+    if not benchmarks_folder_path.exists():
+        benchmarks_folder_path.mkdir(exist_ok=True)
+
+    print(CONFIG_PATH, os.path.exists(CONFIG_PATH), os.stat(CONFIG_PATH).st_size if os.path.exists(CONFIG_PATH) else 0)
     if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
         config = {}
 
@@ -46,12 +61,12 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
         )
 
         config["entry_path"] = click.prompt(
-            "Please enter a the path to your run_specific_agent function implementation",
-            default="/benchmarks.py",
+            "Please enter the path to your run_specific_agent function implementation within the benchmarks folder",
+            default="benchmarks.py",
         )
 
         config["cutoff"] = click.prompt(
-            "Please enter a hard cutoff runtime for your agent",
+            "Please enter a hard cutoff runtime for your agent per test",
             default="60",
         )
 
@@ -65,7 +80,11 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
     os.environ["MOCK_TEST"] = "True" if mock else "False"
 
     if not os.path.exists(REGRESSION_TESTS_PATH):
-        with open(REGRESSION_TESTS_PATH, "a"):
+        with open(REGRESSION_TESTS_PATH, "w"):
+            pass
+
+    if not os.path.exists(INFO_TESTS_PATH):
+        with open(INFO_TESTS_PATH, "w"):
             pass
 
     print("Current configuration:")
@@ -73,18 +92,22 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
         print(f"{key}: {value}")
 
     pytest_args = ["-vs"]
-    if category:
-        pytest_args.extend(["-m", category])
-        print("Starting benchmark tests ", category)
+    if test:
+        print("Running specific test:", test)
+        pytest_args.extend(["-k", test])
     else:
-        print("Running all categories")
+        if category:
+            pytest_args.extend(["-m", category])
+            print("Running tests of category:", category)
+        else:
+            print("Running all categories")
 
-    if maintain:
-        print("Running only regression tests")
-        pytest_args.append("--maintain")
-    elif improve:
-        print("Running only non-regression tests")
-        pytest_args.append("--improve")
+        if maintain:
+            print("Running only regression tests")
+            pytest_args.append("--maintain")
+        elif improve:
+            print("Running only non-regression tests")
+            pytest_args.append("--improve")
 
     if mock:
         pytest_args.append("--mock")
diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py
index b05a7ac3..ffde0c6d 100644
--- a/agbenchmark/utils.py
+++ b/agbenchmark/utils.py
@@ -1 +1,17 @@
 # radio charts, logs, helper functions for tests, anything else relevant.
+import glob
+from pathlib import Path
+
+
+def calculate_info_test_path(benchmarks_folder_path: Path) -> str:
+    INFO_TESTS_PATH = benchmarks_folder_path / "reports"
+
+    if not INFO_TESTS_PATH.exists():
+        INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True)
+        return str(INFO_TESTS_PATH / "1.json")
+    else:
+        json_files = glob.glob(str(INFO_TESTS_PATH / "*.json"))
+        file_count = len(json_files)
+        run_name = f"{file_count + 1}.json"
+        new_file_path = INFO_TESTS_PATH / run_name
+        return str(new_file_path)
diff --git a/agent/Auto-GPT b/agent/Auto-GPT
index f360d503..dc2a7699 160000
--- a/agent/Auto-GPT
+++ b/agent/Auto-GPT
@@ -1 +1 @@
-Subproject commit f360d503b113119f6b3ce0acff1dbb4dfae2223a
+Subproject commit dc2a76990c75fafacbeaa76eb2e27d48de44cadd
diff --git a/agent/SuperAGI b/agent/SuperAGI
index 7ab2994d..a28224d8 160000
--- a/agent/SuperAGI
+++ b/agent/SuperAGI
@@ -1 +1 @@
-Subproject commit 7ab2994d4b44fa008f9ac27b196f134d27878916
+Subproject commit a28224d82572b598ccee1057086fabaf33e1aaa9
diff --git a/agent/config_example.json b/agent/config_example.json
index ba2ec0b8..7ab65bc2 100644
--- a/agent/config_example.json
+++ b/agent/config_example.json
@@ -1,6 +1,5 @@
 {
   "workspace": "projects/my-new-project/workspace",
-  "entry_path": "benchmarks.py",
-  "home_path": "",
+  "entry_path": "agbenchmark/benchmarks.py",
   "cutoff": 60
 }
diff --git a/agent/gpt-engineer b/agent/gpt-engineer
index 4af8c137..cde9be3e 160000
--- a/agent/gpt-engineer
+++ b/agent/gpt-engineer
@@ -1 +1 @@
-Subproject commit 4af8c137e82cc51fdd31c23327ceffd64194b984
+Subproject commit cde9be3e73212b3d8366a4ed149a18122bfe2333
diff --git a/agent/mini-agi b/agent/mini-agi
index 4af8a7e6..ad2b3450 160000
--- a/agent/mini-agi
+++ b/agent/mini-agi
@@ -1 +1 @@
-Subproject commit 4af8a7e6085f0518f06180fbf87024a2c9db4c88
+Subproject commit ad2b345050e07efb7ad0bde68c93bc2b4e2d7a92
diff --git a/agent/smol-developer b/agent/smol-developer
index a1e4a9ff..c52b14b1 160000
--- a/agent/smol-developer
+++ b/agent/smol-developer
@@ -1 +1 @@
-Subproject commit a1e4a9ff3a75909c4a892e409a55f86a2c57b7c6
+Subproject commit c52b14b1d5b1b74d886f08d9914e7f43437f609d
diff --git a/mypy.ini b/mypy.ini
index 764c239f..d35c6962 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -15,5 +15,5 @@ ignore_errors = True
 [mypy-agbenchmark.mocks.tests.basic_mocks.*]
 ignore_errors = True
 
-[mypy-agbenchmark.tests.regression.RegressionManager.*]
+[mypy-agbenchmark.tests.regression.ReportManager.*]
 ignore_errors = True