Added --test, consolidate files, reports working (#83)

Silen Naihin
2023-07-10 22:25:19 -04:00
committed by GitHub
parent 437e066a66
commit 8df82909b2
18 changed files with 289 additions and 102 deletions

View File

@@ -1,12 +1,17 @@
import json
from typing import Union
import os
import sys
import time
from datetime import datetime
from typing import Any, Dict, Union
class RegressionManager:
class ReportManager:
"""Abstracts interaction with the regression tests file"""
def __init__(self, filename: str):
self.filename = filename
self.start_time = time.time()
self.load()
def load(self) -> None:
@@ -40,6 +45,18 @@ class RegressionManager:
del self.tests[test_name]
self.save()
def end_info_report(self, config: Dict[str, Any]) -> None:
command = " ".join(sys.argv)
self.tests = {
"command": command.split(os.sep)[-1],
"completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
"time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds",
"tests": self.tests,
"config": config,
}
self.save()
def replace_backslash(self, value: str) -> Union[str, list[str], dict]:
if isinstance(value, str):
return value.replace("\\\\", "/")  # replace escaped backslashes with forward slashes
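The hunk above renames RegressionManager to ReportManager and adds end_info_report, which wraps the per-test results in run metadata before the final save. A minimal standalone sketch of that behaviour follows; the class name is a stand-in, and the save() and add_test() bodies are assumptions about the parts of the file the hunk elides (writing self.tests as JSON to self.filename).

import json
import os
import sys
import time
from datetime import datetime
from typing import Any, Dict

class ReportManagerSketch:
    """Stand-in for ReportManager; save() is assumed to dump self.tests to disk."""

    def __init__(self, filename: str) -> None:
        self.filename = filename
        self.start_time = time.time()
        self.tests: Dict[str, Any] = {}

    def save(self) -> None:
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)

    def add_test(self, test_name: str, details: Dict[str, Any]) -> None:
        self.tests[test_name] = details
        self.save()

    def end_info_report(self, config: Dict[str, Any]) -> None:
        # Wrap the collected results in run metadata, mirroring the diff above.
        command = " ".join(sys.argv)
        self.tests = {
            "command": command.split(os.sep)[-1],  # drop any leading path from argv[0]
            "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
            "time_elapsed": f"{round(time.time() - self.start_time, 2)} seconds",
            "tests": self.tests,
            "config": config,
        }
        self.save()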

View File

@@ -3,6 +3,7 @@ import shutil
import subprocess
import sys
import time
from pathlib import Path
from typing import Any, Dict
from dotenv import load_dotenv
@@ -21,6 +22,7 @@ def run_agent(
"""Calling to get a response"""
if MOCK_FLAG:
print("ITS A MOCK TEST", challenge_location)
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
@@ -30,19 +32,13 @@ def run_agent(
f"Running Python function '{config['entry_path']}' with timeout {timeout}"
)
# Get the current working directory
cwd = os.path.join(os.getcwd(), config["home_path"])
# Add current directory to Python's import path
sys.path.append(cwd)
command = [sys.executable, config["entry_path"], str(task)]
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
cwd=cwd,
cwd=os.getcwd(),
)
start_time = time.time()
@@ -79,7 +75,9 @@ def run_agent(
def copy_artifacts_into_workspace(
workspace: str, artifact_folder_name: str, challenge_dir_path: str
) -> None:
source_dir = os.path.join(challenge_dir_path, artifact_folder_name)
# this file is at agbenchmark/agent_interface.py
script_dir = Path(__file__).resolve().parent.parent
source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name)
# Check if source_dir exists, if not then return immediately.
if not os.path.exists(source_dir):
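Two path changes are visible above: the agent subprocess now runs from os.getcwd() instead of a computed home_path, and artifacts are located relative to this module rather than the working directory. A simplified sketch of the artifact copy, with names mirroring the diff; the copy loop is an assumption about the body the hunk cuts off.

import os
import shutil
from pathlib import Path

def copy_artifacts_into_workspace(
    workspace: str, artifact_folder_name: str, challenge_dir_path: str
) -> None:
    # Resolve the artifact folder relative to the package, not the cwd.
    script_dir = Path(__file__).resolve().parent.parent
    source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name)

    # Nothing to copy if this challenge ships no artifacts of that kind.
    if not os.path.exists(source_dir):
        return

    # Assumed continuation: copy every artifact file into the agent's workspace.
    for file_name in os.listdir(source_dir):
        source_path = os.path.join(source_dir, file_name)
        if os.path.isfile(source_path):
            shutil.copy(source_path, workspace)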

View File

@@ -1,4 +1,5 @@
import json
from pathlib import Path
from typing import List, Optional
from pydantic import BaseModel
@@ -32,7 +33,12 @@ class ChallengeData(BaseModel):
@staticmethod
def deserialize(path: str) -> "ChallengeData":
# this script is in root/agbenchmark/challenges/define_task_types.py
script_dir = Path(__file__).resolve().parent.parent.parent
path = str(script_dir / path)
print("Deserializing", path)
with open(path, "r") as file:
data = json.load(file)
return ChallengeData(**data)
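deserialize now resolves the challenge's data.json path against the package root (three parents up from define_task_types.py) instead of trusting the caller's working directory. A minimal stand-in follows; the single name field is a placeholder, since the real model's fields are not shown in full here.

import json
from pathlib import Path

from pydantic import BaseModel

class ChallengeDataSketch(BaseModel):
    # Placeholder field; the real ChallengeData defines task, ground, dependencies, etc.
    name: str

    @staticmethod
    def deserialize(path: str) -> "ChallengeDataSketch":
        # define_task_types.py lives at root/agbenchmark/challenges/, so three
        # .parent hops reach the root that relative challenge paths start from.
        script_dir = Path(__file__).resolve().parent.parent.parent
        full_path = script_dir / path
        with open(full_path, "r") as file:
            data = json.load(file)
        return ChallengeDataSketch(**data)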

View File

@@ -2,7 +2,7 @@
"name": "TestSearch",
"category": ["interface"],
"task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file",
"dependencies": [],
"dependencies": ["TestWriteFile"],
"ground": {
"answer": "This is a Heading\nThis is a paragraph.",
"should_contain": ["Heading", "paragraph"],

View File

@@ -19,7 +19,7 @@ load_dotenv()
IMPROVE = os.getenv("IMPROVE", "False")
json_files = glob.glob(f"{CURRENT_DIRECTORY}/challenges/**/data.json", recursive=True)
json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)
def get_test_path(json_file: str) -> str:
@@ -55,7 +55,7 @@ def generate_tests() -> None:
)
sys.path.append(str(custom_python_location))
for (module_loader, name, ispkg) in pkgutil.iter_modules(
for module_loader, name, ispkg in pkgutil.iter_modules(
[str(custom_python_location)]
):
module = importlib.import_module(name)
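This hunk widens the data.json glob to the whole package and tidies the pkgutil loop. A compact sketch of the discovery pattern it relies on; the glob pattern, sys.path.append and pkgutil/importlib calls come from the lines above, while the helper function name and surrounding wiring are illustrative.

import glob
import importlib
import pkgutil
import sys
from pathlib import Path

CURRENT_DIRECTORY = Path(__file__).resolve().parent

# Find every challenge definition anywhere under the package.
json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)

def load_custom_python(custom_python_location: Path) -> None:
    # Import any challenge-specific helper modules that sit next to a data.json.
    sys.path.append(str(custom_python_location))
    for module_loader, name, ispkg in pkgutil.iter_modules([str(custom_python_location)]):
        importlib.import_module(name)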

View File

@@ -1,6 +1,5 @@
{
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "benchmarks.py",
"home_path": "agent/mini-agi",
"entry_path": "agbenchmark/benchmarks.py",
"cutoff": 60
}

View File

@@ -6,9 +6,10 @@ from typing import Any, Dict, Generator
import pytest
from agbenchmark.RegressionManager import RegressionManager
from agbenchmark.ReportManager import ReportManager
from agbenchmark.start_benchmark import (
CONFIG_PATH,
INFO_TESTS_PATH,
REGRESSION_TESTS_PATH,
get_regression_data,
)
@@ -106,7 +107,8 @@ def challenge_data(request: Any) -> None:
return request.param
regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
regression_manager = ReportManager(REGRESSION_TESTS_PATH)
info_manager = ReportManager(INFO_TESTS_PATH)
def pytest_runtest_makereport(item: Any, call: Any) -> None:
@@ -130,12 +132,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
print("pytest_runtest_makereport", test_details)
if call.excinfo is None:
regression_manager.add_test(item.nodeid.split("::")[1], test_details)
test_details["success"] = True
else:
regression_manager.remove_test(item.nodeid.split("::")[1])
test_details["success"] = False
test_details["fail_reason"] = str(call.excinfo.value)
info_manager.add_test(item.nodeid.split("::")[1], test_details)
def pytest_sessionfinish() -> None:
"""Called at the end of the session to save regression tests"""
def pytest_sessionfinish(session: Any) -> None:
"""Called at the end of the session to save regression tests and info"""
with open(CONFIG_PATH, "r") as f:
config = json.load(f)
info_manager.end_info_report(config)
regression_manager.save()
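The conftest changes split reporting in two: the regression file keeps only passing tests, while the new info report records every outcome (with a fail_reason on failures) and is finalised with the loaded config when the session ends. A condensed sketch of that flow, reusing the imports shown above; how test_details is assembled earlier in the hook is abbreviated, and the call-phase guard is a common pattern rather than a line from the diff.

import json
from typing import Any

from agbenchmark.ReportManager import ReportManager
from agbenchmark.start_benchmark import (
    CONFIG_PATH,
    INFO_TESTS_PATH,
    REGRESSION_TESTS_PATH,
)

regression_manager = ReportManager(REGRESSION_TESTS_PATH)
info_manager = ReportManager(INFO_TESTS_PATH)

def pytest_runtest_makereport(item: Any, call: Any) -> None:
    if call.when != "call":  # only record the actual test call phase
        return
    test_name = item.nodeid.split("::")[1]
    test_details: dict = {}  # simplified; the real hook builds this from the challenge data
    if call.excinfo is None:
        regression_manager.add_test(test_name, test_details)
        test_details["success"] = True
    else:
        regression_manager.remove_test(test_name)
        test_details["success"] = False
        test_details["fail_reason"] = str(call.excinfo.value)
    info_manager.add_test(test_name, test_details)

def pytest_sessionfinish(session: Any) -> None:
    """Save the regression file and finalise the info report with the run config."""
    with open(CONFIG_PATH, "r") as f:
        config = json.load(f)
    info_manager.end_info_report(config)
    regression_manager.save()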

View File

@@ -1,11 +1,20 @@
{
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/read_file",
"success": true
},
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/memory/m1"
"test": "agbenchmark/challenges/memory/m1",
"success": true
},
"TestBasicRetrieval": {
"difficulty": "basic",
@@ -13,12 +22,62 @@
"TestWriteFile",
"TestSearch"
],
"test": "agbenchmark/challenges/retrieval/r1"
"test": "agbenchmark/challenges/retrieval/r1",
"success": true
},
"TestCreateSimpleWebServer": {
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [
"TestBasicMemory"
],
"test": "agbenchmark/challenges/memory/m2",
"success": true
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestBasicRetrieval"
],
"test": "agbenchmark/challenges/retrieval/r2",
"success": true
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3",
"success": true
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"test": "agbenchmark/challenges/retrieval/r3",
"success": true
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/memory/m4",
"success": true
},
"TestSearch": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/search",
"success": true
},
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/code/d3"
"test": "agbenchmark/challenges/interface/write_file",
"success": true
},
"TestDebugSimpleTypoWithGuidance": {
"difficulty": "basic",
@@ -26,65 +85,15 @@
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/code/d1"
"test": "agbenchmark/challenges/code/d1",
"success": true
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "medium",
"dependencies": [
"TestDebugSimpleTypoWithGuidance"
],
"test": "agbenchmark/challenges/code/d2"
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/read_file"
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [
"TestBasicMemory"
],
"test": "agbenchmark/challenges/memory/m2"
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/memory/m4"
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestBasicRetrieval"
],
"test": "agbenchmark/challenges/retrieval/r2"
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"test": "agbenchmark/challenges/retrieval/r3"
},
"TestSearch": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/search"
},
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/write_file"
"test": "agbenchmark/challenges/code/d2",
"success": true
}
}

agbenchmark/reports/1.json (new file, 109 lines)
View File

@@ -0,0 +1,109 @@
{
"command": "agbenchmark start --mock",
"completion_time": "2023-07-10-21:19",
"time_elapsed": "8.75 seconds",
"tests": {
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/write_file",
"success": true
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/read_file",
"success": true
},
"TestSearch": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/search",
"success": true
},
"TestDebugSimpleTypoWithGuidance": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/code/d1",
"success": true
},
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/memory/m1",
"success": true
},
"TestBasicRetrieval": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile",
"TestSearch"
],
"test": "agbenchmark/challenges/retrieval/r1",
"success": true
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "medium",
"dependencies": [
"TestDebugSimpleTypoWithGuidance"
],
"test": "agbenchmark/challenges/code/d2",
"success": true
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [
"TestBasicMemory"
],
"test": "agbenchmark/challenges/memory/m2",
"success": true
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestBasicRetrieval"
],
"test": "agbenchmark/challenges/retrieval/r2",
"success": true
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3",
"success": true
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"test": "agbenchmark/challenges/retrieval/r3",
"success": true
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/memory/m4",
"success": true
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "agbenchmark/benchmarks.py",
"cutoff": 60
}
}

View File

@@ -10,12 +10,16 @@ from dotenv import load_dotenv
load_dotenv()
from agbenchmark.utils import calculate_info_test_path
CURRENT_DIRECTORY = Path(__file__).resolve().parent
benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark"
CONFIG_PATH = str(Path(os.getcwd()) / "config.json")
CONFIG_PATH = str(benchmarks_folder_path / "config.json")
REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json")
REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json")
INFO_TESTS_PATH = calculate_info_test_path(benchmarks_folder_path)
@click.group()
@@ -25,10 +29,11 @@ def cli() -> None:
@cli.command()
@click.option("--category", default=None, help="Specific category to run")
@click.option("--test", default=None, help="Specific test to run")
@click.option("--maintain", is_flag=True, help="Runs only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option("--mock", is_flag=True, help="Run with mock")
def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) -> int:
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
# Check if configuration file exists and is not empty
if maintain and improve:
@@ -37,6 +42,16 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
)
return 1
if test and (category or maintain or improve):
print(
"Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
)
return 1
if not benchmarks_folder_path.exists():
benchmarks_folder_path.mkdir(exist_ok=True)
print(CONFIG_PATH, os.path.exists(CONFIG_PATH), os.stat(CONFIG_PATH).st_size)
if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
config = {}
@@ -46,12 +61,12 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
)
config["entry_path"] = click.prompt(
"Please enter a the path to your run_specific_agent function implementation",
default="/benchmarks.py",
"Please enter a the path to your run_specific_agent function implementation within the benchmarks folder",
default="benchmarks.py",
)
config["cutoff"] = click.prompt(
"Please enter a hard cutoff runtime for your agent",
"Please enter a hard cutoff runtime for your agent per test",
default="60",
)
@@ -65,7 +80,11 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
os.environ["MOCK_TEST"] = "True" if mock else "False"
if not os.path.exists(REGRESSION_TESTS_PATH):
with open(REGRESSION_TESTS_PATH, "a"):
with open(REGRESSION_TESTS_PATH, "w"):
pass
if not os.path.exists(INFO_TESTS_PATH):
with open(INFO_TESTS_PATH, "w"):
pass
print("Current configuration:")
@@ -73,18 +92,22 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
print(f"{key}: {value}")
pytest_args = ["-vs"]
if category:
pytest_args.extend(["-m", category])
print("Starting benchmark tests ", category)
if test:
print("Running specific test:", test)
pytest_args.extend(["-k", test])
else:
print("Running all categories")
if category:
pytest_args.extend(["-m", category])
print("Running tests of category:", category)
else:
print("Running all categories")
if maintain:
print("Running only regression tests")
pytest_args.append("--maintain")
elif improve:
print("Running only non-regression tests")
pytest_args.append("--improve")
if maintain:
print("Running only regression tests")
pytest_args.append("--maintain")
elif improve:
print("Running only non-regression tests")
pytest_args.append("--improve")
if mock:
pytest_args.append("--mock")

View File

@@ -1 +1,17 @@
# radio charts, logs, helper functions for tests, anything else relevant.
import glob
from pathlib import Path
def calculate_info_test_path(benchmarks_folder_path: Path) -> str:
INFO_TESTS_PATH = benchmarks_folder_path / "reports"
if not INFO_TESTS_PATH.exists():
INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True)
return str(INFO_TESTS_PATH / "1.json")
else:
json_files = glob.glob(str(INFO_TESTS_PATH / "*.json"))
file_count = len(json_files)
run_name = f"{file_count + 1}.json"
new_file_path = INFO_TESTS_PATH / run_name
return str(new_file_path)
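calculate_info_test_path counts the JSON reports already in agbenchmark/reports and names the next run after that count, which is where files such as reports/1.json above come from. Hypothetical usage, mirroring how start_benchmark.py calls it:

import os
from pathlib import Path

from agbenchmark.utils import calculate_info_test_path

benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark"
INFO_TESTS_PATH = calculate_info_test_path(benchmarks_folder_path)
print(INFO_TESTS_PATH)  # e.g. ".../agbenchmark/reports/2.json" on the second run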

View File

@@ -1,6 +1,5 @@
{
"workspace": "projects/my-new-project/workspace",
"entry_path": "benchmarks.py",
"home_path": "",
"entry_path": "agbenchmark/benchmarks.py",
"cutoff": 60
}

View File

@@ -15,5 +15,5 @@ ignore_errors = True
[mypy-agbenchmark.mocks.tests.basic_mocks.*]
ignore_errors = True
[mypy-agbenchmark.tests.regression.RegressionManager.*]
[mypy-agbenchmark.tests.regression.ReportManager.*]
ignore_errors = True