Dynamic home path for runs (#119)

Silen Naihin
2023-07-16 21:24:06 -04:00
committed by GitHub
parent 5c7acbc719
commit ce4cefe7e7
14 changed files with 135 additions and 244 deletions

View File

@@ -1,3 +1,3 @@
AGENT_NAME=mini-agi
ENVIRONMENT=local
HOME_ENV=
MOCK_TEST=False

View File

@@ -40,45 +40,6 @@ Let people know what beautiful code you write does, document everything well
Share your progress :)
### Pytest
An example of a test is below; use it as a template and change the class name, the .json name, what the test depends on and its name, and the scoring logic
```python
import pytest
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os


class TestWriteFile(BasicChallenge):
    """Testing if LLM can write to a file"""

    def test_method(self, config):
        # implement scoring logic by looking at the workspace
        ...
```
All challenges will inherit from a parent class, which carries the pytest mark and any methods specific to their category
```python
@pytest.mark.basic
class BasicChallenge(Challenge):
    pass
```
Add the snippet below to create a file in the workspace before running a challenge. Only use it when a file needs to exist in the workspace before a test runs, such as with the read_file_test.
```python
@pytest.fixture(
    scope="module", autouse=True
)  # this is specific to setting up a file for the test, not all tests have this
def setup_module(self, workspace):
    Challenge.write_to_file(
        workspace, self.data.ground.files[0], "this is how we're doing"
    )
```
#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py)
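For orientation only, here is a heavily simplified sketch of the surface this guide relies on (`data`, `write_to_file`, `test_method`); it is not the real implementation, which lives in `Challenge.py` as linked above.
```python
# Heavily simplified sketch -- NOT the real Challenge.py, only the pieces
# referenced elsewhere in this document.
import os


class Challenge:
    data = None  # populated from the challenge's .json (task, ground.files, scoring info)

    @staticmethod
    def write_to_file(workspace: str, filename: str, content: str) -> None:
        # helper used by setup fixtures to seed the workspace before a test runs
        with open(os.path.join(workspace, filename), "w") as f:
            f.write(content)

    def test_method(self, config: dict) -> None:
        # each concrete challenge overrides this with its own scoring logic
        raise NotImplementedError
```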
## Workspace
If the `--mock` flag is used, the workspace is at `agbenchmark/workspace`. Otherwise, for mini-agi it is at `C:/Users/<name>/miniagi` - it will be automatically set in the config
@@ -87,29 +48,7 @@ If `--mock` flag is used it is at `agbenchmark/workspace`. Otherwise for mini-ag
Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.github.io/Mind2Web/
## Repo
```
|-- auto-gpt-benchmarks/ **main project directory**
| |-- metrics.py **combining scores, metrics, final evaluation**
| |-- start_benchmark.py **entry point from cli**
| |-- conftest.py **config, workspace creation + teardown, regression test markers, parameterization**
| |-- Challenge.py **easy challenge creation class**
| |-- config.json **workspace folder**
| |-- challenges/ **challenges across different domains**
| | |-- adaptability/
| | |-- basic_abilities/
| | |-- code/
| | |-- memory/
| | |-- retrieval/
| | |-- web_navigation/
| | |-- writing/
| |-- tests/
| | |-- basic_abilities/ **every llm should pass these challenges**
| | |-- regression/ **challenges that already passed**
```
## How to add new agents to agbenchmark?
## How do I add new agents to agbenchmark?
Example with smol developer.
@@ -120,3 +59,12 @@ https://github.com/smol-ai/developer/pull/114/files
2- Create the submodule and the GitHub workflow by following the same pattern as this example:
https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files
## How do I run agent in different environments?
**To just use it as the benchmark for your agent**, `pip install` the package and run `agbenchmark start`.
**For internal Auto-GPT CI runs**, specify the `AGENT_NAME` you want to use and set `HOME_ENV` (see the sketch below).
Ex. `HOME_ENV=ci AGENT_NAME=mini-agi`
**To develop an agent alongside the benchmark**, specify the `AGENT_NAME` you want to use and add the agent as a submodule to the repo.
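A minimal Python sketch of how these variables are consumed (it mirrors the branching added to `agbenchmark/utils.py` further down in this diff, summarized rather than copied):
```python
import os

AGENT_NAME = os.getenv("AGENT_NAME")  # e.g. "mini-agi"
HOME_ENV = os.getenv("HOME_ENV")      # "ci" for internal Auto-GPT CI runs

if HOME_ENV == "ci" and AGENT_NAME:
    # CI: run from the benchmark repo root; the agent lives in agent/<AGENT_NAME>
    mode = "ci"
elif AGENT_NAME:
    # develop an agent alongside the benchmark, added as a submodule
    mode = "submodule development"
else:
    # plain pip install: run agbenchmark from inside your agent repo
    mode = "pip install"
print(mode)
```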

View File

@@ -7,7 +7,7 @@ from typing import Any, Dict
from dotenv import load_dotenv
from agbenchmark.start_benchmark import CURRENT_DIRECTORY
from agbenchmark.start_benchmark import CURRENT_DIRECTORY, HOME_DIRECTORY
load_dotenv()
@@ -25,13 +25,16 @@ def run_agent(
config["workspace"], "artifacts_out", challenge_location
)
else:
print(f"Running Python function '{config['entry_path']}' with timeout {cutoff}")
command = [sys.executable, "-m", config["entry_path"], str(task)]
entry_path = "agbenchmark.benchmarks"
print(f"Running Python function '{entry_path}' with timeout {cutoff}")
command = [sys.executable, "-m", entry_path, str(task)]
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
cwd=HOME_DIRECTORY,
)
start_time = time.time()
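For context (not part of this commit): the command above runs `python -m agbenchmark.benchmarks <task>` with `HOME_DIRECTORY` as the working directory, so the agent's entry module receives the task string as its first argument. A hypothetical minimal sketch of such a module:
```python
# Hypothetical agbenchmark/benchmarks.py inside an agent repo -- illustrative only;
# the function name run_specific_agent is an assumption, not taken from this commit.
import sys


def run_specific_agent(task: str) -> None:
    # hand the benchmark task off to the agent; a real agent does its work here
    print(f"agent received task: {task}")


if __name__ == "__main__":
    # invoked by the benchmark as: python -m agbenchmark.benchmarks "<task>"
    run_specific_agent(sys.argv[1])
```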

View File

@@ -1,4 +1,3 @@
{
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "agbenchmark.benchmarks"
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
}
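The `${...}` value above is a Python expression template rather than a literal path. A minimal sketch of how such a template could be expanded at load time (the real loader may differ; this is only an illustration):
```python
# Illustrative expansion of a "${...}" workspace template -- not the actual loader.
import os
import re
from pathlib import Path

raw = "${os.path.join(Path.home(), 'miniagi')}"
expr = re.fullmatch(r"\$\{(.+)\}", raw).group(1)
workspace = eval(expr, {"os": os, "Path": Path})  # e.g. /home/<name>/miniagi
print(workspace)
```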

View File

@@ -15,6 +15,8 @@
false
],
"TestDebugSimpleTypoWithGuidance": [
false,
false,
false,
false,
false
@@ -25,6 +27,7 @@
false
],
"TestReadFile": [
true,
true,
true,
true
@@ -55,6 +58,7 @@
true
],
"TestSearch": [
true,
true,
true,
true
@@ -68,6 +72,12 @@
false,
false,
true,
false
false,
true,
false,
false,
false,
false,
true
]
}

View File

@@ -1,147 +0,0 @@
{
"command": "agbenchmark start --mock",
"completion_time": "2023-07-14-18:54",
"metrics": {
"run_time": "0.97 seconds",
"highest_difficulty": "advanced: 5"
},
"tests": {
"TestWriteFile": {
"data_path": "agbenchmark/challenges/interface/write_file",
"is_regression": false,
"metrics": {
"difficulty": "interface",
"success": true,
"non_mock_success_%": 75.0,
"run_time": "0.007 seconds"
}
},
"TestReadFile": {
"data_path": "agbenchmark/challenges/interface/read_file",
"is_regression": true,
"metrics": {
"difficulty": "interface",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.008 seconds"
}
},
"TestSearch": {
"data_path": "agbenchmark/challenges/interface/search",
"is_regression": true,
"metrics": {
"difficulty": "interface",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.007 seconds"
}
},
"TestDebugSimpleTypoWithGuidance": {
"data_path": "agbenchmark/challenges/code/d1",
"is_regression": false,
"metrics": {
"difficulty": "basic",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"non_mock_success_%": 0.0,
"run_time": "0.448 seconds"
}
},
"TestBasicMemory": {
"data_path": "agbenchmark/challenges/memory/m1",
"is_regression": true,
"metrics": {
"difficulty": "basic",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.028 seconds"
}
},
"TestBasicRetrieval": {
"data_path": "agbenchmark/challenges/retrieval/r1",
"is_regression": true,
"metrics": {
"difficulty": "basic",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.014 seconds"
}
},
"TestDebugSimpleTypoWithoutGuidance": {
"data_path": "agbenchmark/challenges/code/d2",
"is_regression": false,
"metrics": {
"difficulty": "novice",
"success": false,
"fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"non_mock_success_%": 0.0,
"run_time": "0.001 seconds"
}
},
"TestCreateSimpleWebServer": {
"data_path": "agbenchmark/challenges/code/d3",
"is_regression": false,
"metrics": {
"difficulty": "advanced",
"success": false,
"fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"non_mock_success_%": 0.0,
"run_time": "0.002 seconds"
}
},
"TestRememberMultipleIds": {
"data_path": "agbenchmark/challenges/memory/m2",
"is_regression": true,
"metrics": {
"difficulty": "novice",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.023 seconds"
}
},
"TestRetrieval2": {
"data_path": "agbenchmark/challenges/retrieval/r2",
"is_regression": true,
"metrics": {
"difficulty": "novice",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.013 seconds"
}
},
"TestRememberMultipleIdsWithNoise": {
"data_path": "agbenchmark/challenges/memory/m3",
"is_regression": true,
"metrics": {
"difficulty": "intermediate",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.03 seconds"
}
},
"TestRetrieval3": {
"data_path": "agbenchmark/challenges/retrieval/r3",
"is_regression": true,
"metrics": {
"difficulty": "intermediate",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.016 seconds"
}
},
"TestRememberMultiplePhrasesWithNoise": {
"data_path": "agbenchmark/challenges/memory/m4",
"is_regression": true,
"metrics": {
"difficulty": "advanced",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.034 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "agbenchmark.benchmarks"
}
}

View File

@@ -0,0 +1,23 @@
{
"command": "agbenchmark start --test TestWriteFile",
"completion_time": "2023-07-16-13:07",
"metrics": {
"run_time": "13.91 seconds",
"highest_difficulty": "interface: 1"
},
"tests": {
"TestWriteFile": {
"data_path": "agbenchmark/challenges/interface/write_file",
"is_regression": false,
"metrics": {
"difficulty": "interface",
"success": true,
"success_%": 30.0,
"run_time": "13.684 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
}
}

View File

@@ -6,20 +6,17 @@ from typing import Any
import click
import pytest
from dotenv import load_dotenv
load_dotenv()
from agbenchmark.utils import calculate_info_test_path
from agbenchmark.utils import calculate_dynamic_paths
CURRENT_DIRECTORY = Path(__file__).resolve().parent
benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark"
CONFIG_PATH = str(benchmarks_folder_path / "config.json")
REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json")
INFO_TESTS_PATH = calculate_info_test_path(benchmarks_folder_path)
(
HOME_DIRECTORY,
CONFIG_PATH,
REGRESSION_TESTS_PATH,
INFO_TESTS_PATH,
) = calculate_dynamic_paths()
@click.group()
@@ -48,9 +45,6 @@ def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) -
)
return 1
if not benchmarks_folder_path.exists():
benchmarks_folder_path.mkdir(exist_ok=True)
print(CONFIG_PATH, os.path.exists(CONFIG_PATH), os.stat(CONFIG_PATH).st_size)
if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
config = {}

View File

@@ -6,25 +6,28 @@ from datetime import datetime
from pathlib import Path
from typing import Any
from dotenv import load_dotenv
load_dotenv()
from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyLevel
AGENT_NAME = os.getenv("AGENT_NAME")
HOME_ENV = os.getenv("HOME_ENV")
def calculate_info_test_path(benchmarks_folder_path: Path) -> str:
INFO_TESTS_PATH = (
benchmarks_folder_path / os.getenv("REPORT_LOCATION", ".") / "reports"
)
if not INFO_TESTS_PATH.exists():
INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True)
def calculate_info_test_path(reports_path: Path) -> str:
print("reports_pathreports_pathreports_pathreports_path", reports_path)
if not reports_path.exists():
reports_path.mkdir(parents=True, exist_ok=True)
return str(
INFO_TESTS_PATH / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json"
reports_path / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json"
)
else:
json_files = glob.glob(str(INFO_TESTS_PATH / "*.json"))
json_files = glob.glob(str(reports_path / "*.json"))
file_count = len(json_files)
run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json"
new_file_path = INFO_TESTS_PATH / run_name
new_file_path = reports_path / run_name
return str(new_file_path)
@@ -79,3 +82,61 @@ def get_highest_success_difficulty(data: dict) -> str:
highest_difficulty_str = ""
return f"{highest_difficulty_str}: {highest_difficulty_level}"
def assign_paths(folder_path: Path) -> tuple[str, str, str]:
CONFIG_PATH = str(folder_path / "config.json")
REGRESSION_TESTS_PATH = str(folder_path / "regression_tests.json")
if HOME_ENV == "ci" and AGENT_NAME:
INFO_TESTS_PATH = calculate_info_test_path(
Path(os.getcwd()) / "agbenchmark" / "reports" / AGENT_NAME
)
else:
INFO_TESTS_PATH = calculate_info_test_path(folder_path / "reports")
return CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH
def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
# the default home is where you're running from
HOME_DIRECTORY = Path(os.getcwd())
benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark"
if AGENT_NAME and HOME_ENV == "ci":
if "/Auto-GPT-Benchmarks/agent" in str(HOME_DIRECTORY):
raise Exception("Must run from root of benchmark repo if HOME_ENV is ci")
# however if the env is local and the agent name is defined, we want to run that agent from the repo and then get the data in the internal agbenchmark directory
# this is for the ci/cd pipeline
benchmarks_folder_path = HOME_DIRECTORY / "agent" / AGENT_NAME / "agbenchmark"
CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths(
benchmarks_folder_path
)
# we want to run the agent from the submodule
HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME
elif AGENT_NAME and not os.path.join("Auto-GPT-Benchmarks", "agent") in str(
HOME_DIRECTORY
):
# if the agent name is defined but the run is not from the agent repo, then home is the agent repo
# used for development of both a benchmark and an agent
HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME
benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark"
CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths(
benchmarks_folder_path
)
else:
# otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo)
# used when its just a pip install
CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths(
benchmarks_folder_path
)
if not benchmarks_folder_path.exists():
benchmarks_folder_path.mkdir(exist_ok=True)
return HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH
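A hedged example of what the returned tuple resolves to in the default (pip install) branch above; the working directory is hypothetical:
```python
# Illustrative resolution for the default branch, assuming agbenchmark is
# started from /home/user/my-agent (hypothetical path).
from pathlib import Path

cwd = Path("/home/user/my-agent")
HOME_DIRECTORY = cwd                          # default home is where you run from
benchmarks_folder_path = cwd / "agbenchmark"  # created if it does not exist
CONFIG_PATH = str(benchmarks_folder_path / "config.json")
REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json")
# INFO_TESTS_PATH becomes a timestamped report file under <cwd>/agbenchmark/reports/
print(HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH)
```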