Mirror of https://github.com/aljazceru/Auto-GPT.git, synced 2025-12-17 14:04:27 +01:00
local runs, home_path config, submodule miniagi (#50)
63  .github/workflows/mini-agi.yml  (vendored, new file)
@@ -0,0 +1,63 @@
name: mini-agi Regression Test

on:
  workflow_dispatch:
    branches: [master]
  push:
    branches: [stable, master, ci-test*]

jobs:
  regression-tests:
    permissions:
      pull-requests: write
      contents: write
    runs-on: ubuntu-latest
    timeout-minutes: 30
    strategy:
      matrix:
        python-version: ['3.10']

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          ref: ${{ github.event.pull_request.head.ref }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          submodules: true

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      - id: get_date
        name: Get date
        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT

      - name: Install Poetry
        run: |
          curl -sSL https://install.python-poetry.org | python -

      - name: Set up Poetry cache
        uses: actions/cache@v2
        with:
          path: |
            ~/.cache/pypoetry
            .venv
          key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }}

      - name: Set up venv and install Python dependencies
        run: |
          poetry install --only main
          poetry build

      - name: Run regression tests
        run: |
          cd agent/mini-agi
          make install
          source venv/bin/activate
          pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl
          agbenchmark start --reg
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
4  .gitmodules  (vendored)
@@ -6,6 +6,10 @@
     path = agent/gpt-engineer
     url = https://github.com/merwanehamadi/gpt-engineer.git
     branch = benchmark-integration
+[submodule "agent/mini-agi"]
+    path = agent/mini-agi
+    url = https://github.com/SilenNaihin/mini-agi.git
+    branch = benchmark-integration
 [submodule "agent/smol-developer"]
     path = agent/smol-developer
     url = https://github.com/merwanehamadi/developer.git
128  README.md
@@ -2,127 +2,13 @@
 A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work

+### Scores:
+
+Scoring of agents will go here. Both overall and by category.
+
+### Integrated Agents
+
+- Auto-GPT
+- gpt-engineer
+- mini-agi
+- smol-developer
-## As a user
-[removed: the former "As a user", "Contributing", "Pytest", "Workspace", "Dataset", "Repo", and "How to add new agents to agbenchmark" sections; their content moves to the new agbenchmark/README.md below]
126  agbenchmark/README.md  (new file)
@@ -0,0 +1,126 @@
## As a user

1. `pip install auto-gpt-benchmarks`
2. Add boilerplate code to run and kill your agent (see the sketch after this list)
3. `agbenchmark start`
   - `--category challenge_category` to run tests in a specific category
   - `--mock` to only run mock tests, if they exist for each test
   - `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previously passing challenge fails, it is removed from the regression tests
4. We call the boilerplate code for your agent
5. Show pass rate of tests, logs, and any other metrics

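A minimal sketch of what that boilerplate entry point could look like, assuming the `run_specific_agent(task)` contract used by `agent/benchmarks_example.py` later in this commit; the `example.py` agent command and the 60-second cutoff are placeholders, not part of agbenchmark itself:

```python
import subprocess
import sys
from typing import Tuple


def run_specific_agent(task: str) -> Tuple[str, int]:
    # Launch the agent on the given task and capture its output.
    # "example.py" stands in for your agent's own entry point.
    process = subprocess.run(
        [sys.executable, "example.py", task],
        capture_output=True,
        text=True,
        timeout=60,  # stop the agent if it runs past the cutoff
    )
    return process.stdout, process.returncode


if __name__ == "__main__":
    # agbenchmark invokes this file as: python benchmarks.py "<task>"
    if len(sys.argv) != 2:
        print("Usage: python benchmarks.py <task>")
        sys.exit(1)
    print(run_specific_agent(sys.argv[1])[0])
```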
## Contributing

##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x

### To run the existing mocks

1. Clone the repo `auto-gpt-benchmarks`
2. `pip install poetry`
3. `poetry shell`
4. `poetry install`
5. `cp .env_example .env`
6. `agbenchmark start --mock`

Keep the config the same and watch the logs :)

### To run with mini-agi

1. Navigate to `auto-gpt-benchmarks/agent/mini-agi`
2. `pip install -r requirements.txt`
3. `cp .env_example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Set `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10+ installed
4. Follow the commands above, then run without the mock flag: `agbenchmark start`

- To add a dependency, use `poetry add <package>`.

Feel free to create PRs to merge with `main` at will (but also feel free to ask for review). If you can't, send a message in the R&D chat for access.

If you push at any point and break things (it happens to everyone), fix it ASAP. Step 1 is to revert `master` to the last working commit.

Let people know what the beautiful code you write does, and document everything well.

Share your progress :)

### Pytest

An example of a test is below. Use it as a template: change the class name, the .json file name, the test's name and dependencies, and the scoring logic.

```python
import os

import pytest

from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge


class TestWriteFile(BasicChallenge):
    """Testing if LLM can write to a file"""

    def get_file_path(self) -> str:  # all tests must implement this method
        return os.path.join(os.path.dirname(__file__), "w_file_data.json")

    @pytest.mark.depends(on=[], name="basic_write_file")
    def test_method(self, workspace):
        # implement scoring logic by looking at workspace
        ...
```

All challenges inherit from a parent class, which carries the pytest mark and any category-specific methods:

```python
@pytest.mark.basic
class BasicChallenge(Challenge):
    pass
```

Add the snippet below to create a file in the workspace before running a challenge. Only use it when a test needs a file to exist in the workspace beforehand, such as the read_file_test.

```python
@pytest.fixture(
    scope="module", autouse=True
)  # this is specific to setting up a file for the test, not all tests have this
def setup_module(self, workspace):
    Challenge.write_to_file(
        workspace, self.data.ground.files[0], "this is how we're doing"
    )
```

#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py)

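The linked file is the source of truth; purely for orientation, here is a rough, hypothetical sketch of the shape such a base class could take, inferred only from the methods this README uses (`get_file_path`, `setup_challenge`, `Challenge.write_to_file`, `self.data.ground`); it is not the actual implementation:

```python
import json
import os
from types import SimpleNamespace


class Challenge:
    """Hypothetical sketch of a challenge base class, not the real code."""

    def get_file_path(self) -> str:
        # Each concrete test points at its own challenge-definition .json file.
        raise NotImplementedError

    def setup_challenge(self, config: dict) -> None:
        # Load the challenge definition so scoring logic can use self.data.ground.
        with open(self.get_file_path()) as f:
            self.data = json.loads(
                f.read(), object_hook=lambda d: SimpleNamespace(**d)
            )

    @staticmethod
    def write_to_file(workspace: str, filename: str, content: str) -> None:
        # Seed the workspace with a file before a test runs (e.g. read_file_test).
        os.makedirs(workspace, exist_ok=True)
        with open(os.path.join(workspace, filename), "w") as f:
            f.write(content)
```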
## Workspace

If the `--mock` flag is used, the workspace is at `agbenchmark/mocks/workspace`. Otherwise, for mini-agi it is at `C:/Users/<name>/miniagi` - it will be automatically set in the config.

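For reference, the "implement scoring logic by looking at workspace" step in the test template above usually boils down to reading files back out of this workspace directory. A hypothetical helper (the function name and arguments are illustrative, not part of agbenchmark):

```python
import os


def file_contains(workspace: str, filename: str, expected_text: str) -> bool:
    # Hypothetical scoring helper: the challenge passes if the agent wrote
    # the expected text into the expected file inside the workspace.
    path = os.path.join(workspace, filename)
    if not os.path.exists(path):
        return False
    with open(path, encoding="utf-8") as f:
        return expected_text in f.read()
```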
#### Dataset

Manually created challenges, existing challenges within Auto-GPT, and https://osu-nlp-group.github.io/Mind2Web/

## Repo

```
|-- auto-gpt-benchmarks/ **main project directory**
| |-- metrics.py **combining scores, metrics, final evaluation**
| |-- start_benchmark.py **entry point from cli**
| |-- conftest.py **config, workspace creation + teardown, regression test markers, parameterization**
| |-- Challenge.py **easy challenge creation class**
| |-- config.json **workspace folder**
| |-- challenges/ **challenges across different domains**
| | |-- adaptability/
| | |-- basic_abilities/
| | |-- code/
| | |-- memory/
| | |-- retrieval/
| | |-- web_navigation/
| | |-- writing/
| |-- tests/
| | |-- basic_abilities/ **every llm should pass these challenges**
| | |-- regression/ **challenges that already passed**
```

## How to add new agents to agbenchmark?

Example with smol developer.

1. Create a GitHub branch with your agent following the same pattern as this example:

   https://github.com/smol-ai/developer/pull/114/files

2. Create the submodule and the GitHub workflow by following the same pattern as this example:

   https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files

@@ -1,4 +1,3 @@
-import importlib
 import os
 import subprocess
 import sys
@@ -29,18 +28,18 @@ def run_agent(
         mock_manager.delegate(mock_func)
     else:
         timeout = config["cutoff"]
-        print(f"Running Python function '{config['func_path']}' with timeout {timeout}")
+        print(
+            f"Running Python function '{config['entry_path']}' with timeout {timeout}"
+        )

         # Get the current working directory
         cwd = os.getcwd()

         # Add current directory to Python's import path
         sys.path.append(cwd)
+        sys.path.append(os.path.join(cwd, config["home_path"]))

-        module_name = config["func_path"].replace("/", ".").rstrip(".py")
-        module = importlib.import_module(module_name)
-
-        command = [sys.executable, "benchmarks.py", str(task)]
+        command = [sys.executable, config["entry_path"], str(task)]
         process = subprocess.Popen(
             command,
             stdout=subprocess.PIPE,
@@ -38,7 +38,7 @@ def start(category: str, reg: bool, mock: bool) -> int:
         default=os.path.join(Path.home(), "workspace"),
     )

-    config["func_path"] = click.prompt(
+    config["entry_path"] = click.prompt(
         "Please enter a the path to your run_specific_agent function implementation",
         default="/benchmarks.py",
     )
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 from typing import Any, Dict

+import pytest
 from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge

@@ -11,6 +12,7 @@ class TestWriteFile(BasicChallenge):
     def get_file_path(self) -> str:  # all tests must implement this method
         return os.path.join(os.path.dirname(__file__), "w_file_data.json")

+    @pytest.mark.depends(name="basic_write_file")
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)

Submodule agent/Auto-GPT updated: 2e5eac51d0...dd65cc256c
@@ -1,15 +0,0 @@
-# import subprocess
-
-
-def run_specific_agent(task, conn):
-    cycle_count = 0
-    while (
-        not conn.poll()
-    ):  # Check if there's a termination signal from the main process
-        response = run_agent(task)  # run the agent and get the response and cycle count
-
-        if response:
-            cycle_count += 1
-
-            # Send response and cycle count back to the main process
-            conn.send((response, cycle_count))
35  agent/benchmarks_example.py  (new file)
@@ -0,0 +1,35 @@
import os
import sys
from typing import Tuple

import pexpect


def run_specific_agent(task: str) -> Tuple[str, int]:
    # Ensure the directory for the project exists
    os.makedirs("workspace_path", exist_ok=True)

    # Run the agent command
    child = pexpect.spawn(f"python example.py {task}")

    # Create a loop to continuously read output
    while True:
        try:
            child.expect("\n")  # This waits until a newline appears
            print(child.before.decode())  # This prints the line
        except pexpect.EOF:
            break  # No more output, break the loop

    # Check the exit status
    child.close()  # Close the child process

    # Return child process's exit status and any error messages
    return child.before.decode(), child.exitstatus


if __name__ == "__main__":
    # The first argument is the script name itself, second is the task
    if len(sys.argv) != 2:
        print("Usage: python script.py <task>")
        sys.exit(1)

    task = sys.argv[1]
    run_specific_agent(task)
6  agent/config_example.json  (new file)
@@ -0,0 +1,6 @@
{
  "workspace": "projects/my-new-project/workspace",
  "entry_path": "benchmarks.py",
  "home_path": "",
  "cutoff": 60
}
Submodule agent/gpt-engineer updated: f91ac66b8e...155ea895eb
1  agent/mini-agi  (new submodule)
Submodule agent/mini-agi added at 70bd3f035e
7  agent/regression_tests_example.json  (new file)
@@ -0,0 +1,7 @@
{
  "TestWriteFile": {
    "difficulty": "basic",
    "dependencies": [],
    "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
  }
}
Submodule agent/smol-developer updated: 896198af51...5a3ad43103
@@ -1,5 +1,6 @@
 {
   "workspace": "projects/my-new-project/workspace",
-  "func_path": "benchmarks.py",
+  "entry_path": "benchmarks.py",
+  "home_path": "",
   "cutoff": 60
 }