readmes, remove port and host from config, etc

This commit is contained in:
Silen Naihin
2023-06-27 19:19:14 -04:00
parent f933717d8b
commit 76ee994d2c
10 changed files with 73 additions and 166 deletions

View File

@@ -1,4 +1,3 @@
OPENAI_API_KEY=
AGENT_NAME=mini-agi
AGENT_TIMEOUT=60
MOCK_TEST=False

README.md
View File

@@ -2,80 +2,70 @@
A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work
## As a user
1. `pip install auto-gpt-benchmarks`
2. Add boilerplate code to run and kill your agent (see the sketch after this list)
3. `agbenchmark start`
- `--category challenge_category` to run tests in a specific category
- `--mock` to only run mock tests if they exist for each test
- `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previously passing challenge fails, it will be removed from the regression tests
4. We call boilerplate code for your agent
5. Show pass rate of tests, logs, and any other metrics
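A minimal sketch of the run-and-kill boilerplate from step 2, assuming the agent can be started as a subprocess; the entry-point command, function name, and timeout are illustrative, not a required interface:
```python
import subprocess

def run_agent(task: str, timeout: int = 60) -> None:
    """Start the agent on a task and kill it once the timeout expires."""
    # Replace this command with however your agent is actually launched.
    process = subprocess.Popen(["python", "miniagi.py", task])
    try:
        process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        process.kill()  # stop condition: hard timeout
```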
## Contributing
##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x
### To run the basic existing mock (June 21)
### To run the existing mocks
1. clone the repo `auto-gpt-benchmarks`
2. `pip install poetry`
3. `poetry shell`
4. `poetry install`
5. `agbenchmark start`
5. `cp .env_example .env`
6. `agbenchmark start --mock`
Keep config the same and watch the logs :)
### To run with mini-agi
1. Navigate to `auto-gpt-benchmarks/agent/mini-agi`
2. `pip install -r requirements.txt`
3. `cp .env_example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Set `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10 or above installed
4. Make sure to follow the commands above, then run without the mock flag: `agbenchmark start`
- To add requirements `poetry add requirement`.
Feel free to create PRs to merge with `main` at will (but also feel free to ask for review) - if you can't, send a message in the R&D chat for access.
If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `main` to last working commit
If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `master` to last working commit
Let people know what the beautiful code you write does, and document everything well
Share your progress :)
## How this works
1. `pip install auto-gpt-benchmarks`
2. Add boilerplate code to your agent to start a webserver (run loop and stop condition); a FastAPI sketch is shown under the Api section below
3. `agbenchmark start --category challenge_category`; remove the category flag to run all tests. Specify the hostname, port, and workspace directory in the config
4. We call the server to run the agent for each test (see the sketch after this list)
5. Show pass rate of tests, logs, and any other metrics
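A hedged sketch of what the call in step 4 might look like from the benchmark side, assuming the agent exposes the FastAPI boilerplate described under the Api section below; the endpoint path and payload shape are illustrative:
```python
import requests

def run_challenge_on_agent(task: str, host: str = "http://localhost:8080") -> str:
    """Send one challenge task to the agent's server and return its answer."""
    response = requests.post(f"{host}/agent/tasks", json={"task": task}, timeout=60)
    response.raise_for_status()
    return response.json()["response"]
```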
### To run the basic existing mock (June 21)
1. clone the repo `auto-gpt-benchmarks`
2. `pip install poetry`
3. `poetry shell`
4. `poetry install`
5. `agbenchmark start`
Keep config the same and watch the logs :)
#### Bonuses
- You can add tests by git cloning auto-gpt-benchmarks into your repo
- The agent is abstracted from the benchmark; no extra setup is needed other than starting the server
- Simple, easy to use
- Don't have to deal with cloud or parallelization yet
### Pytest
To create a test:
An example of a test is below. Use it as a template and change the class name, the .json name, what the test depends on, its name, and the scoring logic.
```python
import pytest
from agbenchmark.challenges.define_task_types import ChallengeData
from ..CategoryChallenge import CategoryChallenge
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os

data = ChallengeData.deserialize(
    os.path.join(os.path.dirname(__file__), "r_file_data.json")
)


class TestSomething(CategoryChallenge):
    """Testing if LLM can read a file"""


class TestWriteFile(BasicChallenge):
    """Testing if LLM can write to a file"""

    @pytest.mark.parametrize(
        "run_agent",
        [(data.task, data.mock_func)],
        indirect=True,
    )
    def test_retrieval(
        self, workspace
    ):
        # scoring logic goes here

    def get_file_path(self) -> str:  # all tests must implement this method
        return os.path.join(os.path.dirname(__file__), "w_file_data.json")

    @pytest.mark.depends(on=[], name="basic_write_file")
    def test_method(self, workspace):
        # implement scoring logic by looking at workspace
```
All challenges will inherit from parent class which has the mark
All challenges will inherit from parent class which has the mark and any specific methods for their category
```python
@pytest.mark.basic
@@ -83,50 +73,23 @@ class BasicChallenge(Challenge):
pass
```
If you want to add a custom mark to a Challenge, you must specify it before the test definition
```python
@pytest.mark.other_mark
def test_retrieval(self, workspace):
```
To add a dependency to a challenge, use the following:
```python
# for defining what a test depends on
from pytest_dependency import depends

def test1(self, request, workspace):
    depends(request, data.dependencies)

# for defining a test as a dependency
@pytest.mark.dependency()
def test2(self, workspace):
    ...
```
Ordering of challenges needs to be used in combination with the above to make sure a challenge executes after the tests it depends on
```python
@pytest.mark.run(order=1)
```
To create a file needed to test a challenge, add this to the challenge file; it will create the file before running the server
```python
@pytest.fixture(scope="module", autouse=True)
def setup_module(workspace):
    if data.ground.should_contain:
@pytest.fixture(
    scope="module", autouse=True
)  # this is specific to setting up a file for the test, not all tests have this
def setup_module(self, workspace):
    Challenge.write_to_file(
        workspace, data.ground.files[0], "this is how we're doing"
        workspace, self.data.ground.files[0], "this is how we're doing"
    )
```
## Api
FastAPI with REST; import `requests` to call it from auto-gpt-benchmarks. Boilerplate code is given to the agent project to start the server
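A minimal sketch of the boilerplate webserver an agent project might expose, assuming FastAPI and uvicorn as stated above; the `/agent/tasks` route and `run_agent` hook are illustrative, not the benchmark's actual contract:
```python
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn

app = FastAPI()

class TaskRequest(BaseModel):
    task: str

def run_agent(task: str) -> str:
    # Wire your agent's run loop and stop condition in here.
    return f"agent output for: {task}"

@app.post("/agent/tasks")
def create_task(request: TaskRequest):
    # The benchmark posts a task and reads the agent's answer from the response.
    return {"response": run_agent(request.task)}

if __name__ == "__main__":
    uvicorn.run(app, host="localhost", port=8080)
```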
#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py)
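A hedged sketch of what that base class provides, reconstructed only from the methods used elsewhere in this README (`deserialize`, `get_file_path`, `write_to_file`); see the linked file for the real parametrization and loading logic:
```python
import os

from agbenchmark.challenges.define_task_types import ChallengeData

class Challenge:
    """Base class holding the loading helpers that all tests inherit."""

    def get_file_path(self) -> str:
        # Each concrete test points this at its own *_data.json file.
        raise NotImplementedError

    @property
    def data(self) -> ChallengeData:
        return ChallengeData.deserialize(self.get_file_path())

    @staticmethod
    def write_to_file(workspace: str, filename: str, content: str) -> None:
        with open(os.path.join(workspace, filename), "w") as f:
            f.write(content)
```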
## Workspace
Defined by the user in the config
If the `--mock` flag is used it is at `agbenchmark/mocks/workspace`. Otherwise for mini-agi it is at `C:/Users/<name>/miniagi` - it will be automatically set in the config
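A hedged sketch of how the workspace path might be resolved, following the description above; the config key and mock path come from this README, everything else is illustrative:
```python
import json
import os

def get_workspace_path(mock: bool, config_path: str = "config.json") -> str:
    """Return the mock workspace when --mock is used, else the configured one."""
    if mock:
        return os.path.join("agbenchmark", "mocks", "workspace")
    with open(config_path) as f:
        return json.load(f)["workspace"]
```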
#### Dataset
@@ -138,9 +101,9 @@ Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.git
|-- auto-gpt-benchmarks/ **main project directory**
| |-- metrics.py **combining scores, metrics, final evaluation**
| |-- start_benchmark.py **entry point from cli**
| |-- conftest.py **shared fixtures across all tests**
| |-- Challenge.py **easy challenge creation class?**
| |-- config.json **hostname, port, workspace folder**
| |-- conftest.py **config, workspace creation + teardown, regression test markers, parameterization**
| |-- Challenge.py **easy challenge creation class**
| |-- config.json **workspace folder**
| |-- challenges/ **challenges across different domains**
| | |-- adaptability/
| | |-- basic_abilities/
@@ -149,28 +112,7 @@ Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.git
| | |-- retrieval/
| | |-- web_navigation/
| | |-- writing/
| |-- tests/ **challenges across different metrics**
| | |-- basic_abilities/
| | |-- interface/
| |-- workspace/ **workspace related func**
| | |-- __init__.py
| | |-- workspace_manager.py **creation, deletion**
| |-- tests/
| | |-- basic_abilities/ **every llm should pass these challenges**
| | |-- regression/ **challenges that already passed**
```
### Easy Challenge Creation
TBD, but potentially a shared Challenge class that challenges instantiate, as challenges need different utils/metrics for evaluation
#### Written Challenges
For code and writing challenges we can create a reference text and use metrics like METEOR, BERTScore, and BARTScore
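As a sketch of that idea, a written answer could be scored against a reference text with BERTScore, assuming the third-party `bert_score` package (`pip install bert-score`); the actual metrics and wrappers are still to be decided:
```python
from bert_score import score

def grade_written_answer(candidate: str, reference: str) -> float:
    """Return a BERTScore F1 in [0, 1] for one candidate/reference pair."""
    precision, recall, f1 = score([candidate], [reference], lang="en")
    return f1.item()
```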
#### Validators
Designed to handle specific types of output (e.g., text, code, structured data)
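A hedged sketch of the validator idea, with one callable per output type; the names and checks are illustrative since nothing is implemented yet:
```python
import json
from typing import Callable, Dict

def validate_text(output: str) -> bool:
    # Plain text just needs to be non-empty.
    return bool(output.strip())

def validate_structured(output: str) -> bool:
    # Structured output must at least parse as JSON.
    try:
        json.loads(output)
        return True
    except json.JSONDecodeError:
        return False

VALIDATORS: Dict[str, Callable[[str], bool]] = {
    "text": validate_text,
    "structured": validate_structured,
}
```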
#### Logging
Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc
Later: GitHub Actions integration, OpenAPI?, good versioning and backward compatibility

View File

@@ -4,7 +4,8 @@
Input:
- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_
- **name** (str): Name of the challenge.
- **category** (str[]): Category of the challenge such as 'basic', 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_
- **task** (str): The task that the agent needs to solve.
- **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function.
- **ground** (dict): The ground truth.
@@ -12,7 +13,9 @@ Input:
- **should_contain** (list): The exact strings that are required in the final answer.
- **should_not_contain** (list): The exact strings that should not be in the final answer.
- **files** (list): Files that are used for retrieval. Can specify file here or an extension.
- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
- **mock** (dict): Mock response for testing.
- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
- **mock_task** (str): Task to provide for the mock function.
- **info** (dict): Additional info about the challenge.
- **difficulty** (str): The difficulty of this query.
- **description** (str): Description of the challenge.
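A hedged sketch of how the fields above might map onto the models in `define_task_types.py`, using pydantic; the field names follow this list, while defaults and optionality are assumptions:
```python
import json
from typing import List, Optional

from pydantic import BaseModel

class Ground(BaseModel):
    answer: str
    should_contain: Optional[List[str]] = None
    should_not_contain: Optional[List[str]] = None
    files: List[str]

class Mock(BaseModel):
    mock_func: str
    mock_task: Optional[str] = None

class Info(BaseModel):
    difficulty: Optional[str] = None
    description: str
    side_effects: List[str]

class ChallengeData(BaseModel):
    name: str
    category: List[str]
    task: str
    dependencies: List[str]
    ground: Ground
    mock: Optional[Mock] = None
    info: Info

    @classmethod
    def deserialize(cls, path: str) -> "ChallengeData":
        with open(path) as f:
            return cls(**json.load(f))
```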
@@ -22,24 +25,26 @@ Example:
```json
{
    "name": "basic_write_file",
    "category": ["basic"],
    "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
    "dependencies": [
        "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file"
    ],
    "task": "Print the capital of America to a .txt file",
    "dependencies": [],
    "ground": {
        "answer": "random string: this is how we're doing",
        "should_contain": ["random string: this is how we're doing"],
        "files": ["file_to_check.txt"]
        "answer": "Washington",
        "should_contain": ["Washington"],
        "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
        "files": [".txt"]
    },
    "mock": {
        "mock_func": "basic_write_file_mock",
        "mock_task": "What is the capital of America?"
    },
    "mock_func": "basic_read_file_mock",
    "info": {
        "description": "This reads the file quickly",
        "difficulty": "basic",
        "side_effects": [""]
        "description": "Tests the writing to file",
        "side_effects": ["tests if there is in fact an LLM attached"]
    }
}
```
Current Output:

View File

@@ -1,5 +1,3 @@
{
"hostname": "localhost",
"port": 8080,
"workspace": "C:/Users/silen/miniagi"
"hostname": "localhost"
}

View File

@@ -1,20 +0,0 @@
import json
import openai


def basic_gpt_agent(query) -> str:
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613", messages=[{"role": "user", "content": query}]
    )

    answer = response["choices"][0]["message"]["content"]  # type: ignore

    print("QUERY : ", query)
    print("AGENT ANSWER: ", answer)

    return answer


if __name__ == "__main__":
    # server boilerplate example here
    basic_gpt_agent("")

View File

@@ -1,5 +1,4 @@
from agbenchmark.Challenge import Challenge
from ..basic_gpt_agent import basic_gpt_agent
def basic_read_file_mock(task: str, workspace: str):
@@ -18,9 +17,8 @@ def basic_write_file_mock(task: str, workspace: str):
"""
This mock writes to a file (creates one if it doesn't exist)
"""
# Call the basic_gpt_agent to get a response.
response = basic_gpt_agent(task)
# Open the file in write mode.
Challenge.write_to_file(workspace, "file_to_check.txt", response)
Challenge.write_to_file(
workspace,
"file_to_check.txt",
"Washington DC is the capital of the United States of America",
)

View File

@@ -27,10 +27,6 @@ def start(category, noreg, mock):
    if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0:
        config = {}
        config["hostname"] = click.prompt(
            "\nPlease enter a new hostname", default="localhost"
        )
        config["port"] = click.prompt("Please enter a new port", default=8080)
        config["workspace"] = click.prompt(
            "Please enter a new workspace path",
            default=os.path.join(Path.home(), "miniagi"),

View File

@@ -1,5 +1,4 @@
import pytest
from agbenchmark.challenges.define_task_types import ChallengeData
from agbenchmark.Challenge import Challenge
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os
@@ -8,9 +7,7 @@ import os
class TestReadFile(BasicChallenge):
    """Testing if LLM can read a file"""

    @pytest.fixture(
        scope="module", autouse=True
    )  # this is specific to setting up a file for the test, not all tests have this
    @pytest.fixture(scope="module", autouse=True)
    def setup_module(self, workspace):
        Challenge.write_to_file(
            workspace, self.data.ground.files[0], "this is how we're doing"

View File

@@ -1,5 +1,4 @@
import pytest
from agbenchmark.challenges.define_task_types import ChallengeData
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os

View File

@@ -3,12 +3,5 @@
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]"
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"basic_write_file"
],
"test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]"
}
}