mirror of https://github.com/aljazceru/Auto-GPT.git
synced 2026-02-09 16:24:24 +01:00
readmes, remove port and host from config, etc
@@ -1,4 +1,3 @@

```
OPENAI_API_KEY=
AGENT_NAME=mini-agi
AGENT_TIMEOUT=60
MOCK_TEST=False
```

README.md
@@ -2,80 +2,70 @@

A repo built to benchmark the performance of agents far and wide, regardless of how they are set up and how they work.

## As a user

1. `pip install auto-gpt-benchmarks`
2. Add boilerplate code to run and kill your agent (see the sketch after this list)
3. `agbenchmark start`
   - `--category challenge_category` to run tests in a specific category
   - `--mock` to only run mock tests if they exist for each test
   - `--noreg` to skip any tests that have passed in the past. If you run without this flag and a previously passing challenge fails, it will no longer be counted as a regression test
4. We call the boilerplate code for your agent
5. Show pass rate of tests, logs, and any other metrics
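
For step 2, here is a minimal sketch of what the run/kill boilerplate could look like, assuming your agent can be started as a subprocess; the command, script name, and timeout are placeholders, not part of agbenchmark itself:

```python
import subprocess


def run_agent(task: str) -> subprocess.Popen:
    # Placeholder: start your agent as a subprocess and hand it the task.
    return subprocess.Popen(["python", "my_agent.py", task])


def kill_agent(process: subprocess.Popen, timeout: int = 60) -> None:
    # Wait for the agent to finish and terminate it if it runs past the timeout.
    try:
        process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        process.terminate()
        process.wait()
```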
## Contributing
##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x

### To run the existing mocks

1. clone the repo `auto-gpt-benchmarks`
2. `pip install poetry`
3. `poetry shell`
4. `poetry install`
5. `cp .env_example .env`
6. `agbenchmark start --mock`

Keep config the same and watch the logs :)

### To run with mini-agi

1. Navigate to `auto-gpt-benchmarks/agent/mini-agi`
2. `pip install -r requirements.txt`
3. `cp .env_example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Set `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10 or higher installed
4. Make sure to follow the commands above, and run without the mock flag: `agbenchmark start`

- To add requirements, run `poetry add requirement`.

Feel free to create PRs to merge with `main` at will (but also feel free to ask for review) - if you can't, send a message in the R&D chat for access.

If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `master` to the last working commit.

Let people know what your beautiful code does, and document everything well.

Share your progress :)

## How this works

1. `pip install auto-gpt-benchmarks`
2. Add boilerplate code to your agent to start a webserver (run loop and stop condition) - see the sketch after this list
3. `agbenchmark start --category challenge_category` - remove the category flag to run all tests. Specify the hostname, port, and workspace directory in the config
4. We call the server to run the agent for each test
5. Show pass rate of tests, logs, and any other metrics
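
A rough sketch of the webserver boilerplate described in step 2, assuming a FastAPI app with a single hypothetical `/run` route; the route name, request fields, and `run_my_agent` stub are illustrative only, not the benchmark's actual contract:

```python
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class RunRequest(BaseModel):
    task: str
    timeout: int = 60


def run_my_agent(task: str, timeout: int) -> str:
    # Placeholder for your agent's run loop and stop condition.
    return f"finished task: {task}"


@app.post("/run")
def run(request: RunRequest) -> dict:
    # The benchmark would POST a task here and read the result back.
    return {"status": "done", "output": run_my_agent(request.task, request.timeout)}
```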
#### Bonuses

- You can add tests by git cloning auto-gpt-benchmarks into your repo
- The agent is abstracted from the benchmark, so you don't need any extra setup other than starting the server
- Simple, easy to use
- Don't have to deal with cloud or parallelization yet

### Pytest

To create a test, use the example below as a template and change the class name, the .json name, what the test depends on and its name, and the scoring logic:

```python
import os

import pytest

from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge


class TestWriteFile(BasicChallenge):
    """Testing if LLM can write to a file"""

    def get_file_path(self) -> str:  # all tests must implement this method
        return os.path.join(os.path.dirname(__file__), "w_file_data.json")

    @pytest.mark.depends(on=[], name="basic_write_file")
    def test_method(self, workspace):
        # implement scoring logic by looking at workspace
        ...
```

All challenges will inherit from a parent class which has the pytest mark and any specific methods for their category.

```python
@pytest.mark.basic
class BasicChallenge(Challenge):
    pass
```
If you want to add a custom mark to a Challenge, you must specify it before the test definition

```python
@pytest.mark.other_mark
def test_retrieval(self, workspace):
    ...
```

To add a dependency to a challenge, use the following:

```python
# to define what a test depends on
from pytest_dependency import depends


def test1(self, request, workspace):
    depends(request, data.dependencies)


# to define a test as a dependency
@pytest.mark.dependency()
def test2(self, workspace):
    ...
```

Ordering of challenges needs to be used in combination with the above to make sure a challenge runs after the ones it depends on:

```python
@pytest.mark.run(order=1)
```

To create a file to test a challenge, add this to the challenge file; it will create a file before running the server:

```python
@pytest.fixture(
    scope="module", autouse=True
)  # this is specific to setting up a file for the test, not all tests have this
def setup_module(self, workspace):
    Challenge.write_to_file(
        workspace, self.data.ground.files[0], "this is how we're doing"
    )
```

## Api

FastAPI with REST; import requests to call it from auto-gpt-benchmarks. Boilerplate code is given to the agent project to start the server.

#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py)
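
For orientation, a sketch of what such a shared base class might contain, inferred only from the helpers used in this README (`write_to_file`, per-test `.json` data); this is not the actual contents of `Challenge.py`:

```python
import os

from agbenchmark.challenges.define_task_types import ChallengeData


class Challenge:
    """Sketch of a shared base class that concrete challenges inherit from."""

    def get_file_path(self) -> str:
        # Each challenge overrides this to point at its own .json data file.
        raise NotImplementedError

    @property
    def data(self) -> ChallengeData:
        return ChallengeData.deserialize(self.get_file_path())

    @staticmethod
    def write_to_file(workspace: str, filename: str, content: str) -> None:
        # Helper used by the setup fixtures shown above.
        with open(os.path.join(workspace, filename), "w") as f:
            f.write(content)

    @staticmethod
    def open_file(workspace: str, filename: str) -> str:
        with open(os.path.join(workspace, filename)) as f:
            return f.read()
```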

## Workspace

The workspace is defined in the config. If the `--mock` flag is used it is `agbenchmark/mocks/workspace`; otherwise, for mini-agi it is `C:/Users/<name>/miniagi` - it will be set automatically in the config.
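
A sketch of how that workspace selection could be resolved in code; the helper name and the config handling are illustrative, not the benchmark's actual implementation:

```python
import json
import os
from pathlib import Path


def resolve_workspace(mock: bool, config_path: str = "config.json") -> str:
    # Mock runs use the bundled mock workspace; otherwise fall back to the
    # configured workspace, defaulting to ~/miniagi for mini-agi.
    if mock:
        return os.path.join("agbenchmark", "mocks", "workspace")
    if os.path.exists(config_path):
        with open(config_path) as f:
            workspace = json.load(f).get("workspace")
        if workspace:
            return workspace
    return str(Path.home() / "miniagi")
```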
#### Dataset

Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.git

```
|-- auto-gpt-benchmarks/ **main project directory**
| |-- metrics.py **combining scores, metrics, final evaluation**
| |-- start_benchmark.py **entry point from cli**
| |-- conftest.py **config, workspace creation + teardown, regression test markers, parameterization**
| |-- Challenge.py **easy challenge creation class**
| |-- config.json **workspace folder**
| |-- challenges/ **challenges across different domains**
| | |-- adaptability/
| | |-- basic_abilities/
| | |-- retrieval/
| | |-- web_navigation/
| | |-- writing/
| |-- tests/
| | |-- basic_abilities/ **every llm should pass these challenges**
| | |-- regression/ **challenges that already passed**
```
### Easy Challenge Creation

TBD, but potentially a shared Challenge class that challenges instantiate, since challenges need different utils/metrics for evaluation.
#### Written Challenges

For code and writing challenges we can create a reference text and use metrics like METEOR, BERTScore, and BARTScore.
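
As an illustration only, scoring a candidate answer against a reference text might look like the snippet below, here using NLTK's METEOR implementation (BERTScore or BARTScore would slot in the same way); treat the tokenization, the download call, and any threshold as assumptions rather than the benchmark's actual scoring code:

```python
import nltk
from nltk.translate.meteor_score import meteor_score

nltk.download("wordnet", quiet=True)  # METEOR needs WordNet data


def score_against_reference(candidate: str, reference: str) -> float:
    # NLTK's meteor_score expects pre-tokenized input:
    # a list of reference token lists and a candidate token list.
    return meteor_score([reference.split()], candidate.split())


if __name__ == "__main__":
    print(score_against_reference(
        "Washington is the capital of the United States",
        "The capital of the United States is Washington",
    ))
```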
#### Validators

Designed to handle specific types of output (e.g., text, code, structured data).
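
A possible shape for such validators, sketched as a small protocol plus one text validator built around the `should_contain` / `should_not_contain` fields above; the class names are illustrative:

```python
from typing import Protocol


class Validator(Protocol):
    def validate(self, output: str) -> bool:
        ...


class TextValidator:
    """Checks plain-text output against should_contain / should_not_contain."""

    def __init__(self, should_contain: list[str], should_not_contain: list[str]):
        self.should_contain = should_contain
        self.should_not_contain = should_not_contain

    def validate(self, output: str) -> bool:
        return all(s in output for s in self.should_contain) and not any(
            s in output for s in self.should_not_contain
        )
```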
#### Logging

Log the different requests coming in - write file, change file, etc. Maybe a database in the future for metrics, logs, etc.
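
For the request logging, a minimal sketch could wrap workspace operations with the standard `logging` module (a database backend would replace the handler later); this is purely illustrative:

```python
import logging

logger = logging.getLogger("agbenchmark.requests")
logging.basicConfig(level=logging.INFO)


def log_request(action: str, filename: str) -> None:
    # e.g. action = "write_file" or "change_file"
    logger.info("%s: %s", action, filename)


log_request("write_file", "file_to_check.txt")
```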

Later: GitHub Actions integration, OpenAPI?, good versioning and backward compatibility

@@ -4,7 +4,8 @@

Input:

- **name** (str): Name of the challenge.
- **category** (str[]): Category of the challenge such as 'basic', 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_
- **task** (str): The task that the agent needs to solve.
- **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function.
- **ground** (dict): The ground truth.
  - **should_contain** (list): The exact strings that are required in the final answer.
  - **should_not_contain** (list): The exact strings that should not be in the final answer.
  - **files** (list): Files that are used for retrieval. Can specify a file here or an extension.
- **mock** (dict): Mock response for testing.
  - **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
  - **mock_task** (str): Task to provide for the mock function.
- **info** (dict): Additional info about the challenge.
  - **difficulty** (str): The difficulty of this query.
  - **description** (str): Description of the challenge.

Example:

```json
{
  "name": "basic_write_file",
  "category": ["basic"],
  "task": "Print the capital of America to a .txt file",
  "dependencies": [],
  "ground": {
    "answer": "Washington",
    "should_contain": ["Washington"],
    "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
    "files": [".txt"]
  },
  "mock": {
    "mock_func": "basic_write_file_mock",
    "mock_task": "What is the capital of America?"
  },
  "info": {
    "difficulty": "basic",
    "description": "Tests the writing to file",
    "side_effects": ["tests if there is in fact an LLM attached"]
  }
}
```

Current Output:

@@ -1,5 +1,3 @@

```json
{
    "hostname": "localhost",
    "port": 8080,
    "workspace": "C:/Users/silen/miniagi"
}
```
@@ -1,20 +0,0 @@

```python
import json
import openai


def basic_gpt_agent(query) -> str:
    # Minimal agent: send the query to the OpenAI chat API and return the answer.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613", messages=[{"role": "user", "content": query}]
    )

    answer = response["choices"][0]["message"]["content"]  # type: ignore

    print("QUERY : ", query)
    print("AGENT ANSWER: ", answer)

    return answer


if __name__ == "__main__":
    # server boilerplate example here
    basic_gpt_agent("")
```
@@ -1,5 +1,4 @@

```python
from agbenchmark.Challenge import Challenge


def basic_read_file_mock(task: str, workspace: str):
```

@@ -18,9 +17,8 @@ def basic_write_file_mock(task: str, workspace: str):

```python
def basic_write_file_mock(task: str, workspace: str):
    """
    This mock writes to a file (creates one if it doesn't exist)
    """
    # Open the file in write mode.
    Challenge.write_to_file(
        workspace,
        "file_to_check.txt",
        "Washington DC is the capital of the United States of America",
    )
```
@@ -27,10 +27,6 @@ def start(category, noreg, mock):

```python
    # Prompt for initial config values when config.json is missing or empty.
    if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0:
        config = {}

        config["hostname"] = click.prompt(
            "\nPlease enter a new hostname", default="localhost"
        )
        config["port"] = click.prompt("Please enter a new port", default=8080)
        config["workspace"] = click.prompt(
            "Please enter a new workspace path",
            default=os.path.join(Path.home(), "miniagi"),
        )
```
@@ -1,5 +1,4 @@

```python
import os

import pytest

from agbenchmark.Challenge import Challenge
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
```
@@ -8,9 +7,7 @@ import os

```python
class TestReadFile(BasicChallenge):
    """Testing if LLM can read a file"""

    @pytest.fixture(scope="module", autouse=True)
    def setup_module(self, workspace):
        Challenge.write_to_file(
            workspace, self.data.ground.files[0], "this is how we're doing"
        )
```
@@ -1,5 +1,4 @@

```python
import os

import pytest

from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
```
@@ -3,12 +3,5 @@

```json
    "difficulty": "basic",
    "dependencies": [],
    "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]"
  },
  "TestReadFile": {
    "difficulty": "basic",
    "dependencies": [
      "basic_write_file"
    ],
    "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]"
  }
}
```
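
As an illustration of how the `--noreg` flag could consume a regression file like the one above to skip challenges that already passed, here is a conftest-style sketch; the file path and option wiring are assumptions, and only `pytest_collection_modifyitems` is real pytest API:

```python
import json
import os

import pytest

REGRESSION_FILE = "agbenchmark/tests/regression/regression_tests.json"  # assumed path


def pytest_collection_modifyitems(config, items):
    # With --noreg, skip every test recorded as already passing in the regression file.
    if not config.getoption("--noreg", default=False):
        return
    if not os.path.exists(REGRESSION_FILE):
        return
    with open(REGRESSION_FILE) as f:
        passed_nodeids = {entry["test"] for entry in json.load(f).values()}
    for item in items:
        if item.nodeid in passed_nodeids:
            item.add_marker(pytest.mark.skip(reason="already passed; skipped by --noreg"))
```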