diff --git a/.env.example b/.env.example
index 0a91118a..7782d048 100644
--- a/.env.example
+++ b/.env.example
@@ -1,4 +1,3 @@
-OPENAI_API_KEY=
 AGENT_NAME=mini-agi
 AGENT_TIMEOUT=60
 MOCK_TEST=False
\ No newline at end of file
diff --git a/README.md b/README.md
index 79427947..2c8daa0a 100644
--- a/README.md
+++ b/README.md
@@ -2,80 +2,70 @@
 
 A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work
 
+## As a user
+
+1. `pip install auto-gpt-benchmarks`
+2. Add boilerplate code to run and kill your agent
+3. `agbenchmark start`
+   - `--category challenge_category` to run tests in a specific category
+   - `--mock` to only run mock tests if they exist for each test
+   - `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previously passing challenge fails, it is removed from the regression tests
+4. We call the boilerplate code for your agent for each test
+5. Show pass rate of tests, logs, and any other metrics
+
+## Contributing
+
 ##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x
 
-### To run the basic existing mock (June 21)
+### To run the existing mocks
 
 1. clone the repo `auto-gpt-benchmarks`
 2. `pip install poetry`
 3. `poetry shell`
 4. `poetry install`
-5. `agbenchmark start`
+5. `cp .env.example .env`
+6. `agbenchmark start --mock`
 
 Keep config the same and watch the logs :)
 
+### To run with mini-agi
+
+1. Navigate to `auto-gpt-benchmarks/agent/mini-agi`
+2. `pip install -r requirements.txt`
+3. `cp .env_example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Set `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10 or higher installed
+4. Follow the commands above, but without the mock flag: `agbenchmark start`
+
 - To add requirements `poetry add requirement`.
 
 Feel free to create prs to merge with `main` at will (but also feel free to ask for review) - if you can't send msg in R&D chat for access.
 
-If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `main` to last working commit
+If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `master` to last working commit
 
 Let people know what beautiful code you write does, document everything well
 
 Share your progress :)
 
-## How this works
-
-1. `pip install auto-gpt-benchmarks`
-2. Add boilerplate code to start webserver to your agent (run loop and stop condition)
-3. `agbenchmark start --category challenge_category` remove challenge flag to run all tests. specify config of hostname, port, and workspace directory
-4. We call the server to run the agent for each test
-5. Show pass rate of tests, logs, and any other metrics
-
-### To run the basic existing mock (June 21)
-
-1. clone the repo `auto-gpt-benchmarks`
-2. `pip install poetry`
-3. `poetry shell`
-4. `poetry install`
-5. `agbenchmark start`
-
-Keep config the same and watch the logs :)
-
-#### Bonuses
-
-- You can adds tests by git cloning auto-gpt-benchmarks to your repo
-- Agent is abstracted from benchmark, don't need to do any extra setup other then starting the server
-- Simple, easy to use
-- Don't have to deal with cloud or parallelization yet
-
 ### Pytest
 
-to create a test:
+An example of a test is below. Use it as a template: change the class name, the .json name, what the test depends on and its name, and the scoring logic.
 
 ```python
 import pytest
-from agbenchmark.challenges.define_task_types import ChallengeData
-from ..CategoryChallenge import CategoryChallenge
+from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
 import os
 
-data = ChallengeData.deserialize(
-    os.path.join(os.path.dirname(__file__), "r_file_data.json")
-)
 
-class TestSomething(CategoryChallenge):
-    """Testing if LLM can read a file"""
+class TestWriteFile(BasicChallenge):
+    """Testing if LLM can write to a file"""
 
-    @pytest.mark.parametrize(
-        "run_agent",
-        [(data.task, data.mock_func)],
-        indirect=True,
-    )
-    def test_retrieval(
-        self, workspace
-    ):
-        # scoring logic goes here
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(os.path.dirname(__file__), "w_file_data.json")
+
+    @pytest.mark.depends(on=[], name="basic_write_file")
+    def test_method(self, workspace):
+        # implement scoring logic by looking at the workspace
+        pass
 ```
 
-All challenges will inherit from parent class which has the mark
+All challenges will inherit from a parent class which has the mark and any methods specific to their category
 
 ```python
 @pytest.mark.basic
@@ -83,50 +73,23 @@ class BasicChallenge(Challenge):
     pass
 ```
 
-If you want to add a custom mark to a Challenge, you must specify it before the test definition
-
-```python
-@pytest.mark.other_mark
-def test_retrieval(self, workspace):
-```
-
-To add a dependency to a challenge use the following
-
-```python
-# to defining what a test depends on
-from pytest_dependency import depends
-
-def test1(self, request, workspace):
-    depends(request, data.dependencies)
-# for defining a test as a dependency
-@pytest.mark.dependency()
-def test2
-```
-
-Ordering of challenges needs to be used in combination with the above to make sure it executes afterwards
-
-```python
-@pytest.mark.run(order=1)
-```
-
 To create a file to test a challenge, add this to the challenge file which will create a file before running the server
 
 ```python
-@pytest.fixture(scope="module", autouse=True)
-def setup_module(workspace):
-    if data.ground.should_contain:
+@pytest.fixture(
+    scope="module", autouse=True
+)  # this is specific to setting up a file for the test, not all tests have this
+def setup_module(self, workspace):
     Challenge.write_to_file(
-        workspace, data.ground.files[0], "this is how we're doing"
+        workspace, self.data.ground.files[0], "this is how we're doing"
     )
 ```
 
-## Api
-
-FastAPI with REST, import requests to call in auto-gpt-benchmarks. Boilerplate code given to agent project to start server
+#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py)
 
 ## Workspace
 
-Defined by the user on config
+If the `--mock` flag is used, the workspace is at `agbenchmark/mocks/workspace`.
+Otherwise, for mini-agi, it is at `C:/Users//miniagi` - it will be set automatically in the config.
 
 #### Dataset
 
@@ -138,9 +101,9 @@ Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.git
 |-- auto-gpt-benchmarks/ **main project directory**
 |   |-- metrics.py **combining scores, metrics, final evaluation**
 |   |-- start_benchmark.py **entry point from cli**
-|   |-- conftest.py **shared fixtures across all tests**
-|   |-- Challenge.py **easy challenge creation class?**
-|   |-- config.json **hostname, port, workspace folder**
+|   |-- conftest.py **config, workspace creation + teardown, regression test markers, parameterization**
+|   |-- Challenge.py **easy challenge creation class**
+|   |-- config.json **workspace folder**
 |   |-- challenges/ **challenges across different domains**
 |   |   |-- adaptability/
 |   |   |-- basic_abilities/
@@ -149,28 +112,7 @@ Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.git
 |   |   |-- retrieval/
 |   |   |-- web_navigation/
 |   |   |-- writing/
-|   |-- tests/ **challenges across different metrics**
-|   |   |-- basic_abilities/
-|   |   |-- interface/
-|   |-- workspace/ **workspace related func**
-|   |   |-- **init**.py
-|   |   |-- workspace_manager.py **creation, deletion**
+|   |-- tests/
+|   |   |-- basic_abilities/ **every llm should pass these challenges**
+|   |   |-- regression/ **challenges that already passed**
 ```
-
-### Easy Challenge Creation
-
-tbd, but potentially shared Challenge class that challenges instantiate as challenges need different utils/metrics for eval
-
-#### Written Challenges
-
-For code, writing we can create a reference text and use metrics like METEOR, BERTScore, BARTScore
-
-#### Validators
-
-Designed to handle specific types of output (e.g., text, code, structured data)
-
-#### Logging
-
-Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc
-
-Later: GitHub Actions integration, OpenAPI?, good versioning and backward compatibility
diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md
index e457b85c..9e74d19c 100644
--- a/agbenchmark/challenges/README.md
+++ b/agbenchmark/challenges/README.md
@@ -4,7 +4,8 @@
 
 Input:
 
-- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_
+- **name** (str): Name of the challenge.
+- **category** (str[]): Category of the challenge such as 'basic', 'retrieval', 'comprehension', etc. _this is not currently used; it may be needed in the future_
 - **task** (str): The task that the agent needs to solve.
 - **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function.
 - **ground** (dict): The ground truth.
@@ -12,7 +13,9 @@ Input:
   - **should_contain** (list): The exact strings that are required in the final answer.
   - **should_not_contain** (list): The exact strings that should not be in the final answer.
   - **files** (list): Files that are used for retrieval. Can specify file here or an extension.
-- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
+- **mock** (dict): Mock response for testing.
+  - **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
+  - **mock_task** (str): Task to provide for the mock function.
 - **info** (dict): Additional info about the challenge.
   - **difficulty** (str): The difficulty of this query.
   - **description** (str): Description of the challenge.
@@ -22,24 +25,26 @@ Example:
 
 ```python
 {
+    "name": "basic_write_file",
     "category": ["basic"],
-    "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
-    "dependencies": [
-        "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file"
-    ],
+    "task": "Print the capital of America to a .txt file",
+    "dependencies": [],
     "ground": {
-        "answer": "random string: this is how we're doing",
-        "should_contain": ["random string: this is how we're doing"],
-        "files": ["file_to_check.txt"]
+        "answer": "Washington",
+        "should_contain": ["Washington"],
+        "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
+        "files": [".txt"]
+    },
+    "mock": {
+        "mock_func": "basic_write_file_mock",
+        "mock_task": "What is the capital of America?"
     },
-    "mock_func": "basic_read_file_mock",
     "info": {
-        "description": "This reads the file quickly",
         "difficulty": "basic",
-        "side_effects": [""]
+        "description": "Tests writing to a file",
+        "side_effects": ["tests if there is in fact an LLM attached"]
     }
 }
-
 ```
 
 Current Output:
 
diff --git a/agbenchmark/config.json b/agbenchmark/config.json
index 9e5c1880..3de1dd64 100644
--- a/agbenchmark/config.json
+++ b/agbenchmark/config.json
@@ -1,5 +1,3 @@
 {
-  "hostname": "localhost",
-  "port": 8080,
-  "workspace": "C:/Users/silen/miniagi"
+  "hostname": "localhost"
 }
diff --git a/agbenchmark/mocks/basic_gpt_agent.py b/agbenchmark/mocks/basic_gpt_agent.py
deleted file mode 100644
index 6aac3d19..00000000
--- a/agbenchmark/mocks/basic_gpt_agent.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import json
-import openai
-
-
-def basic_gpt_agent(query) -> str:
-    response = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo-0613", messages=[{"role": "user", "content": query}]
-    )
-
-    answer = response["choices"][0]["message"]["content"]  # type: ignore
-
-    print("QUERY       : ", query)
-    print("AGENT ANSWER: ", answer)
-
-    return answer
-
-
-if __name__ == "__main__":
-    # server boilerplate example here
-    basic_gpt_agent("")
diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py
index 550095b7..631b30c2 100644
--- a/agbenchmark/mocks/tests/basic_mocks.py
+++ b/agbenchmark/mocks/tests/basic_mocks.py
@@ -1,5 +1,4 @@
 from agbenchmark.Challenge import Challenge
-from ..basic_gpt_agent import basic_gpt_agent
 
 
 def basic_read_file_mock(task: str, workspace: str):
@@ -18,9 +17,8 @@ def basic_write_file_mock(task: str, workspace: str):
     """
     This mock writes to a file (creates one if it doesn't exist)
     """
-
-    # Call the basic_gpt_agent to get a response.
-    response = basic_gpt_agent(task)
-
-    # Open the file in write mode.
-    Challenge.write_to_file(workspace, "file_to_check.txt", response)
+    Challenge.write_to_file(
+        workspace,
+        "file_to_check.txt",
+        "Washington DC is the capital of the United States of America",
+    )
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index ac612293..c9f3643c 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -27,10 +27,6 @@ def start(category, noreg, mock):
     if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0:
         config = {}
 
-        config["hostname"] = click.prompt(
-            "\nPlease enter a new hostname", default="localhost"
-        )
-        config["port"] = click.prompt("Please enter a new port", default=8080)
         config["workspace"] = click.prompt(
             "Please enter a new workspace path",
             default=os.path.join(Path.home(), "miniagi"),
diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
index 68288a42..f99ae608 100644
--- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
+++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
@@ -1,5 +1,4 @@
 import pytest
-from agbenchmark.challenges.define_task_types import ChallengeData
 from agbenchmark.Challenge import Challenge
 from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
 import os
@@ -8,9 +7,7 @@ import os
 class TestReadFile(BasicChallenge):
     """Testing if LLM can read a file"""
 
-    @pytest.fixture(
-        scope="module", autouse=True
-    )  # this is specific to setting up a file for the test, not all tests have this
+    @pytest.fixture(scope="module", autouse=True)
     def setup_module(self, workspace):
         Challenge.write_to_file(
             workspace, self.data.ground.files[0], "this is how we're doing"
diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
index 8caa6605..39c73b16 100644
--- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
+++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
@@ -1,5 +1,4 @@
 import pytest
-from agbenchmark.challenges.define_task_types import ChallengeData
 from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
 import os
diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json
index 8a6278fe..384f9e7c 100644
--- a/agbenchmark/tests/regression/regression_tests.json
+++ b/agbenchmark/tests/regression/regression_tests.json
@@ -3,12 +3,5 @@
     "difficulty": "basic",
     "dependencies": [],
     "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]"
-  },
-  "TestReadFile": {
-    "difficulty": "basic",
-    "dependencies": [
-      "basic_write_file"
-    ],
-    "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]"
   }
 }
\ No newline at end of file
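For illustration only, a minimal sketch of how the `test_method` placeholder from the README template above might be filled in. It assumes the `workspace` fixture is a directory path and that `self.data.ground` carries the `files`, `should_contain`, and `should_not_contain` fields described in `agbenchmark/challenges/README.md`; the class and marker names here are hypothetical, and the scoring uses plain stdlib calls rather than any repo helper.

```python
import glob
import os

import pytest

from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge


class TestWriteFileSketch(BasicChallenge):
    """Sketch only: an illustrative, filled-in version of the write-file template."""

    def get_file_path(self) -> str:  # all tests must implement this method
        return os.path.join(os.path.dirname(__file__), "w_file_data.json")

    @pytest.mark.depends(on=[], name="basic_write_file_sketch")
    def test_method(self, workspace):
        # "files" entries may be a filename or just an extension (e.g. ".txt"),
        # so glob for anything in the workspace that matches.
        pattern = os.path.join(workspace, "*" + self.data.ground.files[0])
        matches = glob.glob(pattern)
        assert matches, f"no file matching {pattern} was produced"

        with open(matches[0]) as f:
            content = f.read()

        # Score by checking the ground-truth strings from the challenge JSON.
        for phrase in self.data.ground.should_contain:
            assert phrase in content
        for phrase in self.data.ground.should_not_contain:
            assert phrase not in content
```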