From ce4cefe7e7211025994a4eab84c3a96209e705cb Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 16 Jul 2023 21:24:06 -0400 Subject: [PATCH] Dynamic home path for runs (#119) --- .env.example | 2 +- agbenchmark/README.md | 72 ++------- agbenchmark/agent_interface.py | 9 +- agbenchmark/config.json | 3 +- agbenchmark/internal_info.json | 12 +- agbenchmark/reports/file1_07-14-18-54.json | 147 ------------------ .../reports/mini-agi/file1_07-16-13-07.json | 23 +++ agbenchmark/start_benchmark.py | 20 +-- agbenchmark/utils.py | 81 ++++++++-- agent/Auto-GPT | 2 +- agent/SuperAGI | 2 +- agent/gpt-engineer | 2 +- agent/mini-agi | 2 +- agent/smol-developer | 2 +- 14 files changed, 135 insertions(+), 244 deletions(-) delete mode 100644 agbenchmark/reports/file1_07-14-18-54.json create mode 100644 agbenchmark/reports/mini-agi/file1_07-16-13-07.json diff --git a/.env.example b/.env.example index e50ed58a..197810bb 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,3 @@ AGENT_NAME=mini-agi -ENVIRONMENT=local +HOME_ENV= MOCK_TEST=False \ No newline at end of file diff --git a/agbenchmark/README.md b/agbenchmark/README.md index 42e2bd4d..c814e6cf 100644 --- a/agbenchmark/README.md +++ b/agbenchmark/README.md @@ -40,45 +40,6 @@ Let people know what beautiful code you write does, document everything well Share your progress :) -### Pytest - -an example of a test is below, use it as a template and change the class name, the .json name, what the test depends on and it's name, and the scoring logic - -```python -import pytest -from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge -import os - - -class TestWriteFile(BasicChallenge): - """Testing if LLM can write to a file""" - - def test_method(self, config): - # implement scoring logic by looking at workspace -``` - -All challenges will inherit from parent class which has the mark and any specific methods for their category - -```python -@pytest.mark.basic -class BasicChallenge(Challenge): - pass -``` - -Add the below to create a file in the workspace prior to running a challenge. Only use when a file is needed to be created in the workspace prior to a test, such as with the read_file_test. - -```python -@pytest.fixture( - scope="module", autouse=True - ) # this is specific to setting up a file for the test, not all tests have this - def setup_module(self, workspace): - Challenge.write_to_file( - workspace, self.data.ground.files[0], "this is how we're doing" - ) -``` - -#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py) - ## Workspace If `--mock` flag is used it is at `agbenchmark/workspace`. Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automitcally set on config @@ -87,29 +48,7 @@ If `--mock` flag is used it is at `agbenchmark/workspace`. 
Otherwise for mini-ag Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.github.io/Mind2Web/ -## Repo - -``` -|-- auto-gpt-benchmarks/ **main project directory** -| |-- metrics.py **combining scores, metrics, final evaluation** -| |-- start_benchmark.py **entry point from cli** -| |-- conftest.py **config, workspace creation + teardown, regression tesst markers, parameterization** -| |-- Challenge.py **easy challenge creation class** -| |-- config.json **workspace folder** -| |-- challenges/ **challenges across different domains** -| | |-- adaptability/ -| | |-- basic_abilities/ -| | |-- code/ -| | |-- memory/ -| | |-- retrieval/ -| | |-- web_navigation/ -| | |-- writing/ -| |-- tests/ -| | |-- basic_abilities/ **every llm should pass these challenges** -| | |-- regression/ **challenges that already passed** -``` - -## How to add new agents to agbenchmark ? +## How do I add new agents to agbenchmark ? Example with smol developer. @@ -120,3 +59,12 @@ https://github.com/smol-ai/developer/pull/114/files 2- Create the submodule and the github workflow by following the same pattern as this example: https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files + +## How do I run agent in different environments? + +**To just use as the benchmark for your agent**. `pip install` the package and run `agbenchmark start` + +**For internal Auto-GPT ci runs**, specify the `AGENT_NAME` you want you use and set the `HOME_ENV`. +Ex. `HOME_ENV=ci AGENT_NAME=mini-agi` + +**To develop agent alongside benchmark**, you can specify the `AGENT_NAME` you want you use and add as a submodule to the repo diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 897f4f8c..ff5bc890 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -7,7 +7,7 @@ from typing import Any, Dict from dotenv import load_dotenv -from agbenchmark.start_benchmark import CURRENT_DIRECTORY +from agbenchmark.start_benchmark import CURRENT_DIRECTORY, HOME_DIRECTORY load_dotenv() @@ -25,13 +25,16 @@ def run_agent( config["workspace"], "artifacts_out", challenge_location ) else: - print(f"Running Python function '{config['entry_path']}' with timeout {cutoff}") - command = [sys.executable, "-m", config["entry_path"], str(task)] + entry_path = "agbenchmark.benchmarks" + + print(f"Running Python function '{entry_path}' with timeout {cutoff}") + command = [sys.executable, "-m", entry_path, str(task)] process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, + cwd=HOME_DIRECTORY, ) start_time = time.time() diff --git a/agbenchmark/config.json b/agbenchmark/config.json index 820f133b..3a03b741 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,4 +1,3 @@ { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" + "workspace": "${os.path.join(Path.home(), 'miniagi')}" } diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json index 95a051d5..0e34ad7a 100644 --- a/agbenchmark/internal_info.json +++ b/agbenchmark/internal_info.json @@ -15,6 +15,8 @@ false ], "TestDebugSimpleTypoWithGuidance": [ + false, + false, false, false, false @@ -25,6 +27,7 @@ false ], "TestReadFile": [ + true, true, true, true @@ -55,6 +58,7 @@ true ], "TestSearch": [ + true, true, true, true @@ -68,6 +72,12 @@ false, false, true, - false + false, + true, + false, + false, + false, + false, + true ] } \ No newline at end of file diff --git 
a/agbenchmark/reports/file1_07-14-18-54.json b/agbenchmark/reports/file1_07-14-18-54.json deleted file mode 100644 index f81d19d3..00000000 --- a/agbenchmark/reports/file1_07-14-18-54.json +++ /dev/null @@ -1,147 +0,0 @@ -{ - "command": "agbenchmark start --mock", - "completion_time": "2023-07-14-18:54", - "metrics": { - "run_time": "0.97 seconds", - "highest_difficulty": "advanced: 5" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "non_mock_success_%": 75.0, - "run_time": "0.007 seconds" - } - }, - "TestReadFile": { - "data_path": "agbenchmark/challenges/interface/read_file", - "is_regression": true, - "metrics": { - "difficulty": "interface", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.008 seconds" - } - }, - "TestSearch": { - "data_path": "agbenchmark/challenges/interface/search", - "is_regression": true, - "metrics": { - "difficulty": "interface", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.007 seconds" - } - }, - "TestDebugSimpleTypoWithGuidance": { - "data_path": "agbenchmark/challenges/code/d1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "non_mock_success_%": 0.0, - "run_time": "0.448 seconds" - } - }, - "TestBasicMemory": { - "data_path": "agbenchmark/challenges/memory/m1", - "is_regression": true, - "metrics": { - "difficulty": "basic", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.028 seconds" - } - }, - "TestBasicRetrieval": { - "data_path": "agbenchmark/challenges/retrieval/r1", - "is_regression": true, - "metrics": { - "difficulty": "basic", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.014 seconds" - } - }, - "TestDebugSimpleTypoWithoutGuidance": { - "data_path": "agbenchmark/challenges/code/d2", - "is_regression": false, - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "non_mock_success_%": 0.0, - "run_time": "0.001 seconds" - } - }, - "TestCreateSimpleWebServer": { - "data_path": "agbenchmark/challenges/code/d3", - "is_regression": false, - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "non_mock_success_%": 0.0, - "run_time": "0.002 seconds" - } - }, - "TestRememberMultipleIds": { - "data_path": "agbenchmark/challenges/memory/m2", - "is_regression": true, - "metrics": { - "difficulty": "novice", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.023 seconds" - } - }, - "TestRetrieval2": { - "data_path": "agbenchmark/challenges/retrieval/r2", - "is_regression": true, - "metrics": { - "difficulty": "novice", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.013 seconds" - } - }, - "TestRememberMultipleIdsWithNoise": { - "data_path": "agbenchmark/challenges/memory/m3", - "is_regression": true, - "metrics": { - "difficulty": "intermediate", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.03 seconds" - } - }, - "TestRetrieval3": { - "data_path": 
"agbenchmark/challenges/retrieval/r3", - "is_regression": true, - "metrics": { - "difficulty": "intermediate", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.016 seconds" - } - }, - "TestRememberMultiplePhrasesWithNoise": { - "data_path": "agbenchmark/challenges/memory/m4", - "is_regression": true, - "metrics": { - "difficulty": "advanced", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.034 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" - } -} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/file1_07-16-13-07.json b/agbenchmark/reports/mini-agi/file1_07-16-13-07.json new file mode 100644 index 00000000..78bafc5f --- /dev/null +++ b/agbenchmark/reports/mini-agi/file1_07-16-13-07.json @@ -0,0 +1,23 @@ +{ + "command": "agbenchmark start --test TestWriteFile", + "completion_time": "2023-07-16-13:07", + "metrics": { + "run_time": "13.91 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 30.0, + "run_time": "13.684 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index b31c9f5f..ea17d152 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -6,20 +6,17 @@ from typing import Any import click import pytest -from dotenv import load_dotenv -load_dotenv() - -from agbenchmark.utils import calculate_info_test_path +from agbenchmark.utils import calculate_dynamic_paths CURRENT_DIRECTORY = Path(__file__).resolve().parent -benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark" - -CONFIG_PATH = str(benchmarks_folder_path / "config.json") -REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json") - -INFO_TESTS_PATH = calculate_info_test_path(benchmarks_folder_path) +( + HOME_DIRECTORY, + CONFIG_PATH, + REGRESSION_TESTS_PATH, + INFO_TESTS_PATH, +) = calculate_dynamic_paths() @click.group() @@ -48,9 +45,6 @@ def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) - ) return 1 - if not benchmarks_folder_path.exists(): - benchmarks_folder_path.mkdir(exist_ok=True) - print(CONFIG_PATH, os.path.exists(CONFIG_PATH), os.stat(CONFIG_PATH).st_size) if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: config = {} diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index 506c4884..c69509c7 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -6,25 +6,28 @@ from datetime import datetime from pathlib import Path from typing import Any +from dotenv import load_dotenv + +load_dotenv() + from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyLevel +AGENT_NAME = os.getenv("AGENT_NAME") +HOME_ENV = os.getenv("HOME_ENV") -def calculate_info_test_path(benchmarks_folder_path: Path) -> str: - INFO_TESTS_PATH = ( - benchmarks_folder_path / os.getenv("REPORT_LOCATION", ".") / "reports" - ) - - if not INFO_TESTS_PATH.exists(): - INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True) +def calculate_info_test_path(reports_path: Path) -> str: + print("reports_pathreports_pathreports_pathreports_path", reports_path) + if not reports_path.exists(): + reports_path.mkdir(parents=True, exist_ok=True) return str( 
- INFO_TESTS_PATH / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json" + reports_path / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json" ) else: - json_files = glob.glob(str(INFO_TESTS_PATH / "*.json")) + json_files = glob.glob(str(reports_path / "*.json")) file_count = len(json_files) run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json" - new_file_path = INFO_TESTS_PATH / run_name + new_file_path = reports_path / run_name return str(new_file_path) @@ -79,3 +82,61 @@ def get_highest_success_difficulty(data: dict) -> str: highest_difficulty_str = "" return f"{highest_difficulty_str}: {highest_difficulty_level}" + + +def assign_paths(folder_path: Path) -> tuple[str, str, str]: + CONFIG_PATH = str(folder_path / "config.json") + REGRESSION_TESTS_PATH = str(folder_path / "regression_tests.json") + + if HOME_ENV == "ci" and AGENT_NAME: + INFO_TESTS_PATH = calculate_info_test_path( + Path(os.getcwd()) / "agbenchmark" / "reports" / AGENT_NAME + ) + else: + INFO_TESTS_PATH = calculate_info_test_path(folder_path / "reports") + + return CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH + + +def calculate_dynamic_paths() -> tuple[Path, str, str, str]: + # the default home is where you're running from + HOME_DIRECTORY = Path(os.getcwd()) + benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark" + + if AGENT_NAME and HOME_ENV == "ci": + if "/Auto-GPT-Benchmarks/agent" in str(HOME_DIRECTORY): + raise Exception("Must run from root of benchmark repo if HOME_ENV is ci") + + # however if the env is local and the agent name is defined, we want to run that agent from the repo and then get the data in the internal agbenchmark directory + # this is for the ci/cd pipeline + benchmarks_folder_path = HOME_DIRECTORY / "agent" / AGENT_NAME / "agbenchmark" + + CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths( + benchmarks_folder_path + ) + + # we want to run the agent from the submodule + HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME + + elif AGENT_NAME and not os.path.join("Auto-GPT-Benchmarks", "agent") in str( + HOME_DIRECTORY + ): + # if the agent name is defined but the run is not from the agent repo, then home is the agent repo + # used for development of both a benchmark and an agent + HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME + benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark" + + CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths( + benchmarks_folder_path + ) + else: + # otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo) + # used when its just a pip install + CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths( + benchmarks_folder_path + ) + + if not benchmarks_folder_path.exists(): + benchmarks_folder_path.mkdir(exist_ok=True) + + return HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH diff --git a/agent/Auto-GPT b/agent/Auto-GPT index 62ad7aa8..114c484b 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit 62ad7aa8c9172f8b07cad939e215912088d6dc16 +Subproject commit 114c484b5cfe9a69a74ddcc00025d4a126f54120 diff --git a/agent/SuperAGI b/agent/SuperAGI index f880b246..ae3b89a3 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit f880b24644fbd057d44e8b4390f3ac165c90249b +Subproject commit ae3b89a325994c9dda74b5de39d6f7c48010270f diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 9bb81041..a1d9673f 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ 
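The branching above is easiest to see stripped down. A simplified sketch of how `calculate_dynamic_paths` picks the home directory; `resolve_home` is a hypothetical helper, and it omits the path assignment and mkdir steps:

```python
import os
from pathlib import Path
from typing import Optional


def resolve_home(agent_name: Optional[str], home_env: Optional[str]) -> Path:
    """Sketch of how HOME_DIRECTORY is chosen; not the exact code in this patch."""
    cwd = Path(os.getcwd())
    if agent_name and home_env == "ci":
        # CI: run from the benchmark repo root; the agent lives in its submodule
        return cwd / "agent" / agent_name
    if agent_name and os.path.join("Auto-GPT-Benchmarks", "agent") not in str(cwd):
        # side-by-side development of the benchmark and an agent
        return cwd / "agent" / agent_name
    # plain pip install: home is wherever `agbenchmark start` was invoked
    return cwd


# e.g. a CI run against the mini-agi submodule
print(resolve_home("mini-agi", "ci"))
```

diff --git a/agent/Auto-GPT b/agent/Auto-GPT
index 62ad7aa8..114c484b 160000
--- a/agent/Auto-GPT
+++ b/agent/Auto-GPT
@@ -1 +1 @@
-Subproject commit 62ad7aa8c9172f8b07cad939e215912088d6dc16
+Subproject commit 114c484b5cfe9a69a74ddcc00025d4a126f54120
diff --git a/agent/SuperAGI b/agent/SuperAGI
index f880b246..ae3b89a3 160000
--- a/agent/SuperAGI
+++ b/agent/SuperAGI
@@ -1 +1 @@
-Subproject commit f880b24644fbd057d44e8b4390f3ac165c90249b
+Subproject commit ae3b89a325994c9dda74b5de39d6f7c48010270f
diff --git a/agent/gpt-engineer b/agent/gpt-engineer
index 9bb81041..a1d9673f 160000
--- a/agent/gpt-engineer
+++ b/agent/gpt-engineer
@@ -1 +1 @@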
-Subproject commit 9bb81041ace9f09e8ea0e34e29f2e46bb9d46a36 +Subproject commit a1d9673f82ffce89a9b437e1b54d2e068160819d diff --git a/agent/mini-agi b/agent/mini-agi index 0f8eba95..bb02bf0d 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 0f8eba95d284a9a06801b40ae02c55f65f1a0ce9 +Subproject commit bb02bf0d5cdbf045ff145271b78e4b4ee7225011 diff --git a/agent/smol-developer b/agent/smol-developer index a23d0136..bec01917 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit a23d01369cea976e80b7889fdbf1096619471301 +Subproject commit bec01917a9fa6e7bd73e4d14b328dba468cae495
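One closing note on the reports this patch writes: `calculate_info_test_path` numbers report files per reports directory, so each run produces `file<N>_<MM-DD-HH-MM>.json` with `N` one greater than the count of existing JSON reports. A minimal sketch of that naming scheme; `next_report_path` is a hypothetical helper, not the function in the patch:

```python
import glob
from datetime import datetime
from pathlib import Path


def next_report_path(reports_path: Path) -> str:
    # Count existing reports so runs are numbered file1, file2, ...
    reports_path.mkdir(parents=True, exist_ok=True)
    file_count = len(glob.glob(str(reports_path / "*.json")))
    timestamp = datetime.now().strftime("%m-%d-%H-%M")
    return str(reports_path / f"file{file_count + 1}_{timestamp}.json")


# A first CI run for mini-agi yields something like
# agbenchmark/reports/mini-agi/file1_07-16-13-07.json
print(next_report_path(Path("agbenchmark") / "reports" / "mini-agi"))
```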