Dynamic home path for runs (#119)

Silen Naihin
2023-07-16 21:24:06 -04:00
committed by GitHub
parent 5c7acbc719
commit ce4cefe7e7
14 changed files with 135 additions and 244 deletions

View File

@@ -1,3 +1,3 @@
AGENT_NAME=mini-agi
ENVIRONMENT=local
HOME_ENV=
MOCK_TEST=False

View File

@@ -40,45 +40,6 @@ Let people know what beautiful code you write does, document everything well
Share your progress :)
### Pytest
An example of a test is below; use it as a template and change the class name, the .json name, what the test depends on and its name, and the scoring logic
```python
import pytest
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os


class TestWriteFile(BasicChallenge):
    """Testing if LLM can write to a file"""

    def test_method(self, config):
        # implement scoring logic by looking at the workspace
        ...
```
All challenges will inherit from a parent class, which carries the pytest mark and any methods specific to their category
```python
@pytest.mark.basic
class BasicChallenge(Challenge):
    pass
```
Add the snippet below to create a file in the workspace before running a challenge. Only use it when a file needs to exist in the workspace before a test runs, such as with the read_file_test.
```python
@pytest.fixture(
    scope="module", autouse=True
)  # this is specific to setting up a file for the test, not all tests have this
def setup_module(self, workspace):
    Challenge.write_to_file(
        workspace, self.data.ground.files[0], "this is how we're doing"
    )
```
#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py)
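For orientation only, here is a heavily simplified sketch of the surface this guide relies on (`data`, `write_to_file`, `test_method`); it is not the real implementation, which lives in `Challenge.py` as linked above.
```python
# Heavily simplified sketch -- NOT the real Challenge.py, only the pieces
# referenced elsewhere in this document.
import os


class Challenge:
    data = None  # populated from the challenge's .json (task, ground.files, scoring info)

    @staticmethod
    def write_to_file(workspace: str, filename: str, content: str) -> None:
        # helper used by setup fixtures to seed the workspace before a test runs
        with open(os.path.join(workspace, filename), "w") as f:
            f.write(content)

    def test_method(self, config: dict) -> None:
        # each concrete challenge overrides this with its own scoring logic
        raise NotImplementedError
```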
## Workspace
If the `--mock` flag is used, the workspace is at `agbenchmark/workspace`. Otherwise, for mini-agi it is at `C:/Users/<name>/miniagi` - it will be automatically set in the config
@@ -87,29 +48,7 @@ If `--mock` flag is used it is at `agbenchmark/workspace`. Otherwise for mini-ag
Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.github.io/Mind2Web/
## Repo
```
|-- auto-gpt-benchmarks/ **main project directory**
| |-- metrics.py **combining scores, metrics, final evaluation**
| |-- start_benchmark.py **entry point from cli**
| |-- conftest.py **config, workspace creation + teardown, regression test markers, parameterization**
| |-- Challenge.py **easy challenge creation class**
| |-- config.json **workspace folder**
| |-- challenges/ **challenges across different domains**
| | |-- adaptability/
| | |-- basic_abilities/
| | |-- code/
| | |-- memory/
| | |-- retrieval/
| | |-- web_navigation/
| | |-- writing/
| |-- tests/
| | |-- basic_abilities/ **every llm should pass these challenges**
| | |-- regression/ **challenges that already passed**
```
## How to add new agents to agbenchmark?
## How do I add new agents to agbenchmark?
Example with smol developer.
@@ -120,3 +59,12 @@ https://github.com/smol-ai/developer/pull/114/files
2- Create the submodule and the GitHub workflow by following the same pattern as this example:
https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files
## How do I run agent in different environments?
**To just use it as the benchmark for your agent**, `pip install` the package and run `agbenchmark start`.
**For internal Auto-GPT CI runs**, specify the `AGENT_NAME` you want to use and set `HOME_ENV` (see the sketch below).
Ex. `HOME_ENV=ci AGENT_NAME=mini-agi`
**To develop an agent alongside the benchmark**, specify the `AGENT_NAME` you want to use and add the agent as a submodule to the repo.
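A minimal Python sketch of how these variables are consumed (it mirrors the branching added to `agbenchmark/utils.py` further down in this diff, summarized rather than copied):
```python
import os

AGENT_NAME = os.getenv("AGENT_NAME")  # e.g. "mini-agi"
HOME_ENV = os.getenv("HOME_ENV")      # "ci" for internal Auto-GPT CI runs

if HOME_ENV == "ci" and AGENT_NAME:
    # CI: run from the benchmark repo root; the agent lives in agent/<AGENT_NAME>
    mode = "ci"
elif AGENT_NAME:
    # develop an agent alongside the benchmark, added as a submodule
    mode = "submodule development"
else:
    # plain pip install: run agbenchmark from inside your agent repo
    mode = "pip install"
print(mode)
```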

View File

@@ -7,7 +7,7 @@ from typing import Any, Dict
from dotenv import load_dotenv
from agbenchmark.start_benchmark import CURRENT_DIRECTORY
from agbenchmark.start_benchmark import CURRENT_DIRECTORY, HOME_DIRECTORY
load_dotenv()
@@ -25,13 +25,16 @@ def run_agent(
config["workspace"], "artifacts_out", challenge_location
)
else:
print(f"Running Python function '{config['entry_path']}' with timeout {cutoff}")
command = [sys.executable, "-m", config["entry_path"], str(task)]
entry_path = "agbenchmark.benchmarks"
print(f"Running Python function '{entry_path}' with timeout {cutoff}")
command = [sys.executable, "-m", entry_path, str(task)]
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
cwd=HOME_DIRECTORY,
)
start_time = time.time()
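For context (not part of this commit): the command above runs `python -m agbenchmark.benchmarks <task>` with `HOME_DIRECTORY` as the working directory, so the agent's entry module receives the task string as its first argument. A hypothetical minimal sketch of such a module:
```python
# Hypothetical agbenchmark/benchmarks.py inside an agent repo -- illustrative only;
# the function name run_specific_agent is an assumption, not taken from this commit.
import sys


def run_specific_agent(task: str) -> None:
    # hand the benchmark task off to the agent; a real agent does its work here
    print(f"agent received task: {task}")


if __name__ == "__main__":
    # invoked by the benchmark as: python -m agbenchmark.benchmarks "<task>"
    run_specific_agent(sys.argv[1])
```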

View File

@@ -1,4 +1,3 @@
{
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "agbenchmark.benchmarks"
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
}
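The `${...}` value above is a Python expression template rather than a literal path. A minimal sketch of how such a template could be expanded at load time (the real loader may differ; this is only an illustration):
```python
# Illustrative expansion of a "${...}" workspace template -- not the actual loader.
import os
import re
from pathlib import Path

raw = "${os.path.join(Path.home(), 'miniagi')}"
expr = re.fullmatch(r"\$\{(.+)\}", raw).group(1)
workspace = eval(expr, {"os": os, "Path": Path})  # e.g. /home/<name>/miniagi
print(workspace)
```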

View File

@@ -15,6 +15,8 @@
false
],
"TestDebugSimpleTypoWithGuidance": [
false,
false,
false,
false,
false
@@ -25,6 +27,7 @@
false
],
"TestReadFile": [
true,
true,
true,
true
@@ -55,6 +58,7 @@
true
],
"TestSearch": [
true,
true,
true,
true
@@ -68,6 +72,12 @@
false,
false,
true,
false
false,
true,
false,
false,
false,
false,
true
]
}

View File

@@ -1,147 +0,0 @@
{
"command": "agbenchmark start --mock",
"completion_time": "2023-07-14-18:54",
"metrics": {
"run_time": "0.97 seconds",
"highest_difficulty": "advanced: 5"
},
"tests": {
"TestWriteFile": {
"data_path": "agbenchmark/challenges/interface/write_file",
"is_regression": false,
"metrics": {
"difficulty": "interface",
"success": true,
"non_mock_success_%": 75.0,
"run_time": "0.007 seconds"
}
},
"TestReadFile": {
"data_path": "agbenchmark/challenges/interface/read_file",
"is_regression": true,
"metrics": {
"difficulty": "interface",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.008 seconds"
}
},
"TestSearch": {
"data_path": "agbenchmark/challenges/interface/search",
"is_regression": true,
"metrics": {
"difficulty": "interface",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.007 seconds"
}
},
"TestDebugSimpleTypoWithGuidance": {
"data_path": "agbenchmark/challenges/code/d1",
"is_regression": false,
"metrics": {
"difficulty": "basic",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"non_mock_success_%": 0.0,
"run_time": "0.448 seconds"
}
},
"TestBasicMemory": {
"data_path": "agbenchmark/challenges/memory/m1",
"is_regression": true,
"metrics": {
"difficulty": "basic",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.028 seconds"
}
},
"TestBasicRetrieval": {
"data_path": "agbenchmark/challenges/retrieval/r1",
"is_regression": true,
"metrics": {
"difficulty": "basic",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.014 seconds"
}
},
"TestDebugSimpleTypoWithoutGuidance": {
"data_path": "agbenchmark/challenges/code/d2",
"is_regression": false,
"metrics": {
"difficulty": "novice",
"success": false,
"fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"non_mock_success_%": 0.0,
"run_time": "0.001 seconds"
}
},
"TestCreateSimpleWebServer": {
"data_path": "agbenchmark/challenges/code/d3",
"is_regression": false,
"metrics": {
"difficulty": "advanced",
"success": false,
"fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"non_mock_success_%": 0.0,
"run_time": "0.002 seconds"
}
},
"TestRememberMultipleIds": {
"data_path": "agbenchmark/challenges/memory/m2",
"is_regression": true,
"metrics": {
"difficulty": "novice",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.023 seconds"
}
},
"TestRetrieval2": {
"data_path": "agbenchmark/challenges/retrieval/r2",
"is_regression": true,
"metrics": {
"difficulty": "novice",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.013 seconds"
}
},
"TestRememberMultipleIdsWithNoise": {
"data_path": "agbenchmark/challenges/memory/m3",
"is_regression": true,
"metrics": {
"difficulty": "intermediate",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.03 seconds"
}
},
"TestRetrieval3": {
"data_path": "agbenchmark/challenges/retrieval/r3",
"is_regression": true,
"metrics": {
"difficulty": "intermediate",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.016 seconds"
}
},
"TestRememberMultiplePhrasesWithNoise": {
"data_path": "agbenchmark/challenges/memory/m4",
"is_regression": true,
"metrics": {
"difficulty": "advanced",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.034 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "agbenchmark.benchmarks"
}
}

View File

@@ -0,0 +1,23 @@
{
"command": "agbenchmark start --test TestWriteFile",
"completion_time": "2023-07-16-13:07",
"metrics": {
"run_time": "13.91 seconds",
"highest_difficulty": "interface: 1"
},
"tests": {
"TestWriteFile": {
"data_path": "agbenchmark/challenges/interface/write_file",
"is_regression": false,
"metrics": {
"difficulty": "interface",
"success": true,
"success_%": 30.0,
"run_time": "13.684 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
}
}

View File

@@ -6,20 +6,17 @@ from typing import Any
import click
import pytest
from dotenv import load_dotenv
load_dotenv()
from agbenchmark.utils import calculate_info_test_path
from agbenchmark.utils import calculate_dynamic_paths
CURRENT_DIRECTORY = Path(__file__).resolve().parent
benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark"
CONFIG_PATH = str(benchmarks_folder_path / "config.json")
REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json")
INFO_TESTS_PATH = calculate_info_test_path(benchmarks_folder_path)
(
HOME_DIRECTORY,
CONFIG_PATH,
REGRESSION_TESTS_PATH,
INFO_TESTS_PATH,
) = calculate_dynamic_paths()
@click.group()
@@ -48,9 +45,6 @@ def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) -
)
return 1
if not benchmarks_folder_path.exists():
benchmarks_folder_path.mkdir(exist_ok=True)
print(CONFIG_PATH, os.path.exists(CONFIG_PATH), os.stat(CONFIG_PATH).st_size)
if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
config = {}

View File

@@ -6,25 +6,28 @@ from datetime import datetime
from pathlib import Path
from typing import Any
from dotenv import load_dotenv
load_dotenv()
from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyLevel
AGENT_NAME = os.getenv("AGENT_NAME")
HOME_ENV = os.getenv("HOME_ENV")
def calculate_info_test_path(benchmarks_folder_path: Path) -> str:
INFO_TESTS_PATH = (
benchmarks_folder_path / os.getenv("REPORT_LOCATION", ".") / "reports"
)
if not INFO_TESTS_PATH.exists():
INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True)
def calculate_info_test_path(reports_path: Path) -> str:
print("reports_pathreports_pathreports_pathreports_path", reports_path)
if not reports_path.exists():
reports_path.mkdir(parents=True, exist_ok=True)
return str(
INFO_TESTS_PATH / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json"
reports_path / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json"
)
else:
json_files = glob.glob(str(INFO_TESTS_PATH / "*.json"))
json_files = glob.glob(str(reports_path / "*.json"))
file_count = len(json_files)
run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json"
new_file_path = INFO_TESTS_PATH / run_name
new_file_path = reports_path / run_name
return str(new_file_path)
@@ -79,3 +82,61 @@ def get_highest_success_difficulty(data: dict) -> str:
highest_difficulty_str = ""
return f"{highest_difficulty_str}: {highest_difficulty_level}"
def assign_paths(folder_path: Path) -> tuple[str, str, str]:
CONFIG_PATH = str(folder_path / "config.json")
REGRESSION_TESTS_PATH = str(folder_path / "regression_tests.json")
if HOME_ENV == "ci" and AGENT_NAME:
INFO_TESTS_PATH = calculate_info_test_path(
Path(os.getcwd()) / "agbenchmark" / "reports" / AGENT_NAME
)
else:
INFO_TESTS_PATH = calculate_info_test_path(folder_path / "reports")
return CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH
def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
# the default home is where you're running from
HOME_DIRECTORY = Path(os.getcwd())
benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark"
if AGENT_NAME and HOME_ENV == "ci":
if "/Auto-GPT-Benchmarks/agent" in str(HOME_DIRECTORY):
raise Exception("Must run from root of benchmark repo if HOME_ENV is ci")
# however if the env is local and the agent name is defined, we want to run that agent from the repo and then get the data in the internal agbenchmark directory
# this is for the ci/cd pipeline
benchmarks_folder_path = HOME_DIRECTORY / "agent" / AGENT_NAME / "agbenchmark"
CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths(
benchmarks_folder_path
)
# we want to run the agent from the submodule
HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME
elif AGENT_NAME and not os.path.join("Auto-GPT-Benchmarks", "agent") in str(
HOME_DIRECTORY
):
# if the agent name is defined but the run is not from the agent repo, then home is the agent repo
# used for development of both a benchmark and an agent
HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME
benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark"
CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths(
benchmarks_folder_path
)
else:
# otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo)
# used when its just a pip install
CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths(
benchmarks_folder_path
)
if not benchmarks_folder_path.exists():
benchmarks_folder_path.mkdir(exist_ok=True)
return HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH
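A hedged example of what the returned tuple resolves to in the default (pip install) branch above; the working directory is hypothetical:
```python
# Illustrative resolution for the default branch, assuming agbenchmark is
# started from /home/user/my-agent (hypothetical path).
from pathlib import Path

cwd = Path("/home/user/my-agent")
HOME_DIRECTORY = cwd                          # default home is where you run from
benchmarks_folder_path = cwd / "agbenchmark"  # created if it does not exist
CONFIG_PATH = str(benchmarks_folder_path / "config.json")
REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json")
# INFO_TESTS_PATH becomes a timestamped report file under <cwd>/agbenchmark/reports/
print(HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH)
```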