diff --git a/.github/workflows/autogpt.yml b/.github/workflows/autogpt.yml
new file mode 100644
index 00000000..2b192511
--- /dev/null
+++ b/.github/workflows/autogpt.yml
@@ -0,0 +1,62 @@
+name: Auto-GPT Regression Test
+
+on:
+  workflow_dispatch:
+
+jobs:
+  regression-tests:
+    permissions:
+      pull-requests: write
+      contents: write
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.ref }}
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          submodules: true
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - id: get_date
+        name: Get date
+        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+
+      - name: Install Poetry
+        run: |
+          curl -sSL https://install.python-poetry.org | python -
+
+      - name: Set up Poetry cache
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/.cache/pypoetry
+            .venv
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }}
+
+      - name: Set up venv and install Python dependencies
+        run: |
+          python -m venv venv
+          source venv/bin/activate
+          poetry install
+
+      - name: Build project
+        run: |
+          source venv/bin/activate
+          poetry build
+          cd agent/Auto-GPT
+          pip install -r requirements.txt
+          pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl
+          agbenchmark start --reg
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
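The cache key in the workflow above combines the runner OS, hashes of pyproject.toml and poetry.lock, and a daily date stamp from the get_date step, so the Poetry cache is invalidated whenever dependencies change and rolls over at most once per day. A rough Python sketch of that scheme (illustrative only; GitHub's hashFiles has its own implementation, and the file names are assumed to sit in the repo root):

```python
# Illustrative sketch of the workflow's cache-key scheme, not GitHub's actual
# implementation: the key changes when pyproject.toml or poetry.lock changes,
# and rolls over daily via the date stamp.
import datetime
import hashlib
from pathlib import Path


def cache_key(os_name: str = "Linux") -> str:
    def digest(name: str) -> str:
        # Assumes the file exists in the current directory.
        return hashlib.sha256(Path(name).read_bytes()).hexdigest()[:8]

    date = datetime.date.today().isoformat()  # e.g. "2023-07-01"
    return f"{os_name}-poetry-{digest('pyproject.toml')}-{digest('poetry.lock')}-{date}"
```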
diff --git a/.gitignore b/.gitignore
index 68bc17f9..c41065ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,4 +157,6 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file.  For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+.DS_Store
+```
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..b2dc714c
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "Auto-GPT"]
+	path = agent/Auto-GPT
+	url = https://github.com/Significant-Gravitas/Auto-GPT.git
+	branch = benchmark-integration
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index 2ff2acf3..0961dc0f 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -1,9 +1,10 @@
-import os
 import importlib
-import time
-from agbenchmark.mocks.MockManager import MockManager
-from multiprocessing import Process, Pipe
+from agbenchmark.mocks.MockManager import MockManager
+import os
+import sys
+import subprocess
+import time
 from dotenv import load_dotenv

 load_dotenv()
@@ -26,45 +27,44 @@ def run_agent(task, mock_func, config):
         timeout = config["cutoff"]
         print(f"Running Python function '{config['func_path']}' with timeout {timeout}")

-        parent_conn, child_conn = Pipe()
+        # Get the current working directory
+        cwd = os.getcwd()
+
+        # Add current directory to Python's import path
+        sys.path.append(cwd)
+
-        # Import the specific agent dynamically
         module_name = config["func_path"].replace("/", ".").rstrip(".py")
         module = importlib.import_module(module_name)
-        run_specific_agent = getattr(module, "run_specific_agent")

-        process = Process(target=run_specific_agent, args=(task, child_conn))
-        process.start()
+
+        command = [sys.executable, "benchmarks.py", str(task)]
+        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, cwd=cwd)
+
         start_time = time.time()
+        timeout = config["cutoff"]

         while True:
-            if (
-                parent_conn.poll()
-            ):  # Check if there's a new message from the child process
-                response, cycle_count = parent_conn.recv()
-                print(f"Cycle {cycle_count}: {response}")
+            output = process.stdout.readline()
+            print(output.strip())

-                if cycle_count >= config["cutoff"]:
-                    print(
-                        f"Cycle count has reached the limit of {config['cutoff']}. Terminating."
-                    )
-                    child_conn.send("terminate")
-                    break
-
-            if time.time() - start_time > timeout:
-                print(
-                    "The Python function has exceeded the time limit and was terminated."
-                )
-                child_conn.send(
-                    "terminate"
-                )  # Send a termination signal to the child process
-                break
-
-            if not process.is_alive():
+            # Check if process has ended
+            if process.poll() is not None:
                 print("The Python function has finished running.")
                 break

-        process.join()
+            # Check if process has exceeded timeout
+            if time.time() - start_time > timeout:
+                print("The Python function has exceeded the time limit and was terminated.")
+                process.terminate()
+                break
+
+            # Optional: sleep for a while
+            time.sleep(0.1)
+
+        # Wait for process to terminate, then get return code
+        process.wait()
+

 ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
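The rewritten run_agent no longer forks a multiprocessing Process and exchanges messages over a Pipe; it launches `python benchmarks.py <task>` as a subprocess in the agent's working directory and streams its stdout until the process exits or the cutoff elapses. A minimal sketch of the benchmarks.py contract this implies (the function name and body are assumptions; each agent ships its own implementation):

```python
# benchmarks.py -- minimal sketch of the entry point the subprocess runner
# above expects: the task arrives as the first CLI argument, and anything
# written to stdout is streamed back by run_agent. run_specific_agent is a
# hypothetical name carried over from the old interface; real agents differ.
import sys


def run_specific_agent(task: str) -> None:
    # Hypothetical: hand the task to the agent and print progress as it runs.
    print(f"Running agent on task: {task}")


if __name__ == "__main__":
    run_specific_agent(sys.argv[1])
```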
diff --git a/agbenchmark/config.json b/agbenchmark/config.json
index d9b42ca4..e1c5f154 100644
--- a/agbenchmark/config.json
+++ b/agbenchmark/config.json
@@ -1,5 +1,5 @@
 {
-  "workspace": "C:\\Users\\silen\\miniagi",
-  "func_path": "agent/benchmarks.py",
+  "workspace": "autogpt/workspace/auto_gpt_workspace",
+  "func_path": "benchmarks.py",
   "cutoff": 60
 }
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 0f1fc7bb..4284d1eb 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -1,15 +1,18 @@
 import json
 import os
+from pathlib import Path
+
 import pytest
 import shutil

 from agbenchmark.tests.regression.RegressionManager import RegressionManager
+from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH


 @pytest.fixture(scope="module")
 def config(request):
-    config_file = os.path.abspath("agbenchmark/config.json")
-    print(f"Config file: {config_file}")
-    with open(config_file, "r") as f:
+
+    print(f"Config file: {CONFIG_PATH}")
+    with open(CONFIG_PATH, "r") as f:
         config = json.load(f)

     if request.config.getoption("--mock"):
@@ -36,10 +39,7 @@ def workspace(config):
 def pytest_addoption(parser):
     parser.addoption("--mock", action="store_true", default=False)

-
-regression_json = "agbenchmark/tests/regression/regression_tests.json"
-
-regression_manager = RegressionManager(regression_json)
+regression_manager = RegressionManager(REGRESSION_TESTS_PATH)


 # this is to get the challenge_data from every test
@@ -53,13 +53,16 @@ def pytest_runtest_makereport(item, call):
         challenge_data = item.funcargs.get("challenge_data", None)
         difficulty = challenge_data.info.difficulty if challenge_data else "unknown"
         dependencies = challenge_data.dependencies if challenge_data else []
-
+        parts = item.nodeid.split("::")[0].split("/")
+        agbenchmark_index = parts.index("agbenchmark")
+        file_path = "/".join(parts[agbenchmark_index:])
         test_details = {
             "difficulty": difficulty,
             "dependencies": dependencies,
-            "test": item.nodeid,
+            "test": file_path,
         }
+        print("pytest_runtest_makereport", test_details)

         if call.excinfo is None:
             regression_manager.add_test(item.nodeid.split("::")[1], test_details)
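The nodeid trimming above exists because pytest may now be launched from outside the repo root (for example from agent/Auto-GPT in CI), in which case the nodeid carries leading path components; slicing from the "agbenchmark" segment keeps the stored regression paths repo-relative. A worked example (the leading "../.." in the nodeid is hypothetical):

```python
# Worked example of the nodeid-to-path conversion in pytest_runtest_makereport:
# whatever precedes "agbenchmark" is discarded, so regression_tests.json always
# stores repo-relative file paths regardless of where pytest was invoked.
nodeid = "../../agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method"
parts = nodeid.split("::")[0].split("/")
file_path = "/".join(parts[parts.index("agbenchmark"):])
assert file_path == "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
```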
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index fe395cd2..28b038e9 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -7,6 +7,13 @@ from dotenv import load_dotenv, set_key

 load_dotenv()

+CURRENT_DIRECTORY = Path(__file__).resolve().parent
+
+new_path = CURRENT_DIRECTORY / "config.json"
+
+CONFIG_PATH = str(new_path.resolve())
+
+REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json")

 @click.group()
 def cli():
@@ -15,16 +22,12 @@ def cli():

 @cli.command()
 @click.option("--category", default=None, help="Specific category to run")
-@click.option("--noreg", is_flag=True, help="Skip regression tests")
+@click.option("--reg", is_flag=True, help="Runs only regression tests")
 @click.option("--mock", is_flag=True, help="Run with mock")
-def start(category, noreg, mock):
+def start(category, reg, mock):
     """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
-    config_file = "agbenchmark/config.json"
-
-    config_dir = os.path.abspath(config_file)
-
     # Check if configuration file exists and is not empty
-    if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0:
+    if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
         config = {}

         config["workspace"] = click.prompt(
@@ -42,11 +45,11 @@ def start(category, noreg, mock):
             default="60",
         )

-        with open(config_dir, "w") as f:
+        with open(CONFIG_PATH, "w") as f:
             json.dump(config, f)
     else:
         # If the configuration file exists and is not empty, load it
-        with open(config_dir, "r") as f:
+        with open(CONFIG_PATH, "r") as f:
             config = json.load(f)

     set_key(".env", "MOCK_TEST", "True" if mock else "False")
@@ -58,11 +61,9 @@ def start(category, noreg, mock):
     if not os.path.exists(workspace_path):
         os.makedirs(workspace_path, exist_ok=True)

-    regression_path = os.path.abspath(
-        "agbenchmark/tests/regression/regression_tests.json"
-    )
-    if not os.path.exists(regression_path):
-        with open(regression_path, "a"):
+
+    if not os.path.exists(REGRESSION_TESTS_PATH):
+        with open(REGRESSION_TESTS_PATH, "a"):
             pass

     print("Current configuration:")
@@ -70,31 +71,40 @@ def start(category, noreg, mock):
         print(f"{key}: {value}")

     print("Starting benchmark tests...", category)
-    pytest_args = ["agbenchmark", "-vs"]
+    tests_to_run = []
+    pytest_args = ["-vs"]
     if category:
         pytest_args.extend(
             ["-m", category]
-        )  # run categorys that are of a specific marker
-        if noreg:
-            pytest_args.extend(
-                ["-k", "not regression"]
-            )  # run categorys that are of a specific marker but don't include regression categorys
-        print(f"Running {'non-regression' + category if noreg else category} categorys")
+        )
     else:
-        if noreg:
-            print("Running all non-regression categorys")
-            pytest_args.extend(
-                ["-k", "not regression"]
-            )  # run categorys that are not regression categorys
+        if reg:
+            print("Running all regression tests")
+            tests_to_run = get_regression_tests()
         else:
-            print("Running all categorys")  # run all categorys
+            print("Running all categories")

     if mock:
         pytest_args.append("--mock")

     # Run pytest with the constructed arguments
+    if not tests_to_run:
+        tests_to_run = [str(CURRENT_DIRECTORY)]
+    pytest_args.extend(tests_to_run)
     pytest.main(pytest_args)


+def get_regression_tests():
+    if not Path(REGRESSION_TESTS_PATH).exists():
+        with open(REGRESSION_TESTS_PATH, 'w') as file:
+            json.dump({}, file)
+
+    with open(REGRESSION_TESTS_PATH, 'r') as file:
+        data = json.load(file)
+
+    regression_tests = [str(CURRENT_DIRECTORY / ".." / value['test']) for key, value in data.items()]
+
+    return regression_tests
+

 if __name__ == "__main__":
     start()
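With --reg, get_regression_tests turns each stored repo-relative path into a pytest target anchored at the package's parent directory: CURRENT_DIRECTORY is the agbenchmark/ package itself, so "/.." climbs back to the repo root where the stored paths begin. A sketch of that path arithmetic, using a hypothetical install location:

```python
# Path arithmetic behind get_regression_tests(), with a hypothetical layout.
from pathlib import Path

CURRENT_DIRECTORY = Path("/repo/agbenchmark")  # hypothetical package location
entry = {"test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"}

# Path concatenation keeps the ".." literally; pytest resolves it at collection.
target = str(CURRENT_DIRECTORY / ".." / entry["test"])
print(target)
# -> /repo/agbenchmark/../agbenchmark/tests/basic_abilities/write_file/write_file_test.py
```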
diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
index 306375dd..8d3eb540 100644
--- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
+++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import pytest
 from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
 import os
@@ -9,10 +11,11 @@ class TestWriteFile(BasicChallenge):
     def get_file_path(self) -> str:  # all tests must implement this method
         return os.path.join(os.path.dirname(__file__), "w_file_data.json")

-    @pytest.mark.depends(on=[], name="basic_write_file")
     def test_method(self, config):
         self.setup_challenge(config)
-        files_contents = self.open_files(config["workspace"], self.data.ground.files)
+
+        workspace = Path(os.getcwd()) / config['workspace']
+        files_contents = self.open_files(workspace, self.data.ground.files)

         scores = []
         for file_content in files_contents:
diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json
deleted file mode 100644
index 9e26dfee..00000000
--- a/agbenchmark/tests/regression/regression_tests.json
+++ /dev/null
@@ -1 +0,0 @@
-{}
\ No newline at end of file
diff --git a/agent/Auto-GPT b/agent/Auto-GPT
new file mode 160000
index 00000000..c29ec925
--- /dev/null
+++ b/agent/Auto-GPT
@@ -0,0 +1 @@
+Subproject commit c29ec925fd9e24f219ef0f2884b08908cd66239b
diff --git a/agent/mini-agi b/agent/mini-agi
deleted file mode 160000
index d2add8f1..00000000
--- a/agent/mini-agi
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit d2add8f18caf96934a2d193583720cfc9b89451b
diff --git a/regression_tests.json b/regression_tests.json
new file mode 100644
index 00000000..e3633a2a
--- /dev/null
+++ b/regression_tests.json
@@ -0,0 +1,7 @@
+{
+    "TestWriteFile": {
+        "difficulty": "basic",
+        "dependencies": [],
+        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
+    }
+}
\ No newline at end of file
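The new top-level regression_tests.json above is the file the RegressionManager writes when a test passes. For reference, a minimal RegressionManager consistent with how conftest.py constructs and calls it (the real class lives in agbenchmark/tests/regression/RegressionManager.py and may differ in detail; this is a sketch, not the actual implementation):

```python
# Sketch of a RegressionManager compatible with the conftest.py calls above:
# constructed with the JSON path, add_test persists entries like the
# TestWriteFile record shown in regression_tests.json.
import json


class RegressionManager:
    def __init__(self, filename: str) -> None:
        self.filename = filename
        try:
            with open(filename) as f:
                self.tests = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            self.tests = {}  # a missing or empty file starts a fresh registry

    def add_test(self, test_name: str, test_details: dict) -> None:
        self.tests[test_name] = test_details
        self.save()

    def save(self) -> None:
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)
```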