From 8df82909b2938424d387cdaa817821adcbee1dac Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Mon, 10 Jul 2023 22:25:19 -0400
Subject: [PATCH] Added --test, consolidate files, reports working (#83)

---
 ...{RegressionManager.py => ReportManager.py} |  21 ++-
 agbenchmark/agent_interface.py                |  14 +-
 agbenchmark/challenges/define_task_types.py   |   6 +
 .../challenges/interface/search/data.json     |   2 +-
 agbenchmark/challenges/test_all.py            |   4 +-
 config.json => agbenchmark/config.json        |   3 +-
 agbenchmark/conftest.py                       |  19 ++-
 .../regression_tests.json                     | 125 ++++++++++--------
 agbenchmark/reports/1.json                    | 109 +++++++++++++++
 agbenchmark/start_benchmark.py                |  57 +++++---
 agbenchmark/utils.py                          |  16 +++
 agent/Auto-GPT                                |   2 +-
 agent/SuperAGI                                |   2 +-
 agent/config_example.json                     |   3 +-
 agent/gpt-engineer                            |   2 +-
 agent/mini-agi                                |   2 +-
 agent/smol-developer                          |   2 +-
 mypy.ini                                      |   2 +-
 18 files changed, 289 insertions(+), 102 deletions(-)
 rename agbenchmark/{RegressionManager.py => ReportManager.py} (75%)
 rename config.json => agbenchmark/config.json (53%)
 rename regression_tests.json => agbenchmark/regression_tests.json (62%)
 create mode 100644 agbenchmark/reports/1.json

diff --git a/agbenchmark/RegressionManager.py b/agbenchmark/ReportManager.py
similarity index 75%
rename from agbenchmark/RegressionManager.py
rename to agbenchmark/ReportManager.py
index ac9efc69..e6d8f62f 100644
--- a/agbenchmark/RegressionManager.py
+++ b/agbenchmark/ReportManager.py
@@ -1,12 +1,17 @@
 import json
-from typing import Union
+import os
+import sys
+import time
+from datetime import datetime
+from typing import Any, Dict, Union
 
 
-class RegressionManager:
+class ReportManager:
     """Abstracts interaction with the regression tests file"""
 
     def __init__(self, filename: str):
         self.filename = filename
+        self.start_time = time.time()
         self.load()
 
     def load(self) -> None:
@@ -40,6 +45,18 @@ class RegressionManager:
         del self.tests[test_name]
         self.save()
 
+    def end_info_report(self, config: Dict[str, Any]) -> None:
+        command = " ".join(sys.argv)
+        self.tests = {
+            "command": command.split(os.sep)[-1],
+            "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
+            "time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds",
+            "tests": self.tests,
+            "config": config,
+        }
+
+        self.save()
+
     def replace_backslash(self, value: str) -> Union[str, list[str], dict]:
         if isinstance(value, str):
             return value.replace("\\\\", "/")  # escape \ with \\
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index 1d43577c..d058ad4c 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -3,6 +3,7 @@ import shutil
 import subprocess
 import sys
 import time
+from pathlib import Path
 from typing import Any, Dict
 
 from dotenv import load_dotenv
@@ -21,6 +22,7 @@ def run_agent(
     """Calling to get a response"""
 
     if MOCK_FLAG:
+        print("IT'S A MOCK TEST", challenge_location)
         copy_artifacts_into_workspace(
             config["workspace"], "artifacts_out", challenge_location
         )
@@ -30,19 +32,13 @@ def run_agent(
         f"Running Python function '{config['entry_path']}' with timeout {timeout}"
     )
 
-    # Get the current working directory
-    cwd = os.path.join(os.getcwd(), config["home_path"])
-
-    # Add current directory to Python's import path
-    sys.path.append(cwd)
-
     command = [sys.executable, config["entry_path"], str(task)]
     process = subprocess.Popen(
         command,
         stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
         universal_newlines=True,
-        cwd=cwd,
+        cwd=os.getcwd(),
     )
 
     start_time = time.time()
@@ -79,7 +75,9 @@ def run_agent(
 def copy_artifacts_into_workspace(
     workspace: str, artifact_folder_name: str, challenge_dir_path: str
 ) -> None:
-    source_dir = os.path.join(challenge_dir_path, artifact_folder_name)
+    # this file is at agbenchmark\agent_interface.py
+    script_dir = Path(__file__).resolve().parent.parent
+    source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name)
 
     # Check if source_dir exists, if not then return immediately.
     if not os.path.exists(source_dir):
diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py
index 94cba5b7..f4e3f222 100644
--- a/agbenchmark/challenges/define_task_types.py
+++ b/agbenchmark/challenges/define_task_types.py
@@ -1,4 +1,5 @@
 import json
+from pathlib import Path
 from typing import List, Optional
 
 from pydantic import BaseModel
@@ -32,7 +33,12 @@ class ChallengeData(BaseModel):
 
     @staticmethod
     def deserialize(path: str) -> "ChallengeData":
+        # this script is in root/agbenchmark/challenges/define_task_types.py
+        script_dir = Path(__file__).resolve().parent.parent.parent
+        path = str(script_dir / path)
+
+        print("Deserializing", path)
         with open(path, "r") as file:
             data = json.load(file)
         return ChallengeData(**data)
diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json
index 17ee1ac1..f59b2dc9 100644
--- a/agbenchmark/challenges/interface/search/data.json
+++ b/agbenchmark/challenges/interface/search/data.json
@@ -2,7 +2,7 @@
   "name": "TestSearch",
   "category": ["interface"],
   "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file",
-  "dependencies": [],
+  "dependencies": ["TestWriteFile"],
   "ground": {
     "answer": "This is a Heading\nThis is a paragraph.",
     "should_contain": ["Heading", "paragraph"],
diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py
index 7dee0b2a..f8bb2347 100644
--- a/agbenchmark/challenges/test_all.py
+++ b/agbenchmark/challenges/test_all.py
@@ -19,7 +19,7 @@ load_dotenv()
 
 IMPROVE = os.getenv("IMPROVE", "False")
 
-json_files = glob.glob(f"{CURRENT_DIRECTORY}/challenges/**/data.json", recursive=True)
+json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)
 
 
 def get_test_path(json_file: str) -> str:
@@ -55,7 +55,7 @@ def generate_tests() -> None:
         )
         sys.path.append(str(custom_python_location))
 
-        for (module_loader, name, ispkg) in pkgutil.iter_modules(
+        for module_loader, name, ispkg in pkgutil.iter_modules(
             [str(custom_python_location)]
         ):
             module = importlib.import_module(name)
diff --git a/config.json b/agbenchmark/config.json
similarity index 53%
rename from config.json
rename to agbenchmark/config.json
index 8bbcebdb..9dd8b16a 100644
--- a/config.json
+++ b/agbenchmark/config.json
@@ -1,6 +1,5 @@
 {
   "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-  "entry_path": "benchmarks.py",
-  "home_path": "agent/mini-agi",
+  "entry_path": "agbenchmark/benchmarks.py",
   "cutoff": 60
 }
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index e321f5a2..87fdc9c1 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -6,9 +6,10 @@ from typing import Any, Dict, Generator
 
 import pytest
 
-from agbenchmark.RegressionManager import RegressionManager
+from agbenchmark.ReportManager import ReportManager
 from agbenchmark.start_benchmark import (
     CONFIG_PATH,
+    INFO_TESTS_PATH,
     REGRESSION_TESTS_PATH,
     get_regression_data,
 )
@@ -106,7 +107,8 @@ def challenge_data(request: Any) -> None:
     return request.param
 
 
-regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
+regression_manager = ReportManager(REGRESSION_TESTS_PATH)
+info_manager = ReportManager(INFO_TESTS_PATH)
 
 
 def pytest_runtest_makereport(item: Any, call: Any) -> None:
@@ -130,12 +132,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
     print("pytest_runtest_makereport", test_details)
     if call.excinfo is None:
         regression_manager.add_test(item.nodeid.split("::")[1], test_details)
+        test_details["success"] = True
     else:
         regression_manager.remove_test(item.nodeid.split("::")[1])
+        test_details["success"] = False
+        test_details["fail_reason"] = str(call.excinfo.value)
+
+    info_manager.add_test(item.nodeid.split("::")[1], test_details)
 
 
-def pytest_sessionfinish() -> None:
-    """Called at the end of the session to save regression tests"""
+def pytest_sessionfinish(session: Any) -> None:
+    """Called at the end of the session to save regression tests and info"""
+    with open(CONFIG_PATH, "r") as f:
+        config = json.load(f)
+
+    info_manager.end_info_report(config)
     regression_manager.save()
diff --git a/regression_tests.json b/agbenchmark/regression_tests.json
similarity index 62%
rename from regression_tests.json
rename to agbenchmark/regression_tests.json
index 0cf2d5f3..68632a12 100644
--- a/regression_tests.json
+++ b/agbenchmark/regression_tests.json
@@ -1,11 +1,20 @@
 {
+  "TestReadFile": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestWriteFile"
+    ],
+    "test": "agbenchmark/challenges/interface/read_file",
+    "success": true
+  },
   "TestBasicMemory": {
     "difficulty": "basic",
     "dependencies": [
       "TestReadFile",
       "TestWriteFile"
     ],
-    "test": "agbenchmark/challenges/memory/m1"
+    "test": "agbenchmark/challenges/memory/m1",
+    "success": true
   },
   "TestBasicRetrieval": {
     "difficulty": "basic",
@@ -13,12 +22,62 @@
       "TestWriteFile",
       "TestSearch"
     ],
-    "test": "agbenchmark/challenges/retrieval/r1"
+    "test": "agbenchmark/challenges/retrieval/r1",
+    "success": true
   },
-  "TestCreateSimpleWebServer": {
+  "TestRememberMultipleIds": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestBasicMemory"
+    ],
+    "test": "agbenchmark/challenges/memory/m2",
+    "success": true
+  },
+  "TestRetrieval2": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestBasicRetrieval"
+    ],
+    "test": "agbenchmark/challenges/retrieval/r2",
+    "success": true
+  },
+  "TestRememberMultipleIdsWithNoise": {
+    "difficulty": "medium",
+    "dependencies": [
+      "TestRememberMultipleIds"
+    ],
+    "test": "agbenchmark/challenges/memory/m3",
+    "success": true
+  },
+  "TestRetrieval3": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestRetrieval2"
+    ],
+    "test": "agbenchmark/challenges/retrieval/r3",
+    "success": true
+  },
+  "TestRememberMultiplePhrasesWithNoise": {
+    "difficulty": "medium",
+    "dependencies": [
+      "TestRememberMultipleIdsWithNoise"
+    ],
+    "test": "agbenchmark/challenges/memory/m4",
+    "success": true
+  },
+  "TestSearch": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestWriteFile"
+    ],
+    "test": "agbenchmark/challenges/interface/search",
+    "success": true
+  },
+  "TestWriteFile": {
     "difficulty": "basic",
     "dependencies": [],
-    "test": "agbenchmark/challenges/code/d3"
+    "test": "agbenchmark/challenges/interface/write_file",
+    "success": true
   },
   "TestDebugSimpleTypoWithGuidance": {
     "difficulty": "basic",
@@ -26,65 +85,15 @@
       "TestReadFile",
       "TestWriteFile"
     ],
-    "test": "agbenchmark/challenges/code/d1"
+    "test": "agbenchmark/challenges/code/d1",
+    "success": true
   },
   "TestDebugSimpleTypoWithoutGuidance": {
     "difficulty": "medium",
     "dependencies": [
       "TestDebugSimpleTypoWithGuidance"
     ],
-    "test": "agbenchmark/challenges/code/d2"
-  },
-  "TestReadFile": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestWriteFile"
-    ],
-    "test": "agbenchmark/challenges/interface/read_file"
-  },
-  "TestRememberMultipleIds": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestBasicMemory"
-    ],
-    "test": "agbenchmark/challenges/memory/m2"
-  },
-  "TestRememberMultipleIdsWithNoise": {
-    "difficulty": "medium",
-    "dependencies": [
-      "TestRememberMultipleIds"
-    ],
-    "test": "agbenchmark/challenges/memory/m3"
-  },
-  "TestRememberMultiplePhrasesWithNoise": {
-    "difficulty": "medium",
-    "dependencies": [
-      "TestRememberMultipleIdsWithNoise"
-    ],
-    "test": "agbenchmark/challenges/memory/m4"
-  },
-  "TestRetrieval2": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestBasicRetrieval"
-    ],
-    "test": "agbenchmark/challenges/retrieval/r2"
-  },
-  "TestRetrieval3": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestRetrieval2"
-    ],
-    "test": "agbenchmark/challenges/retrieval/r3"
-  },
-  "TestSearch": {
-    "difficulty": "basic",
-    "dependencies": [],
-    "test": "agbenchmark/challenges/interface/search"
-  },
-  "TestWriteFile": {
-    "difficulty": "basic",
-    "dependencies": [],
-    "test": "agbenchmark/challenges/interface/write_file"
+    "test": "agbenchmark/challenges/code/d2",
+    "success": true
   }
 }
\ No newline at end of file
diff --git a/agbenchmark/reports/1.json b/agbenchmark/reports/1.json
new file mode 100644
index 00000000..df07fb87
--- /dev/null
+++ b/agbenchmark/reports/1.json
@@ -0,0 +1,109 @@
+{
+  "command": "agbenchmark start --mock",
+  "completion_time": "2023-07-10-21:19",
+  "time_elapsed": "8.75 seconds",
+  "tests": {
+    "TestWriteFile": {
+      "difficulty": "basic",
+      "dependencies": [],
+      "test": "agbenchmark/challenges/interface/write_file",
+      "success": true
+    },
+    "TestReadFile": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestWriteFile"
+      ],
+      "test": "agbenchmark/challenges/interface/read_file",
+      "success": true
+    },
+    "TestSearch": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestWriteFile"
+      ],
+      "test": "agbenchmark/challenges/interface/search",
+      "success": true
+    },
+    "TestDebugSimpleTypoWithGuidance": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestReadFile",
+        "TestWriteFile"
+      ],
+      "test": "agbenchmark/challenges/code/d1",
+      "success": true
+    },
+    "TestBasicMemory": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestReadFile",
+        "TestWriteFile"
+      ],
+      "test": "agbenchmark/challenges/memory/m1",
+      "success": true
+    },
+    "TestBasicRetrieval": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestWriteFile",
+        "TestSearch"
+      ],
+      "test": "agbenchmark/challenges/retrieval/r1",
+      "success": true
+    },
+    "TestDebugSimpleTypoWithoutGuidance": {
+      "difficulty": "medium",
+      "dependencies": [
+        "TestDebugSimpleTypoWithGuidance"
+      ],
+      "test": "agbenchmark/challenges/code/d2",
+      "success": true
+    },
+    "TestRememberMultipleIds": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestBasicMemory"
+      ],
+      "test": "agbenchmark/challenges/memory/m2",
+      "success": true
+    },
+    "TestRetrieval2": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestBasicRetrieval"
+      ],
+      "test": "agbenchmark/challenges/retrieval/r2",
+      "success": true
+    },
+    "TestRememberMultipleIdsWithNoise": {
+      "difficulty": "medium",
+      "dependencies": [
+        "TestRememberMultipleIds"
+      ],
+      "test": "agbenchmark/challenges/memory/m3",
+      "success": true
+    },
+    "TestRetrieval3": {
+      "difficulty": "basic",
+      "dependencies": [
+        "TestRetrieval2"
+      ],
+      "test": "agbenchmark/challenges/retrieval/r3",
+      "success": true
+    },
+    "TestRememberMultiplePhrasesWithNoise": {
+      "difficulty": "medium",
+      "dependencies": [
+        "TestRememberMultipleIdsWithNoise"
+      ],
+      "test": "agbenchmark/challenges/memory/m4",
+      "success": true
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+    "entry_path": "agbenchmark/benchmarks.py",
+    "cutoff": 60
+  }
+}
\ No newline at end of file
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 68c7932b..917cd4e8 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -10,12 +10,16 @@ from dotenv import load_dotenv
 
 load_dotenv()
 
+from agbenchmark.utils import calculate_info_test_path
+
 CURRENT_DIRECTORY = Path(__file__).resolve().parent
+benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark"
 
-CONFIG_PATH = str(Path(os.getcwd()) / "config.json")
+CONFIG_PATH = str(benchmarks_folder_path / "config.json")
+REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json")
 
-REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json")
+INFO_TESTS_PATH = calculate_info_test_path(benchmarks_folder_path)
 
 
 @click.group()
 def cli() -> None:
@@ -25,10 +29,11 @@ def cli() -> None:
 
 @cli.command()
 @click.option("--category", default=None, help="Specific category to run")
+@click.option("--test", default=None, help="Specific test to run")
 @click.option("--maintain", is_flag=True, help="Runs only regression tests")
 @click.option("--improve", is_flag=True, help="Run only non-regression tests")
 @click.option("--mock", is_flag=True, help="Run with mock")
-def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
+def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) -> int:
     """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
     # Check if configuration file exists and is not empty
     if maintain and improve:
@@ -37,6 +42,16 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
         )
         return 1
 
+    if test and (category or maintain or improve):
+        print(
+            "Error: If you're running a specific test, make sure no other options are selected. Please pass only --test."
+        )
+        return 1
+
+    if not benchmarks_folder_path.exists():
+        benchmarks_folder_path.mkdir(exist_ok=True)
+
+    print(CONFIG_PATH, os.path.exists(CONFIG_PATH), os.stat(CONFIG_PATH).st_size if os.path.exists(CONFIG_PATH) else 0)
     if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
         config = {}
 
@@ -46,12 +61,12 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
         )
 
         config["entry_path"] = click.prompt(
-            "Please enter a the path to your run_specific_agent function implementation",
-            default="/benchmarks.py",
+            "Please enter the path to your run_specific_agent function implementation within the benchmarks folder",
+            default="benchmarks.py",
         )
 
         config["cutoff"] = click.prompt(
-            "Please enter a hard cutoff runtime for your agent",
+            "Please enter a hard cutoff runtime for your agent per test",
             default="60",
         )
 
@@ -65,7 +80,11 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
     os.environ["MOCK_TEST"] = "True" if mock else "False"
 
     if not os.path.exists(REGRESSION_TESTS_PATH):
-        with open(REGRESSION_TESTS_PATH, "a"):
+        with open(REGRESSION_TESTS_PATH, "w"):
+            pass
+
+    if not os.path.exists(INFO_TESTS_PATH):
+        with open(INFO_TESTS_PATH, "w"):
             pass
 
     print("Current configuration:")
@@ -73,18 +92,22 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
         print(f"{key}: {value}")
 
     pytest_args = ["-vs"]
-    if category:
-        pytest_args.extend(["-m", category])
-        print("Starting benchmark tests ", category)
+    if test:
+        print("Running specific test:", test)
+        pytest_args.extend(["-k", test])
     else:
-        print("Running all categories")
+        if category:
+            pytest_args.extend(["-m", category])
+            print("Running tests of category:", category)
+        else:
+            print("Running all categories")
 
-    if maintain:
-        print("Running only regression tests")
-        pytest_args.append("--maintain")
-    elif improve:
-        print("Running only non-regression tests")
-        pytest_args.append("--improve")
+        if maintain:
+            print("Running only regression tests")
+            pytest_args.append("--maintain")
+        elif improve:
+            print("Running only non-regression tests")
+            pytest_args.append("--improve")
 
     if mock:
         pytest_args.append("--mock")
diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py
index b05a7ac3..ffde0c6d 100644
--- a/agbenchmark/utils.py
+++ b/agbenchmark/utils.py
@@ -1 +1,17 @@
 # radio charts, logs, helper functions for tests, anything else relevant.
+import glob
+from pathlib import Path
+
+
+def calculate_info_test_path(benchmarks_folder_path: Path) -> str:
+    INFO_TESTS_PATH = benchmarks_folder_path / "reports"
+
+    if not INFO_TESTS_PATH.exists():
+        INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True)
+        return str(INFO_TESTS_PATH / "1.json")
+    else:
+        json_files = glob.glob(str(INFO_TESTS_PATH / "*.json"))
+        file_count = len(json_files)
+        run_name = f"{file_count + 1}.json"
+        new_file_path = INFO_TESTS_PATH / run_name
+        return str(new_file_path)
diff --git a/agent/Auto-GPT b/agent/Auto-GPT
index f360d503..dc2a7699 160000
--- a/agent/Auto-GPT
+++ b/agent/Auto-GPT
@@ -1 +1 @@
-Subproject commit f360d503b113119f6b3ce0acff1dbb4dfae2223a
+Subproject commit dc2a76990c75fafacbeaa76eb2e27d48de44cadd
diff --git a/agent/SuperAGI b/agent/SuperAGI
index 7ab2994d..a28224d8 160000
--- a/agent/SuperAGI
+++ b/agent/SuperAGI
@@ -1 +1 @@
-Subproject commit 7ab2994d4b44fa008f9ac27b196f134d27878916
+Subproject commit a28224d82572b598ccee1057086fabaf33e1aaa9
diff --git a/agent/config_example.json b/agent/config_example.json
index ba2ec0b8..7ab65bc2 100644
--- a/agent/config_example.json
+++ b/agent/config_example.json
@@ -1,6 +1,5 @@
 {
   "workspace": "projects/my-new-project/workspace",
-  "entry_path": "benchmarks.py",
-  "home_path": "",
+  "entry_path": "agbenchmark/benchmarks.py",
   "cutoff": 60
 }
diff --git a/agent/gpt-engineer b/agent/gpt-engineer
index 4af8c137..cde9be3e 160000
--- a/agent/gpt-engineer
+++ b/agent/gpt-engineer
@@ -1 +1 @@
-Subproject commit 4af8c137e82cc51fdd31c23327ceffd64194b984
+Subproject commit cde9be3e73212b3d8366a4ed149a18122bfe2333
diff --git a/agent/mini-agi b/agent/mini-agi
index 4af8a7e6..ad2b3450 160000
--- a/agent/mini-agi
+++ b/agent/mini-agi
@@ -1 +1 @@
-Subproject commit 4af8a7e6085f0518f06180fbf87024a2c9db4c88
+Subproject commit ad2b345050e07efb7ad0bde68c93bc2b4e2d7a92
diff --git a/agent/smol-developer b/agent/smol-developer
index a1e4a9ff..c52b14b1 160000
--- a/agent/smol-developer
+++ b/agent/smol-developer
@@ -1 +1 @@
-Subproject commit a1e4a9ff3a75909c4a892e409a55f86a2c57b7c6
+Subproject commit c52b14b1d5b1b74d886f08d9914e7f43437f609d
diff --git a/mypy.ini b/mypy.ini
index 764c239f..d35c6962 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -15,5 +15,5 @@ ignore_errors = True
 [mypy-agbenchmark.mocks.tests.basic_mocks.*]
 ignore_errors = True
 
-[mypy-agbenchmark.tests.regression.RegressionManager.*]
+[mypy-agbenchmark.tests.regression.ReportManager.*]
 ignore_errors = True