Added --test, consolidate files, reports working (#83)

Silen Naihin
2023-07-10 22:25:19 -04:00
committed by GitHub
parent 437e066a66
commit 8df82909b2
18 changed files with 289 additions and 102 deletions

View File

@@ -1,12 +1,17 @@
import json
from typing import Union
import os
import sys
import time
from datetime import datetime
from typing import Any, Dict, Union
class RegressionManager:
class ReportManager:
"""Abstracts interaction with the regression tests file"""
def __init__(self, filename: str):
self.filename = filename
self.start_time = time.time()
self.load()
def load(self) -> None:
@@ -40,6 +45,18 @@ class RegressionManager:
del self.tests[test_name]
self.save()
def end_info_report(self, config: Dict[str, Any]) -> None:
command = " ".join(sys.argv)
self.tests = {
"command": command.split(os.sep)[-1],
"completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
"time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds",
"tests": self.tests,
"config": config,
}
self.save()
def replace_backslash(self, value: str) -> Union[str, list[str], dict]:
if isinstance(value, str):
return value.replace("\\\\", "/")  # replace escaped backslashes with forward slashes
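The hunk above renames RegressionManager to ReportManager and adds end_info_report, which wraps the per-test results in run metadata before the final save. A minimal standalone sketch of that behaviour follows; the class name is a stand-in, and the save() and add_test() bodies are assumptions about the parts of the file the hunk elides (writing self.tests as JSON to self.filename).

import json
import os
import sys
import time
from datetime import datetime
from typing import Any, Dict

class ReportManagerSketch:
    """Stand-in for ReportManager; save() is assumed to dump self.tests to disk."""

    def __init__(self, filename: str) -> None:
        self.filename = filename
        self.start_time = time.time()
        self.tests: Dict[str, Any] = {}

    def save(self) -> None:
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)

    def add_test(self, test_name: str, details: Dict[str, Any]) -> None:
        self.tests[test_name] = details
        self.save()

    def end_info_report(self, config: Dict[str, Any]) -> None:
        # Wrap the collected results in run metadata, mirroring the diff above.
        command = " ".join(sys.argv)
        self.tests = {
            "command": command.split(os.sep)[-1],  # drop any leading path from argv[0]
            "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
            "time_elapsed": f"{round(time.time() - self.start_time, 2)} seconds",
            "tests": self.tests,
            "config": config,
        }
        self.save()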

View File

@@ -3,6 +3,7 @@ import shutil
import subprocess
import sys
import time
from pathlib import Path
from typing import Any, Dict
from dotenv import load_dotenv
@@ -21,6 +22,7 @@ def run_agent(
"""Calling to get a response"""
if MOCK_FLAG:
print("ITS A MOCK TEST", challenge_location)
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
@@ -30,19 +32,13 @@ def run_agent(
f"Running Python function '{config['entry_path']}' with timeout {timeout}"
)
# Get the current working directory
cwd = os.path.join(os.getcwd(), config["home_path"])
# Add current directory to Python's import path
sys.path.append(cwd)
command = [sys.executable, config["entry_path"], str(task)]
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
cwd=cwd,
cwd=os.getcwd(),
)
start_time = time.time()
@@ -79,7 +75,9 @@ def run_agent(
def copy_artifacts_into_workspace(
workspace: str, artifact_folder_name: str, challenge_dir_path: str
) -> None:
source_dir = os.path.join(challenge_dir_path, artifact_folder_name)
# this file is at agbenchmark/agent_interface.py
script_dir = Path(__file__).resolve().parent.parent
source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name)
# Check if source_dir exists, if not then return immediately.
if not os.path.exists(source_dir):
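Two path changes are visible above: the agent subprocess now runs from os.getcwd() instead of a computed home_path, and artifacts are located relative to this module rather than the working directory. A simplified sketch of the artifact copy, with names mirroring the diff; the copy loop is an assumption about the body the hunk cuts off.

import os
import shutil
from pathlib import Path

def copy_artifacts_into_workspace(
    workspace: str, artifact_folder_name: str, challenge_dir_path: str
) -> None:
    # Resolve the artifact folder relative to the package, not the cwd.
    script_dir = Path(__file__).resolve().parent.parent
    source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name)

    # Nothing to copy if this challenge ships no artifacts of that kind.
    if not os.path.exists(source_dir):
        return

    # Assumed continuation: copy every artifact file into the agent's workspace.
    for file_name in os.listdir(source_dir):
        source_path = os.path.join(source_dir, file_name)
        if os.path.isfile(source_path):
            shutil.copy(source_path, workspace)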

View File

@@ -1,4 +1,5 @@
import json
from pathlib import Path
from typing import List, Optional
from pydantic import BaseModel
@@ -32,7 +33,12 @@ class ChallengeData(BaseModel):
@staticmethod
def deserialize(path: str) -> "ChallengeData":
# this script is in root/agbenchmark/challenges/define_task_types.py
script_dir = Path(__file__).resolve().parent.parent.parent
path = str(script_dir / path)
print("Deserializing", path)
with open(path, "r") as file:
data = json.load(file)
return ChallengeData(**data)
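deserialize now resolves the challenge's data.json path against the package root (three parents up from define_task_types.py) instead of trusting the caller's working directory. A minimal stand-in follows; the single name field is a placeholder, since the real model's fields are not shown in full here.

import json
from pathlib import Path

from pydantic import BaseModel

class ChallengeDataSketch(BaseModel):
    # Placeholder field; the real ChallengeData defines task, ground, dependencies, etc.
    name: str

    @staticmethod
    def deserialize(path: str) -> "ChallengeDataSketch":
        # define_task_types.py lives at root/agbenchmark/challenges/, so three
        # .parent hops reach the root that relative challenge paths start from.
        script_dir = Path(__file__).resolve().parent.parent.parent
        full_path = script_dir / path
        with open(full_path, "r") as file:
            data = json.load(file)
        return ChallengeDataSketch(**data)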

View File

@@ -2,7 +2,7 @@
"name": "TestSearch",
"category": ["interface"],
"task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file",
"dependencies": [],
"dependencies": ["TestWriteFile"],
"ground": {
"answer": "This is a Heading\nThis is a paragraph.",
"should_contain": ["Heading", "paragraph"],

View File

@@ -19,7 +19,7 @@ load_dotenv()
IMPROVE = os.getenv("IMPROVE", "False")
json_files = glob.glob(f"{CURRENT_DIRECTORY}/challenges/**/data.json", recursive=True)
json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)
def get_test_path(json_file: str) -> str:
@@ -55,7 +55,7 @@ def generate_tests() -> None:
)
sys.path.append(str(custom_python_location))
for (module_loader, name, ispkg) in pkgutil.iter_modules(
for module_loader, name, ispkg in pkgutil.iter_modules(
[str(custom_python_location)]
):
module = importlib.import_module(name)
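This hunk widens the data.json glob to the whole package and tidies the pkgutil loop. A compact sketch of the discovery pattern it relies on; the glob pattern, sys.path.append and pkgutil/importlib calls come from the lines above, while the helper function name and surrounding wiring are illustrative.

import glob
import importlib
import pkgutil
import sys
from pathlib import Path

CURRENT_DIRECTORY = Path(__file__).resolve().parent

# Find every challenge definition anywhere under the package.
json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)

def load_custom_python(custom_python_location: Path) -> None:
    # Import any challenge-specific helper modules that sit next to a data.json.
    sys.path.append(str(custom_python_location))
    for module_loader, name, ispkg in pkgutil.iter_modules([str(custom_python_location)]):
        importlib.import_module(name)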

View File

@@ -1,6 +1,5 @@
{
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "benchmarks.py",
"home_path": "agent/mini-agi",
"entry_path": "agbenchmark/benchmarks.py",
"cutoff": 60
}

View File

@@ -6,9 +6,10 @@ from typing import Any, Dict, Generator
import pytest
from agbenchmark.RegressionManager import RegressionManager
from agbenchmark.ReportManager import ReportManager
from agbenchmark.start_benchmark import (
CONFIG_PATH,
INFO_TESTS_PATH,
REGRESSION_TESTS_PATH,
get_regression_data,
)
@@ -106,7 +107,8 @@ def challenge_data(request: Any) -> None:
return request.param
regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
regression_manager = ReportManager(REGRESSION_TESTS_PATH)
info_manager = ReportManager(INFO_TESTS_PATH)
def pytest_runtest_makereport(item: Any, call: Any) -> None:
@@ -130,12 +132,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
print("pytest_runtest_makereport", test_details)
if call.excinfo is None:
regression_manager.add_test(item.nodeid.split("::")[1], test_details)
test_details["success"] = True
else:
regression_manager.remove_test(item.nodeid.split("::")[1])
test_details["success"] = False
test_details["fail_reason"] = str(call.excinfo.value)
info_manager.add_test(item.nodeid.split("::")[1], test_details)
def pytest_sessionfinish() -> None:
"""Called at the end of the session to save regression tests"""
def pytest_sessionfinish(session: Any) -> None:
"""Called at the end of the session to save regression tests and info"""
with open(CONFIG_PATH, "r") as f:
config = json.load(f)
info_manager.end_info_report(config)
regression_manager.save()
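The conftest changes split reporting in two: the regression file keeps only passing tests, while the new info report records every outcome (with a fail_reason on failures) and is finalised with the loaded config when the session ends. A condensed sketch of that flow, reusing the imports shown above; how test_details is assembled earlier in the hook is abbreviated, and the call-phase guard is a common pattern rather than a line from the diff.

import json
from typing import Any

from agbenchmark.ReportManager import ReportManager
from agbenchmark.start_benchmark import (
    CONFIG_PATH,
    INFO_TESTS_PATH,
    REGRESSION_TESTS_PATH,
)

regression_manager = ReportManager(REGRESSION_TESTS_PATH)
info_manager = ReportManager(INFO_TESTS_PATH)

def pytest_runtest_makereport(item: Any, call: Any) -> None:
    if call.when != "call":  # only record the actual test call phase
        return
    test_name = item.nodeid.split("::")[1]
    test_details: dict = {}  # simplified; the real hook builds this from the challenge data
    if call.excinfo is None:
        regression_manager.add_test(test_name, test_details)
        test_details["success"] = True
    else:
        regression_manager.remove_test(test_name)
        test_details["success"] = False
        test_details["fail_reason"] = str(call.excinfo.value)
    info_manager.add_test(test_name, test_details)

def pytest_sessionfinish(session: Any) -> None:
    """Save the regression file and finalise the info report with the run config."""
    with open(CONFIG_PATH, "r") as f:
        config = json.load(f)
    info_manager.end_info_report(config)
    regression_manager.save()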

View File

@@ -1,11 +1,20 @@
{
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/read_file",
"success": true
},
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/memory/m1"
"test": "agbenchmark/challenges/memory/m1",
"success": true
},
"TestBasicRetrieval": {
"difficulty": "basic",
@@ -13,12 +22,62 @@
"TestWriteFile",
"TestSearch"
],
"test": "agbenchmark/challenges/retrieval/r1"
"test": "agbenchmark/challenges/retrieval/r1",
"success": true
},
"TestCreateSimpleWebServer": {
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [
"TestBasicMemory"
],
"test": "agbenchmark/challenges/memory/m2",
"success": true
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestBasicRetrieval"
],
"test": "agbenchmark/challenges/retrieval/r2",
"success": true
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3",
"success": true
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"test": "agbenchmark/challenges/retrieval/r3",
"success": true
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/memory/m4",
"success": true
},
"TestSearch": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/search",
"success": true
},
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/code/d3"
"test": "agbenchmark/challenges/interface/write_file",
"success": true
},
"TestDebugSimpleTypoWithGuidance": {
"difficulty": "basic",
@@ -26,65 +85,15 @@
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/code/d1"
"test": "agbenchmark/challenges/code/d1",
"success": true
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "medium",
"dependencies": [
"TestDebugSimpleTypoWithGuidance"
],
"test": "agbenchmark/challenges/code/d2"
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/read_file"
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [
"TestBasicMemory"
],
"test": "agbenchmark/challenges/memory/m2"
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/memory/m4"
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestBasicRetrieval"
],
"test": "agbenchmark/challenges/retrieval/r2"
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"test": "agbenchmark/challenges/retrieval/r3"
},
"TestSearch": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/search"
},
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/write_file"
"test": "agbenchmark/challenges/code/d2",
"success": true
}
}

agbenchmark/reports/1.json (new file, 109 lines)
View File

@@ -0,0 +1,109 @@
{
"command": "agbenchmark start --mock",
"completion_time": "2023-07-10-21:19",
"time_elapsed": "8.75 seconds",
"tests": {
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/write_file",
"success": true
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/read_file",
"success": true
},
"TestSearch": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/search",
"success": true
},
"TestDebugSimpleTypoWithGuidance": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/code/d1",
"success": true
},
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/memory/m1",
"success": true
},
"TestBasicRetrieval": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile",
"TestSearch"
],
"test": "agbenchmark/challenges/retrieval/r1",
"success": true
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "medium",
"dependencies": [
"TestDebugSimpleTypoWithGuidance"
],
"test": "agbenchmark/challenges/code/d2",
"success": true
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [
"TestBasicMemory"
],
"test": "agbenchmark/challenges/memory/m2",
"success": true
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestBasicRetrieval"
],
"test": "agbenchmark/challenges/retrieval/r2",
"success": true
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3",
"success": true
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"test": "agbenchmark/challenges/retrieval/r3",
"success": true
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/memory/m4",
"success": true
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "agbenchmark/benchmarks.py",
"cutoff": 60
}
}

View File

@@ -10,12 +10,16 @@ from dotenv import load_dotenv
load_dotenv()
from agbenchmark.utils import calculate_info_test_path
CURRENT_DIRECTORY = Path(__file__).resolve().parent
benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark"
CONFIG_PATH = str(Path(os.getcwd()) / "config.json")
CONFIG_PATH = str(benchmarks_folder_path / "config.json")
REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json")
REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json")
INFO_TESTS_PATH = calculate_info_test_path(benchmarks_folder_path)
@click.group()
@@ -25,10 +29,11 @@ def cli() -> None:
@cli.command()
@click.option("--category", default=None, help="Specific category to run")
@click.option("--test", default=None, help="Specific test to run")
@click.option("--maintain", is_flag=True, help="Runs only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option("--mock", is_flag=True, help="Run with mock")
def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) -> int:
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
# Check if configuration file exists and is not empty
if maintain and improve:
@@ -37,6 +42,16 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
)
return 1
if test and (category or maintain or improve):
print(
"Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
)
return 1
if not benchmarks_folder_path.exists():
benchmarks_folder_path.mkdir(exist_ok=True)
print(CONFIG_PATH, os.path.exists(CONFIG_PATH), os.stat(CONFIG_PATH).st_size)
if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
config = {}
@@ -46,12 +61,12 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
)
config["entry_path"] = click.prompt(
"Please enter a the path to your run_specific_agent function implementation",
default="/benchmarks.py",
"Please enter a the path to your run_specific_agent function implementation within the benchmarks folder",
default="benchmarks.py",
)
config["cutoff"] = click.prompt(
"Please enter a hard cutoff runtime for your agent",
"Please enter a hard cutoff runtime for your agent per test",
default="60",
)
@@ -65,7 +80,11 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
os.environ["MOCK_TEST"] = "True" if mock else "False"
if not os.path.exists(REGRESSION_TESTS_PATH):
with open(REGRESSION_TESTS_PATH, "a"):
with open(REGRESSION_TESTS_PATH, "w"):
pass
if not os.path.exists(INFO_TESTS_PATH):
with open(INFO_TESTS_PATH, "w"):
pass
print("Current configuration:")
@@ -73,18 +92,22 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
print(f"{key}: {value}")
pytest_args = ["-vs"]
if category:
pytest_args.extend(["-m", category])
print("Starting benchmark tests ", category)
if test:
print("Running specific test:", test)
pytest_args.extend(["-k", test])
else:
print("Running all categories")
if category:
pytest_args.extend(["-m", category])
print("Running tests of category:", category)
else:
print("Running all categories")
if maintain:
print("Running only regression tests")
pytest_args.append("--maintain")
elif improve:
print("Running only non-regression tests")
pytest_args.append("--improve")
if maintain:
print("Running only regression tests")
pytest_args.append("--maintain")
elif improve:
print("Running only non-regression tests")
pytest_args.append("--improve")
if mock:
pytest_args.append("--mock")

View File

@@ -1 +1,17 @@
# radio charts, logs, helper functions for tests, anything else relevant.
import glob
from pathlib import Path
def calculate_info_test_path(benchmarks_folder_path: Path) -> str:
INFO_TESTS_PATH = benchmarks_folder_path / "reports"
if not INFO_TESTS_PATH.exists():
INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True)
return str(INFO_TESTS_PATH / "1.json")
else:
json_files = glob.glob(str(INFO_TESTS_PATH / "*.json"))
file_count = len(json_files)
run_name = f"{file_count + 1}.json"
new_file_path = INFO_TESTS_PATH / run_name
return str(new_file_path)
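calculate_info_test_path counts the JSON reports already in agbenchmark/reports and names the next run after that count, which is where files such as reports/1.json above come from. Hypothetical usage, mirroring how start_benchmark.py calls it:

import os
from pathlib import Path

from agbenchmark.utils import calculate_info_test_path

benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark"
INFO_TESTS_PATH = calculate_info_test_path(benchmarks_folder_path)
print(INFO_TESTS_PATH)  # e.g. ".../agbenchmark/reports/2.json" on the second run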

View File

@@ -1,6 +1,5 @@
{
"workspace": "projects/my-new-project/workspace",
"entry_path": "benchmarks.py",
"home_path": "",
"entry_path": "agbenchmark/benchmarks.py",
"cutoff": 60
}

View File

@@ -15,5 +15,5 @@ ignore_errors = True
[mypy-agbenchmark.mocks.tests.basic_mocks.*]
ignore_errors = True
[mypy-agbenchmark.tests.regression.RegressionManager.*]
[mypy-agbenchmark.tests.regression.ReportManager.*]
ignore_errors = True