diff --git a/agbenchmark/ReportManager.py b/agbenchmark/ReportManager.py index cae13595..202574f9 100644 --- a/agbenchmark/ReportManager.py +++ b/agbenchmark/ReportManager.py @@ -3,7 +3,7 @@ import os import sys import time from datetime import datetime -from typing import Any, Dict +from typing import Any, Dict, Optional from agbenchmark.utils import get_highest_success_difficulty @@ -37,8 +37,18 @@ class ReportManager: with open(self.filename, "w") as f: json.dump(self.tests, f, indent=4) - def add_test(self, test_name: str, test_details: dict | list) -> None: - self.tests[test_name] = test_details + def add_test( + self, + test_name: str, + test_details: dict | list, + agent_name: Optional[str] = None, + ) -> None: + if agent_name: + if agent_name not in self.tests: + self.tests[agent_name] = {} + self.tests[agent_name][test_name] = test_details + else: + self.tests[test_name] = test_details self.save() diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 245df485..4a62af0b 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -15,7 +15,7 @@ from agbenchmark.start_benchmark import ( REGRESSION_TESTS_PATH, get_regression_data, ) -from agbenchmark.utils import calculate_success_percentage +from agbenchmark.utils import AGENT_NAME, calculate_success_percentage def resolve_workspace(workspace: str) -> str: @@ -128,9 +128,10 @@ regression_manager = ReportManager(REGRESSION_TESTS_PATH) # user facing reporting information info_manager = ReportManager(INFO_TESTS_PATH) -INTERNAL_LOGS = Path(__file__).resolve().parent # agbenchmark/conftest.py +INTERNAL_LOGS_PATH = Path(__file__).resolve().parent / "reports" + # internal db step in replacement track pass/fail rate -internal_info = ReportManager(str(INTERNAL_LOGS / "internal_info.json")) +internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json")) def pytest_runtest_makereport(item: Any, call: Any) -> None: @@ -171,11 +172,22 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: regression_manager.remove_test(test_name) info_details["metrics"]["fail_reason"] = str(call.excinfo.value) - prev_test_results: list[bool] = internal_info.tests.get(test_name, []) + prev_test_results: list[bool] + agent_tests: dict[str, list[bool]] = {} + + # if the structure is nested inside of the agent name + if AGENT_NAME: + agent_tests = internal_info.tests.get(AGENT_NAME, {}) + + if agent_tests: + prev_test_results = agent_tests.get(test_name, []) + else: + prev_test_results = internal_info.tests.get(test_name, []) + if not mock: # only add if it's an actual test prev_test_results.append(info_details["metrics"]["success"]) - internal_info.add_test(test_name, prev_test_results) + internal_info.add_test(test_name, prev_test_results, AGENT_NAME) # can calculate success rate regardless of mock info_details["metrics"]["success_%"] = calculate_success_percentage( diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json deleted file mode 100644 index 0e34ad7a..00000000 --- a/agbenchmark/internal_info.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "TestBasicMemory": [ - true, - true, - true - ], - "TestBasicRetrieval": [ - true, - true, - true - ], - "TestCreateSimpleWebServer": [ - false, - false, - false - ], - "TestDebugSimpleTypoWithGuidance": [ - false, - false, - false, - false, - false - ], - "TestDebugSimpleTypoWithoutGuidance": [ - false, - false, - false - ], - "TestReadFile": [ - true, - true, - true, - true - ], - "TestRememberMultipleIds": [ - true, - true, - true - ], - "TestRememberMultipleIdsWithNoise": [ - true, - true, - true - ], - "TestRememberMultiplePhrasesWithNoise": [ - true, - true, - true - ], - "TestRetrieval2": [ - true, - true, - true - ], - "TestRetrieval3": [ - true, - true, - true - ], - "TestSearch": [ - true, - true, - true, - true - ], - "TestWriteFile": [ - true, - true, - true, - false, - false, - false, - false, - true, - false, - true, - false, - false, - false, - false, - true - ] -} \ No newline at end of file diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json new file mode 100644 index 00000000..97b525c0 --- /dev/null +++ b/agbenchmark/reports/internal_info.json @@ -0,0 +1,40 @@ +{ + "mini-agi": { + "TestBasicMemory": [true, true, true], + "TestBasicRetrieval": [true, true, true], + "TestCreateSimpleWebServer": [false, false, false], + "TestDebugSimpleTypoWithGuidance": [ + false, + false, + false, + false, + false, + false + ], + "TestDebugSimpleTypoWithoutGuidance": [false, false, false], + "TestReadFile": [true, true, true, true], + "TestRememberMultipleIds": [true, true, true], + "TestRememberMultipleIdsWithNoise": [true, true, true], + "TestRememberMultiplePhrasesWithNoise": [true, true, true], + "TestRetrieval2": [true, true, true], + "TestRetrieval3": [true, true, true], + "TestSearch": [true, true, true, true], + "TestWriteFile": [ + true, + true, + true, + false, + false, + false, + false, + true, + false, + true, + false, + false, + false, + false, + true + ] + } +} diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index c69509c7..e99a1fa0 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -17,7 +17,6 @@ HOME_ENV = os.getenv("HOME_ENV") def calculate_info_test_path(reports_path: Path) -> str: - print("reports_pathreports_pathreports_pathreports_path", reports_path) if not reports_path.exists(): reports_path.mkdir(parents=True, exist_ok=True) return str( @@ -129,6 +128,7 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]: CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths( benchmarks_folder_path ) + else: # otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo) # used when its just a pip install @@ -139,4 +139,9 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]: if not benchmarks_folder_path.exists(): benchmarks_folder_path.mkdir(exist_ok=True) - return HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH + return ( + HOME_DIRECTORY, + CONFIG_PATH, + REGRESSION_TESTS_PATH, + INFO_TESTS_PATH, + )