diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py
index 45becaf7..489d298f 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_test.py
+++ b/agbenchmark/challenges/retrieval/r1/r1_test.py
@@ -17,7 +17,12 @@ class TestRetrieval1(RetrievalChallenge):
         [(data.task, data.mock_func)],
         indirect=True,
     )
-    def test_retrieval(self, request, workspace):
+    @pytest.mark.parametrize(
+        "regression_data",
+        [data],
+        indirect=True,
+    )
+    def test_retrieval(self, workspace, regression_data):
         file = self.open_file(workspace, data.ground.files[0])
 
         score = self.scoring(file, data.ground)
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 434f6dbd..78114c20 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -6,6 +6,7 @@ from agbenchmark.tests.regression.RegressionManager import RegressionManager
 import requests
 from requests.exceptions import RequestException
 from agbenchmark.mocks.MockManager import MockManager
+from agbenchmark.challenges.define_task_types import ChallengeData
 
 
 @pytest.fixture(scope="module")
@@ -64,21 +65,34 @@ def server_response(request, config):
     # print(f"Request succeeded with status code {response.status_code}")
 
 
-regression_txt = "agbenchmark/tests/regression/regression_tests.txt"
+regression_json = "agbenchmark/tests/regression/regression_tests.json"
 
-regression_manager = RegressionManager(regression_txt)
+regression_manager = RegressionManager(regression_json)
+
+
+# this is to get the challenge_data from every test
+@pytest.fixture(autouse=True)
+def regression_data(request):
+    return getattr(request, "param", None)
 
 
 def pytest_runtest_makereport(item, call):
-    """Called for each test report. Generated for each stage
-    of a test run (setup, call, teardown)."""
     if call.when == "call":
-        if (
-            call.excinfo is None
-        ):  # if no error in the call stage, add it as a regression test
-            regression_manager.add_test(item.nodeid)
-        else:  # otherwise, :(
-            regression_manager.remove_test(item.nodeid)
+        challenge_data = item.funcargs.get("regression_data", None)
+        difficulty = challenge_data.info.difficulty if challenge_data else "unknown"
+        dependencies = challenge_data.dependencies if challenge_data else []
+
+        test_details = {
+            "difficulty": difficulty,
+            "dependencies": dependencies,
+            "test": item.nodeid,
+        }
+
+        print("pytest_runtest_makereport", test_details)
+        if call.excinfo is None:
+            regression_manager.add_test(item.nodeid.split("::")[1], test_details)
+        else:
+            regression_manager.remove_test(item.nodeid.split("::")[1])
 
 
 def pytest_collection_modifyitems(items):
@@ -86,7 +100,7 @@
     to add regression marker to collected test items."""
     for item in items:
         print("pytest_collection_modifyitems", item.nodeid)
-        if item.nodeid + "\n" in regression_manager.tests:
+        if item.nodeid.split("::")[1] in regression_manager.tests:
             print(regression_manager.tests)
             item.add_marker(pytest.mark.regression)
 
diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
index 494a9b07..7d14228c 100644
--- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
+++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
@@ -25,6 +25,11 @@ class TestReadFile(BasicChallenge):
         [(data.task, data.mock_func)],
         indirect=True,
     )
+    @pytest.mark.parametrize(
+        "regression_data",
+        [data],
+        indirect=True,
+    )
     @pytest.mark.depends(on=data.dependencies)
     def test_read_file(self, workspace):
         file = self.open_file(workspace, data.ground.files[0])
diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json
index 562d1c36..1d262108 100644
--- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json
+++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json
@@ -10,7 +10,7 @@
     },
     "mock_func": "basic_write_file_mock",
     "info": {
-        "difficulty": "easy",
+        "difficulty": "basic",
         "description": "Tests the writing to file",
         "side_effects": ["tests if there is in fact an LLM attached"]
     }
diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
index 0a4ef4a2..33012889 100644
--- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
+++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
@@ -16,6 +16,11 @@ class TestWriteFile(BasicChallenge):
         [(data.task, data.mock_func)],
         indirect=True,
     )
+    @pytest.mark.parametrize(
+        "regression_data",
+        [data],
+        indirect=True,
+    )
     @pytest.mark.depends(name="test_write_file")
     def test_write_file(self, workspace):
         file = self.open_file(workspace, data.ground.files[0])
diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py
index 9117d53f..a1379eca 100644
--- a/agbenchmark/tests/regression/RegressionManager.py
+++ b/agbenchmark/tests/regression/RegressionManager.py
@@ -1,3 +1,6 @@
+import json
+
+
 class RegressionManager:
     """Abstracts interaction with the regression tests file"""
 
@@ -6,17 +9,21 @@ class RegressionManager:
         self.load()
 
     def load(self) -> None:
-        with open(self.filename, "r") as f:
-            self.tests = f.readlines()
+        try:
+            with open(self.filename, "r") as f:
+                self.tests = json.load(f)
+        except (FileNotFoundError, json.decoder.JSONDecodeError):
+            self.tests = {}
 
     def save(self) -> None:
         with open(self.filename, "w") as f:
-            f.writelines(self.tests)
+            json.dump(self.tests, f, indent=4)
 
-    def add_test(self, test_id) -> None:
-        if f"{test_id}\n" not in self.tests:
-            self.tests.append(f"{test_id}\n")
+    def add_test(self, test_name: str, test_details: dict) -> None:
+        self.tests[test_name] = test_details
+        self.save()
 
-    def remove_test(self, test_id) -> None:
-        if f"{test_id}\n" in self.tests:
-            self.tests.remove(f"{test_id}\n")
+    def remove_test(self, test_name: str) -> None:
+        if test_name in self.tests:
+            del self.tests[test_name]
+            self.save()
diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json
new file mode 100644
index 00000000..9e26dfee
--- /dev/null
+++ b/agbenchmark/tests/regression/regression_tests.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt
index 57b94cd7..8af722f0 100644
--- a/agbenchmark/tests/regression/regression_tests.txt
+++ b/agbenchmark/tests/regression/regression_tests.txt
@@ -1,3 +1,14 @@
-agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0]
-agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0]
-agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0]
+{
+    "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": {
+        "difficulty": "easy",
+        "dependencies": [],
+        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]"
+    },
+    "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": {
+        "difficulty": "basic",
+        "dependencies": [
+            "test_write_file"
+        ],
+        "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]"
+    }
+}
\ No newline at end of file