diff --git a/README.md b/README.md
index 0a8d119a..0ad0cf34 100644
--- a/README.md
+++ b/README.md
@@ -51,15 +51,83 @@ Share your progress :)
 
 to create a test:
 
-```
-@pytest.mark.parametrize(
-"server_response",
-["VARIABLE"], # VARIABLE = the query/goal you provide to the model
-indirect=True,
+```python
+import pytest
+from agbenchmark.challenges.define_task_types import ChallengeData
+from ..CategoryChallenge import CategoryChallenge
+import os
+
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "r_file_data.json")
 )
-@pytest.mark.(VARIABLE) # VARIABLE = category of the test
-def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts
-assert os.path.exists(os.path.join(workspace, "file_to_check.txt"))
+
+class TestSomething(CategoryChallenge):
+    """Testing if LLM can read a file"""
+
+    @pytest.mark.parametrize(
+        "server_response",
+        [(data.task, data.mock_func)],
+        indirect=True,
+    )
+    def test_retrieval(
+        self, workspace
+    ):
+        ...  # scoring logic goes here
+```
+
+All challenges inherit from a parent class that carries the category mark:
+
+```python
+@pytest.mark.basic
+class BasicChallenge(Challenge):
+    pass
+```
+
+If you want to add a custom mark to a Challenge, specify it right above the test definition:
+
+```python
+@pytest.mark.other_mark
+def test_retrieval(self, workspace): ...
+```
+
+To add a dependency to a challenge, use the following:
+
+```python
+# to declare what a test depends on
+from pytest_dependency import depends
+
+def test1(self, request, workspace):
+    depends(request, data.dependencies)
+
+# to mark a test as a dependency for other tests
+@pytest.mark.dependency()
+def test2(self, workspace): ...
+```
+
+Challenge ordering must be used in combination with the dependency mechanism above to make sure a dependent test actually executes after the test it depends on:
+
+```python
+@pytest.mark.run(order=1)
+```
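+
+For example, a sketch of a test that depends on `test_write_file` and is ordered to run after it (the names here are illustrative):
+
+```python
+@pytest.mark.run(order=2)  # illustrative: runs after the order=1 tests
+def test_read_file(self, request, workspace):
+    depends(request, data.dependencies)
+    ...  # scoring logic goes here
+```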
+
+To create a file for a challenge to test, add the following fixture to the challenge file; it creates the file before the server runs:
+
+```python
+@pytest.fixture(scope="module", autouse=True)
+def setup_module(workspace):
+    if data.ground.should_contain:
+        Challenge.write_to_file(
+            workspace, data.ground.files[0], "this is how we're doing"
+        )
 ```
 
 ## Api
diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md
index 50efe2c4..d5229e93 100644
--- a/agbenchmark/challenges/README.md
+++ b/agbenchmark/challenges/README.md
@@ -4,28 +4,26 @@
 
 Input:
 
-- **category** (str): information-retrieval
-- **difficulty**(str): the difficulty of this query. choices from
-
-## Information-retrieval challenges
-
-Input:
-
-- **category** (str): information-retrieval
-- **task** (str): the question the agent needs to be solve.
+- **category** (str[]): Categories of the challenge, such as 'retrieval' or 'comprehension'. _Not currently used; it may be needed in the future._
+- **task** (str): The task that the agent needs to solve.
+- **dependencies** (str[]): Names of the tests that must pass before this challenge runs.
 - **ground** (dict): The ground truth.
-  - **answer** (str): The raw text of ground truth answer
-  - **should_contain** (list): the exact strings that is required in the final answer
-  - **should_not_contain** (list): the exact strings that should not be in the final answer
-  - **files**: files that the are used for retrieval. Can specify file here or an extension **TODO:** like .txt
-- **difficulty**(str): the difficulty of this query. choices from
-- **mock_func**: function to mock the agent's response. This is used for testing purposes
+  - **answer** (str): The raw text of the ground truth answer.
+  - **should_contain** (list): The exact strings that are required in the final answer.
+  - **should_not_contain** (list): The exact strings that should not be in the final answer.
+  - **files** (list): Files used for retrieval. You can specify either a file name or an extension here.
+- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
+- **info** (dict): Additional info about the challenge.
+  - **difficulty** (str): The difficulty of this query.
+  - **description** (str): Description of the challenge.
+  - **side_effects** (str[]): Describes the side effects of the challenge.
 
 Example:
 
 ```python
 {
-    "category": "retrieval",
+    "category": ["basic"],
+    "dependencies": ["test_write_file"],
     "task": "What is the capital of America?",
     "ground": {
         "answer": "Washington",
@@ -33,11 +31,30 @@ Example:
         "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
         "files": ["file_to_check.txt"]
     },
-    "difficulty": "easy"
+    "mock_func": "write_file_mock",
+    "info": {
+        "difficulty": "easy",
+        "description": "Tests writing to a file",
+        "side_effects": ["tests if there is in fact an LLM attached"]
+    }
 }
 ```
 
-Output:
+Current Output:
 
 - **score** (float): scores range from [0, 1]
 
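+To give a sense of how a score can be derived from `ground`, here is a minimal sketch (the actual `Challenge.scoring` implementation may differ):
+
+```python
+def scoring(content: str, ground) -> float:
+    # sketch only -- the real Challenge.scoring may weigh things differently
+    # any forbidden string in the answer fails the challenge outright
+    if any(bad in content for bad in ground.should_not_contain or []):
+        return 0.0
+    required = ground.should_contain or []
+    if not required:
+        return 1.0
+    # otherwise, the score is the fraction of required strings present
+    return sum(s in content for s in required) / len(required)
+```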
diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py
index 879a46af..69467121 100644
--- a/agbenchmark/challenges/define_task_types.py
+++ b/agbenchmark/challenges/define_task_types.py
@@ -20,6 +20,7 @@ class Ground(BaseModel):
 class ChallengeData(BaseModel):
     category: List[str]
     task: str
+    dependencies: List[str]
     ground: Ground
     mock_func: Optional[str] = None
     info: Info
diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json
index 08b74d1b..fe05b6d5 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_data.json
+++ b/agbenchmark/challenges/retrieval/r1/r1_data.json
@@ -1,5 +1,6 @@
 {
   "category": ["basic"],
+  "dependencies": ["test_write_file"],
   "task": "What is the capital of America?",
   "ground": {
     "answer": "Washington",
diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py
new file mode 100644
index 00000000..56320740
--- /dev/null
+++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py
@@ -0,0 +1,7 @@
+import pytest
+from agbenchmark.Challenge import Challenge
+
+
+@pytest.mark.basic
+class BasicChallenge(Challenge):
+    pass
diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
index 55319ddf..8c5ef62d 100644
--- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
+++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
@@ -1,6 +1,7 @@
 {
   "category": ["basic"],
   "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
+  "dependencies": ["test_write_file"],
   "ground": {
     "answer": "random string: this is how we're doing",
     "should_contain": ["random string: this is how we're doing"],
diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
index ea794281..03b2d6ca 100644
--- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
+++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
@@ -1,7 +1,9 @@
 import pytest
 from agbenchmark.challenges.define_task_types import ChallengeData
 from agbenchmark.Challenge import Challenge
+from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
 import os
+from pytest_dependency import depends
 
 data = ChallengeData.deserialize(
     os.path.join(os.path.dirname(__file__), "r_file_data.json")
@@ -16,7 +18,7 @@ def setup_module(workspace):
     )
 
 
-class TestReadFile(Challenge):
+class TestReadFile(BasicChallenge):
     """Testing if LLM can read a file"""
 
     @pytest.mark.parametrize(
@@ -24,11 +26,9 @@ class TestReadFile(Challenge):
         [(data.task, data.mock_func)],
         indirect=True,
     )
-    @pytest.mark.basic
-    @pytest.mark.dependency(depends=["write_file"])
-    def test_retrieval(
-        self, workspace
-    ):  # create_file simply there for the function to depend on the fixture
+    def test_read_file(self, request, workspace):
+        depends(request, data.dependencies)
+
         file = self.open_file(workspace, data.ground.files[0])
 
         score = self.scoring(file, data.ground)
diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json
index 4aaa1347..562d1c36 100644
--- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json
+++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json
@@ -1,6 +1,7 @@
 {
   "category": ["basic"],
   "task": "What is the capital of America?",
+  "dependencies": [],
   "ground": {
     "answer": "Washington",
     "should_contain": ["Washington"],
diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
index b2c559c9..b09162e3 100644
--- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
+++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
@@ -1,6 +1,6 @@
 import pytest
 from agbenchmark.challenges.define_task_types import ChallengeData
-from agbenchmark.Challenge import Challenge
+from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
 import os
 
 data = ChallengeData.deserialize(
@@ -8,7 +8,7 @@ data = ChallengeData.deserialize(
 )
 
 
-class TestWriteFile(Challenge):
+class TestWriteFile(BasicChallenge):
     """Testing if LLM can write to a file"""
 
     @pytest.mark.parametrize(
@@ -16,9 +16,8 @@ class TestWriteFile(Challenge):
         [(data.task, data.mock_func)],
         indirect=True,
     )
-    @pytest.mark.basic
-    @pytest.mark.dependency(name="write_file")
-    def test_retrieval(self, workspace):
+    @pytest.mark.dependency()
+    def test_write_file(self, workspace):
         file = self.open_file(workspace, data.ground.files[0])
 
         score = self.scoring(file, data.ground)
diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt
index 84e625af..b831003f 100644
--- a/agbenchmark/tests/regression/regression_tests.txt
+++ b/agbenchmark/tests/regression/regression_tests.txt
@@ -1 +1,3 @@
 agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0]
+agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0]
+agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0]
diff --git a/poetry.lock b/poetry.lock
index 3bc37622..f6f24c5f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -608,6 +608,21 @@ files = [
 [package.dependencies]
 pytest = ">=3.6.0"
 
+[[package]]
+name = "pytest-ordering"
+version = "0.6"
+description = "pytest plugin to run your tests in a specific order"
+optional = false
+python-versions = "*"
+files = [
+    {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"},
+    {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"},
"sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, + {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "requests" version = "2.31.0" @@ -778,4 +793,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" +content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" diff --git a/pyproject.toml b/pyproject.toml index 087ac844..faee61c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" +pytest-ordering = "^0.6" [build-system]