more elegant marking & dependency solution

This commit is contained in:
Silen Naihin
2023-06-24 14:42:35 -04:00
parent 4fa9f72083
commit f895d54e02
12 changed files with 126 additions and 38 deletions

View File

@@ -51,15 +51,73 @@ Share your progress :)
to create a test:
```
@pytest.mark.parametrize(
    "server_response",
    ["VARIABLE"], # VARIABLE = the query/goal you provide to the model
    indirect=True,
)
@pytest.mark.(VARIABLE) # VARIABLE = category of the test
def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts
    assert os.path.exists(os.path.join(workspace, "file_to_check.txt"))
```python
import pytest
from agbenchmark.challenges.define_task_types import ChallengeData
from ..CategoryChallenge import CategoryChallenge
import os

data = ChallengeData.deserialize(
    os.path.join(os.path.dirname(__file__), "r_file_data.json")
)


class TestSomething(CategoryChallenge):
    """Testing if LLM can read a file"""

    @pytest.mark.parametrize(
        "server_response",
        [(data.task, data.mock_func)],
        indirect=True,
    )
    def test_retrieval(
        self, workspace
    ):
        # scoring logic goes here
```
All challenges inherit from a parent class which carries the category mark:
```python
@pytest.mark.basic
class BasicChallenge(Challenge):
    pass
```
If you want to add a custom mark to a Challenge, specify it directly above the test definition:
```python
@pytest.mark.other_mark
def test_retrieval(self, workspace):
    ...
```
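For instance, a custom-marked test might look like the following (a minimal sketch; `TestCustomMarked` is a hypothetical challenge used only for illustration). Because marks on a class are inherited by its subclasses, the test carries both the class-level `basic` mark and the custom one:
```python
import pytest

from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge


class TestCustomMarked(BasicChallenge):  # hypothetical challenge, for illustration only
    @pytest.mark.other_mark  # custom mark on top of the inherited class-level `basic` mark
    def test_retrieval(self, workspace):
        # selectable with either `pytest -m basic` or `pytest -m other_mark`
        ...
```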
To add a dependency to a challenge, use the following:
```python
# for declaring what a test depends on
from pytest_dependency import depends

def test1(self, request, workspace):
    depends(request, data.dependencies)

# for declaring a test as a dependency
@pytest.mark.dependency()
def test2(self, workspace):
    ...
```
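Wired together, the two halves look roughly like the write/read pair in this commit (a condensed sketch, with an explicit dependency name added for clarity; in the benchmark the two classes live in separate modules, each loading its own JSON data file):
```python
import pytest
from pytest_dependency import depends

from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge


class TestWriteFile(BasicChallenge):
    # explicit name added for clarity, so the string referenced below matches
    # regardless of class/module layout; the data files refer to "test_write_file"
    @pytest.mark.dependency(name="test_write_file")
    def test_write_file(self, workspace):
        ...


class TestReadFile(BasicChallenge):
    def test_read_file(self, request, workspace):
        # in the real tests this list comes from the challenge JSON,
        # e.g. data.dependencies == ["test_write_file"]
        depends(request, ["test_write_file"])
        ...
```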
Ordering needs to be used in combination with the dependency mark above to make sure a dependent challenge actually executes after the challenge it depends on:
```python
@pytest.mark.run(order=1)
```
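For example (a sketch assuming plain test functions), the `dependency` mark only controls skipping, while the `run(order=...)` mark from `pytest-ordering` controls execution order, so the prerequisite really does run first:
```python
import pytest
from pytest_dependency import depends


@pytest.mark.run(order=1)
@pytest.mark.dependency()
def test_write_file(workspace):
    ...  # prerequisite, forced to run first by pytest-ordering


@pytest.mark.run(order=2)
def test_read_file(request, workspace):
    depends(request, ["test_write_file"])  # skipped if the prerequisite did not pass
    ...
```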
To create a file for a challenge to be tested against, add the following to the challenge file; it creates the file before the server runs:
```python
@pytest.fixture(scope="module", autouse=True)
def setup_module(workspace):
    if data.ground.should_contain:
        Challenge.write_to_file(
            workspace, data.ground.files[0], "this is how we're doing"
        )
```
## Api

View File

@@ -4,28 +4,25 @@
## Information-retrieval challenges
Input:
- **category** (str): information-retrieval
- **task** (str): the question the agent needs to be solve.
- **category** (str[]): Category of the challenge, such as 'retrieval', 'comprehension', etc. _This is not currently used; it may be needed in the future._
- **task** (str): The task that the agent needs to solve.
- **dependencies** (str[]): The dependencies that the challenge needs to run.
- **ground** (dict): The ground truth.
  - **answer** (str): The raw text of ground truth answer
  - **should_contain** (list): the exact strings that is required in the final answer
  - **should_not_contain** (list): the exact strings that should not be in the final answer
  - **files**: files that the are used for retrieval. Can specify file here or an extension **TODO:** like .txt
- **difficulty**(str): the difficulty of this query. choices from
- **mock_func**: function to mock the agent's response. This is used for testing purposes
  - **answer** (str): The raw text of the ground truth answer.
  - **should_contain** (list): The exact strings that are required in the final answer.
  - **should_not_contain** (list): The exact strings that should not be in the final answer.
  - **files** (list): Files that are used for retrieval. A file or an extension can be specified here.
  - **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
- **info** (dict): Additional info about the challenge.
  - **difficulty** (str): The difficulty of this query.
  - **description** (str): Description of the challenge.
  - **side_effects** (str[]): Describes the side effects of the challenge.
Example:
```python
{
  "category": "retrieval",
  "category": ["basic"],
  "task": "What is the capital of America?",
  "ground": {
    "answer": "Washington",
@@ -33,11 +30,16 @@ Example:
"should_not_contain": ["New York", "Los Angeles", "San Francisco"],
"files": ["file_to_check.txt"]
},
"difficulty": "easy"
"mock_func": "write_file_mock",
"info": {
"difficulty": "easy",
"description": "Tests the writing to file",
"side_effects": ["tests if there is in fact an LLM attached"]
}
}
```
Output:
Current Output:
- **score** (float): a score in the range [0, 1]
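To make the schema concrete, here is a rough sketch of how a challenge might consume these fields (the JSON path and the `naive_score` helper are assumptions for illustration; the real benchmark goes through `ChallengeData.deserialize` and the scoring helpers on `Challenge`):
```python
import os

from agbenchmark.challenges.define_task_types import ChallengeData

data = ChallengeData.deserialize(
    os.path.join(os.path.dirname(__file__), "example_data.json")  # hypothetical data file
)


def naive_score(text: str) -> float:
    """Toy stand-in for the real scoring: penalize forbidden strings,
    reward the fraction of required strings that appear."""
    if any(bad in text for bad in (data.ground.should_not_contain or [])):
        return 0.0
    required = data.ground.should_contain or []
    if not required:
        return 1.0
    return sum(good in text for good in required) / len(required)


print(data.task)                  # e.g. "What is the capital of America?"
print(data.info.difficulty)       # e.g. "easy"
print(naive_score("Washington"))  # 1.0 if all required strings are present
```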

View File

@@ -20,6 +20,7 @@ class Ground(BaseModel):
class ChallengeData(BaseModel):
    category: List[str]
    task: str
    dependencies: List[str]
    ground: Ground
    mock_func: Optional[str] = None
    info: Info
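For orientation, the `Ground` and `Info` models that `ChallengeData` references are not shown in this hunk; judging from the field list documented above, they would look roughly like this (a sketch, not the exact file contents):
```python
from typing import List, Optional

from pydantic import BaseModel


class Info(BaseModel):
    # sketched from the README field list; maps to the "info" block of a challenge JSON
    difficulty: str
    description: str
    side_effects: List[str]


class Ground(BaseModel):
    # sketched from the README field list; maps to the "ground" block of a challenge JSON
    answer: str
    should_contain: Optional[List[str]] = None
    should_not_contain: Optional[List[str]] = None
    files: List[str]
```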

View File

@@ -1,5 +1,6 @@
{
"category": ["basic"],
"dependencies": ["test_write_file"],
"task": "What is the capital of America?",
"ground": {
"answer": "Washington",

View File

@@ -0,0 +1,7 @@
import pytest
from agbenchmark.Challenge import Challenge


@pytest.mark.basic
class BasicChallenge(Challenge):
    pass
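Other categories would presumably follow the same pattern; for instance, a hypothetical retrieval parent class (not part of this commit) could look like:
```python
import pytest

from agbenchmark.Challenge import Challenge


@pytest.mark.retrieval  # hypothetical category mark, mirroring `basic`
class RetrievalChallenge(Challenge):
    pass
```
Because the mark lives on the parent class, every challenge that subclasses it can be selected with `pytest -m retrieval` (or `-m basic` above) without repeating the decorator.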

View File

@@ -1,6 +1,7 @@
{
"category": ["basic"],
"task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
"dependencies": ["test_write_file"],
"ground": {
"answer": "random string: this is how we're doing",
"should_contain": ["random string: this is how we're doing"],

View File

@@ -1,7 +1,9 @@
import pytest
from agbenchmark.challenges.define_task_types import ChallengeData
from agbenchmark.Challenge import Challenge
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os
from pytest_dependency import depends
data = ChallengeData.deserialize(
    os.path.join(os.path.dirname(__file__), "r_file_data.json")
@@ -16,7 +18,7 @@ def setup_module(workspace):
)
class TestReadFile(Challenge):
class TestReadFile(BasicChallenge):
"""Testing if LLM can read a file"""
@pytest.mark.parametrize(
@@ -24,11 +26,9 @@ class TestReadFile(Challenge):
        [(data.task, data.mock_func)],
        indirect=True,
    )
    @pytest.mark.basic
    @pytest.mark.dependency(depends=["write_file"])
    def test_retrieval(
        self, workspace
    ):  # create_file simply there for the function to depend on the fixture
    def test_read_file(self, request, workspace):
        depends(request, data.dependencies)
        file = self.open_file(workspace, data.ground.files[0])
        score = self.scoring(file, data.ground)

View File

@@ -1,6 +1,7 @@
{
"category": ["basic"],
"task": "What is the capital of America?",
"dependencies": [],
"ground": {
"answer": "Washington",
"should_contain": ["Washington"],

View File

@@ -1,6 +1,6 @@
import pytest
from agbenchmark.challenges.define_task_types import ChallengeData
from agbenchmark.Challenge import Challenge
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os
data = ChallengeData.deserialize(
@@ -8,7 +8,7 @@ data = ChallengeData.deserialize(
)
class TestWriteFile(Challenge):
class TestWriteFile(BasicChallenge):
"""Testing if LLM can write to a file"""
@pytest.mark.parametrize(
@@ -16,9 +16,8 @@ class TestWriteFile(Challenge):
        [(data.task, data.mock_func)],
        indirect=True,
    )
    @pytest.mark.basic
    @pytest.mark.dependency(name="write_file")
    def test_retrieval(self, workspace):
    @pytest.mark.dependency()
    def test_write_file(self, workspace):
        file = self.open_file(workspace, data.ground.files[0])
        score = self.scoring(file, data.ground)

View File

@@ -1 +1,3 @@
agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0]
agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0]
agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0]

poetry.lock (generated)
View File

@@ -608,6 +608,21 @@ files = [
[package.dependencies]
pytest = ">=3.6.0"
[[package]]
name = "pytest-ordering"
version = "0.6"
description = "pytest plugin to run your tests in a specific order"
optional = false
python-versions = "*"
files = [
{file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"},
{file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"},
{file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"},
]
[package.dependencies]
pytest = "*"
[[package]]
name = "requests"
version = "2.31.0"
@@ -778,4 +793,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d"
content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7"

View File

@@ -15,6 +15,7 @@ requests = "^2.31.0"
openai = "^0.27.8"
pydantic = "^1.10.9"
pytest-dependency = "^0.5.1"
pytest-ordering = "^0.6"
[build-system]