more elegant marking & dependency solution

This commit is contained in:
Silen Naihin
2023-06-24 14:42:35 -04:00
parent 4fa9f72083
commit f895d54e02
12 changed files with 126 additions and 38 deletions

View File

@@ -51,15 +51,73 @@ Share your progress :)
to create a test:
```
@pytest.mark.parametrize(
    "server_response",
    ["VARIABLE"], # VARIABLE = the query/goal you provide to the model
    indirect=True,
)
@pytest.mark.(VARIABLE) # VARIABLE = category of the test
def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts
    assert os.path.exists(os.path.join(workspace, "file_to_check.txt"))
```python
import pytest
from agbenchmark.challenges.define_task_types import ChallengeData
from ..CategoryChallenge import CategoryChallenge
import os

data = ChallengeData.deserialize(
    os.path.join(os.path.dirname(__file__), "r_file_data.json")
)


class TestSomething(CategoryChallenge):
    """Testing if LLM can read a file"""

    @pytest.mark.parametrize(
        "server_response",
        [(data.task, data.mock_func)],
        indirect=True,
    )
    def test_retrieval(
        self, workspace
    ):
        # scoring logic goes here
```
All challenges inherit from a parent class which carries the category mark:
```python
@pytest.mark.basic
class BasicChallenge(Challenge):
    pass
```
If you want to add a custom mark to a Challenge, specify it directly above the test definition:
```python
@pytest.mark.other_mark
def test_retrieval(self, workspace):
    ...
```
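For instance, a custom-marked test might look like the following (a minimal sketch; `TestCustomMarked` is a hypothetical challenge used only for illustration). Because marks on a class are inherited by its subclasses, the test carries both the class-level `basic` mark and the custom one:
```python
import pytest

from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge


class TestCustomMarked(BasicChallenge):  # hypothetical challenge, for illustration only
    @pytest.mark.other_mark  # custom mark on top of the inherited class-level `basic` mark
    def test_retrieval(self, workspace):
        # selectable with either `pytest -m basic` or `pytest -m other_mark`
        ...
```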
To add a dependency to a challenge, use the following:
```python
# for declaring what a test depends on
from pytest_dependency import depends

def test1(self, request, workspace):
    depends(request, data.dependencies)

# for declaring a test as a dependency
@pytest.mark.dependency()
def test2(self, workspace):
    ...
```
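Wired together, the two halves look roughly like the write/read pair in this commit (a condensed sketch, with an explicit dependency name added for clarity; in the benchmark the two classes live in separate modules, each loading its own JSON data file):
```python
import pytest
from pytest_dependency import depends

from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge


class TestWriteFile(BasicChallenge):
    # explicit name added for clarity, so the string referenced below matches
    # regardless of class/module layout; the data files refer to "test_write_file"
    @pytest.mark.dependency(name="test_write_file")
    def test_write_file(self, workspace):
        ...


class TestReadFile(BasicChallenge):
    def test_read_file(self, request, workspace):
        # in the real tests this list comes from the challenge JSON,
        # e.g. data.dependencies == ["test_write_file"]
        depends(request, ["test_write_file"])
        ...
```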
Ordering needs to be used in combination with the dependency mark above to make sure a dependent challenge actually executes after the challenge it depends on:
```python
@pytest.mark.run(order=1)
```
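For example (a sketch assuming plain test functions), the `dependency` mark only controls skipping, while the `run(order=...)` mark from `pytest-ordering` controls execution order, so the prerequisite really does run first:
```python
import pytest
from pytest_dependency import depends


@pytest.mark.run(order=1)
@pytest.mark.dependency()
def test_write_file(workspace):
    ...  # prerequisite, forced to run first by pytest-ordering


@pytest.mark.run(order=2)
def test_read_file(request, workspace):
    depends(request, ["test_write_file"])  # skipped if the prerequisite did not pass
    ...
```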
To create a file for a challenge to be tested against, add the following to the challenge file; it creates the file before the server runs:
```python
@pytest.fixture(scope="module", autouse=True)
def setup_module(workspace):
    if data.ground.should_contain:
        Challenge.write_to_file(
            workspace, data.ground.files[0], "this is how we're doing"
        )
```
## Api

View File

@@ -4,28 +4,25 @@
## Information-retrieval challenges
Input:
- **category** (str): information-retrieval
- **task** (str): the question the agent needs to be solve.
- **category** (str[]): Category of the challenge, such as 'retrieval', 'comprehension', etc. _This is not currently used; it may be needed in the future._
- **task** (str): The task that the agent needs to solve.
- **dependencies** (str[]): The dependencies that the challenge needs to run.
- **ground** (dict): The ground truth.
  - **answer** (str): The raw text of ground truth answer
  - **should_contain** (list): the exact strings that is required in the final answer
  - **should_not_contain** (list): the exact strings that should not be in the final answer
  - **files**: files that the are used for retrieval. Can specify file here or an extension **TODO:** like .txt
- **difficulty**(str): the difficulty of this query. choices from
- **mock_func**: function to mock the agent's response. This is used for testing purposes
  - **answer** (str): The raw text of the ground truth answer.
  - **should_contain** (list): The exact strings that are required in the final answer.
  - **should_not_contain** (list): The exact strings that should not be in the final answer.
  - **files** (list): Files that are used for retrieval. A file or an extension can be specified here.
  - **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
- **info** (dict): Additional info about the challenge.
  - **difficulty** (str): The difficulty of this query.
  - **description** (str): Description of the challenge.
  - **side_effects** (str[]): Describes the side effects of the challenge.
Example:
```python
{
  "category": "retrieval",
  "category": ["basic"],
  "task": "What is the capital of America?",
  "ground": {
    "answer": "Washington",
@@ -33,11 +30,16 @@ Example:
"should_not_contain": ["New York", "Los Angeles", "San Francisco"],
"files": ["file_to_check.txt"]
},
"difficulty": "easy"
"mock_func": "write_file_mock",
"info": {
"difficulty": "easy",
"description": "Tests the writing to file",
"side_effects": ["tests if there is in fact an LLM attached"]
}
}
```
Output:
Current Output:
- **score** (float): a score in the range [0, 1]
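To make the schema concrete, here is a rough sketch of how a challenge might consume these fields (the JSON path and the `naive_score` helper are assumptions for illustration; the real benchmark goes through `ChallengeData.deserialize` and the scoring helpers on `Challenge`):
```python
import os

from agbenchmark.challenges.define_task_types import ChallengeData

data = ChallengeData.deserialize(
    os.path.join(os.path.dirname(__file__), "example_data.json")  # hypothetical data file
)


def naive_score(text: str) -> float:
    """Toy stand-in for the real scoring: penalize forbidden strings,
    reward the fraction of required strings that appear."""
    if any(bad in text for bad in (data.ground.should_not_contain or [])):
        return 0.0
    required = data.ground.should_contain or []
    if not required:
        return 1.0
    return sum(good in text for good in required) / len(required)


print(data.task)                  # e.g. "What is the capital of America?"
print(data.info.difficulty)       # e.g. "easy"
print(naive_score("Washington"))  # 1.0 if all required strings are present
```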

View File

@@ -20,6 +20,7 @@ class Ground(BaseModel):
class ChallengeData(BaseModel):
    category: List[str]
    task: str
    dependencies: List[str]
    ground: Ground
    mock_func: Optional[str] = None
    info: Info
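For orientation, the `Ground` and `Info` models that `ChallengeData` references are not shown in this hunk; judging from the field list documented above, they would look roughly like this (a sketch, not the exact file contents):
```python
from typing import List, Optional

from pydantic import BaseModel


class Info(BaseModel):
    # sketched from the README field list; maps to the "info" block of a challenge JSON
    difficulty: str
    description: str
    side_effects: List[str]


class Ground(BaseModel):
    # sketched from the README field list; maps to the "ground" block of a challenge JSON
    answer: str
    should_contain: Optional[List[str]] = None
    should_not_contain: Optional[List[str]] = None
    files: List[str]
```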

View File

@@ -1,5 +1,6 @@
{
"category": ["basic"],
"dependencies": ["test_write_file"],
"task": "What is the capital of America?",
"ground": {
"answer": "Washington",

View File

@@ -0,0 +1,7 @@
import pytest
from agbenchmark.Challenge import Challenge


@pytest.mark.basic
class BasicChallenge(Challenge):
    pass
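Other categories would presumably follow the same pattern; for instance, a hypothetical retrieval parent class (not part of this commit) could look like:
```python
import pytest

from agbenchmark.Challenge import Challenge


@pytest.mark.retrieval  # hypothetical category mark, mirroring `basic`
class RetrievalChallenge(Challenge):
    pass
```
Because the mark lives on the parent class, every challenge that subclasses it can be selected with `pytest -m retrieval` (or `-m basic` above) without repeating the decorator.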

View File

@@ -1,6 +1,7 @@
{
"category": ["basic"],
"task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
"dependencies": ["test_write_file"],
"ground": {
"answer": "random string: this is how we're doing",
"should_contain": ["random string: this is how we're doing"],

View File

@@ -1,7 +1,9 @@
import pytest
from agbenchmark.challenges.define_task_types import ChallengeData
from agbenchmark.Challenge import Challenge
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os
from pytest_dependency import depends
data = ChallengeData.deserialize(
    os.path.join(os.path.dirname(__file__), "r_file_data.json")
@@ -16,7 +18,7 @@ def setup_module(workspace):
)
class TestReadFile(Challenge):
class TestReadFile(BasicChallenge):
"""Testing if LLM can read a file"""
@pytest.mark.parametrize(
@@ -24,11 +26,9 @@ class TestReadFile(Challenge):
        [(data.task, data.mock_func)],
        indirect=True,
    )
    @pytest.mark.basic
    @pytest.mark.dependency(depends=["write_file"])
    def test_retrieval(
        self, workspace
    ):  # create_file simply there for the function to depend on the fixture
    def test_read_file(self, request, workspace):
        depends(request, data.dependencies)
        file = self.open_file(workspace, data.ground.files[0])
        score = self.scoring(file, data.ground)

View File

@@ -1,6 +1,7 @@
{
"category": ["basic"],
"task": "What is the capital of America?",
"dependencies": [],
"ground": {
"answer": "Washington",
"should_contain": ["Washington"],

View File

@@ -1,6 +1,6 @@
import pytest
from agbenchmark.challenges.define_task_types import ChallengeData
from agbenchmark.Challenge import Challenge
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os
data = ChallengeData.deserialize(
@@ -8,7 +8,7 @@ data = ChallengeData.deserialize(
)
class TestWriteFile(Challenge):
class TestWriteFile(BasicChallenge):
"""Testing if LLM can write to a file"""
@pytest.mark.parametrize(
@@ -16,9 +16,8 @@ class TestWriteFile(Challenge):
        [(data.task, data.mock_func)],
        indirect=True,
    )
    @pytest.mark.basic
    @pytest.mark.dependency(name="write_file")
    def test_retrieval(self, workspace):
    @pytest.mark.dependency()
    def test_write_file(self, workspace):
        file = self.open_file(workspace, data.ground.files[0])
        score = self.scoring(file, data.ground)

View File

@@ -1 +1,3 @@
agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0]
agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0]
agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0]

poetry.lock (generated)
View File

@@ -608,6 +608,21 @@ files = [
[package.dependencies]
pytest = ">=3.6.0"
[[package]]
name = "pytest-ordering"
version = "0.6"
description = "pytest plugin to run your tests in a specific order"
optional = false
python-versions = "*"
files = [
{file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"},
{file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"},
{file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"},
]
[package.dependencies]
pytest = "*"
[[package]]
name = "requests"
version = "2.31.0"
@@ -778,4 +793,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d"
content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7"

View File

@@ -15,6 +15,7 @@ requests = "^2.31.0"
openai = "^0.27.8"
pydantic = "^1.10.9"
pytest-dependency = "^0.5.1"
pytest-ordering = "^0.6"
[build-system]