mirror of https://github.com/aljazceru/Auto-GPT.git
synced 2025-12-31 04:44:33 +01:00
more elegant marking & dependency solution
74 README.md
@@ -51,15 +51,73 @@ Share your progress :)
to create a test:

```
@pytest.mark.parametrize(
    "server_response",
    ["VARIABLE"], # VARIABLE = the query/goal you provide to the model
    indirect=True,
```python
import pytest
from agbenchmark.challenges.define_task_types import ChallengeData
from ..CategoryChallenge import CategoryChallenge
import os

data = ChallengeData.deserialize(
    os.path.join(os.path.dirname(__file__), "r_file_data.json")
)

@pytest.mark.(VARIABLE) # VARIABLE = category of the test
def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts
    assert os.path.exists(os.path.join(workspace, "file_to_check.txt"))

class TestSomething(CategoryChallenge):
    """Testing if LLM can read a file"""

    @pytest.mark.parametrize(
        "server_response",
        [(data.task, data.mock_func)],
        indirect=True,
    )
    def test_retrieval(
        self, workspace
    ):
        # scoring logic goes here
```

All challenges will inherit from a parent class which has the mark:

```python
@pytest.mark.basic
class BasicChallenge(Challenge):
    pass
```

If you want to add a custom mark to a Challenge, you must specify it before the test definition:

```python
@pytest.mark.other_mark
def test_retrieval(self, workspace):
```
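
For illustration, a minimal sketch (not taken from this commit) of how a custom mark could sit next to the inherited class-level mark, assuming a hypothetical `TestExample` challenge and that `other_mark` is just an example name:

```python
import pytest

from agbenchmark.Challenge import Challenge


@pytest.mark.basic
class BasicChallenge(Challenge):
    pass


class TestExample(BasicChallenge):
    """Hypothetical challenge used only to illustrate mark placement."""

    @pytest.mark.other_mark  # custom mark goes directly above the test definition
    def test_example(self, workspace):  # the basic mark is inherited from BasicChallenge
        ...
```

Running `pytest -m other_mark` would then select this test; if the mark is not registered in the pytest config, pytest will only emit an unknown-mark warning.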

To add a dependency to a challenge, use the following:

```python
# for defining what a test depends on
from pytest_dependency import depends

def test1(self, request, workspace):
    depends(request, data.dependencies)

# for defining a test as a dependency
@pytest.mark.dependency()
def test2
```
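
Since the snippet above is abbreviated, here is a self-contained sketch (not from this commit) of the two halves of the pattern, using hypothetical module-level tests and an explicit dependency name:

```python
import pytest
from pytest_dependency import depends


@pytest.mark.dependency(name="example_write")  # registers this test as a dependency target
def test_example_write():
    assert True


def test_example_read(request):
    # skipped automatically unless "example_write" already ran and passed;
    # in a challenge, the list of names would come from data.dependencies
    depends(request, ["example_write"])
    assert True
```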

Ordering of challenges needs to be used in combination with the above to make sure a dependent challenge executes afterwards:

```python
@pytest.mark.run(order=1)
```
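
As a hedged sketch of how the ordering mark might combine with a dependency (assuming hypothetical tests and the `pytest-ordering` plugin that is added to `pyproject.toml` further down in this diff):

```python
import pytest
from pytest_dependency import depends


@pytest.mark.run(order=1)  # pytest-ordering: run this test first
@pytest.mark.dependency(name="example_write")
def test_example_write():
    assert True


@pytest.mark.run(order=2)  # runs afterwards, so the dependency has had a chance to pass
def test_example_read(request):
    depends(request, ["example_write"])
    assert True
```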

To create a file for a challenge to test against, add the following to the challenge file; it creates the file before the server runs:

```python
@pytest.fixture(scope="module", autouse=True)
def setup_module(workspace):
    if data.ground.should_contain:
        Challenge.write_to_file(
            workspace, data.ground.files[0], "this is how we're doing"
        )
```

## Api

@@ -4,28 +4,25 @@

Input:

- **category** (str): information-retrieval
- **difficulty**(str): the difficulty of this query. choices from

## Information-retrieval challenges

Input:

- **category** (str): information-retrieval
- **task** (str): the question the agent needs to be solve.
- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_
- **task** (str): The task that the agent needs to solve.
- **dependencies** (str[]): The dependencies that the challenge needs to run.
- **ground** (dict): The ground truth.
  - **answer** (str): The raw text of ground truth answer
  - **should_contain** (list): the exact strings that is required in the final answer
  - **should_not_contain** (list): the exact strings that should not be in the final answer
  - **files**: files that the are used for retrieval. Can specify file here or an extension **TODO:** like .txt
- **difficulty**(str): the difficulty of this query. choices from
- **mock_func**: function to mock the agent's response. This is used for testing purposes
  - **answer** (str): The raw text of the ground truth answer.
  - **should_contain** (list): The exact strings that are required in the final answer.
  - **should_not_contain** (list): The exact strings that should not be in the final answer.
  - **files** (list): Files that are used for retrieval. Can specify file here or an extension.
- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
- **info** (dict): Additional info about the challenge.
  - **difficulty** (str): The difficulty of this query.
  - **description** (str): Description of the challenge.
  - **side_effects** (str[]): Describes the effects of the challenge.

Example:

```python
{
    "category": "retrieval",
    "category": ["basic"],
    "task": "What is the capital of America?",
    "ground": {
        "answer": "Washington",
@@ -33,11 +30,16 @@ Example:
        "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
        "files": ["file_to_check.txt"]
    },
    "difficulty": "easy"
    "mock_func": "write_file_mock",
    "info": {
        "difficulty": "easy",
        "description": "Tests the writing to file",
        "side_effects": ["tests if there is in fact an LLM attached"]
    }
}

```
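
For orientation only, a minimal sketch of how a data file like the one above could be loaded, assuming a hypothetical `example_data.json` that contains all required fields (`category`, `task`, `dependencies`, `ground`, `info`) and sits next to the test module:

```python
import os

from agbenchmark.challenges.define_task_types import ChallengeData

# hypothetical file name; the real challenges use files such as r_file_data.json
data = ChallengeData.deserialize(
    os.path.join(os.path.dirname(__file__), "example_data.json")
)

print(data.task)                   # e.g. "What is the capital of America?"
print(data.dependencies)           # e.g. ["test_write_file"]
print(data.ground.should_contain)  # e.g. ["Washington"]
```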

Output:
Current Output:

- **score** (float): scores range from [0, 1]

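A hedged sketch of how a challenge could turn that score into a pass/fail assertion, reusing the `open_file`/`scoring` helpers seen in the test files below, and assuming a hypothetical data file and that a full score of 1 is required (the real pass criterion may allow partial credit):

```python
import os

import pytest

from agbenchmark.challenges.define_task_types import ChallengeData
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge

# hypothetical data file for this sketch
data = ChallengeData.deserialize(
    os.path.join(os.path.dirname(__file__), "example_data.json")
)


class TestExampleScore(BasicChallenge):
    @pytest.mark.parametrize(
        "server_response", [(data.task, data.mock_func)], indirect=True
    )
    def test_example_score(self, workspace):
        file = self.open_file(workspace, data.ground.files[0])
        score = self.scoring(file, data.ground)
        assert 0 <= score <= 1  # scores are normalised to [0, 1]
        assert score == 1       # assumed pass criterion for this sketch
```
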
@@ -20,6 +20,7 @@ class Ground(BaseModel):
class ChallengeData(BaseModel):
    category: List[str]
    task: str
    dependencies: List[str]
    ground: Ground
    mock_func: Optional[str] = None
    info: Info

@@ -1,5 +1,6 @@
{
    "category": ["basic"],
    "dependencies": ["test_write_file"],
    "task": "What is the capital of America?",
    "ground": {
        "answer": "Washington",

7 agbenchmark/tests/basic_abilities/BasicChallenge.py Normal file
@@ -0,0 +1,7 @@
import pytest
from agbenchmark.Challenge import Challenge


@pytest.mark.basic
class BasicChallenge(Challenge):
    pass

@@ -1,6 +1,7 @@
{
    "category": ["basic"],
    "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
    "dependencies": ["test_write_file"],
    "ground": {
        "answer": "random string: this is how we're doing",
        "should_contain": ["random string: this is how we're doing"],

@@ -1,7 +1,9 @@
import pytest
from agbenchmark.challenges.define_task_types import ChallengeData
from agbenchmark.Challenge import Challenge
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os
from pytest_dependency import depends

data = ChallengeData.deserialize(
    os.path.join(os.path.dirname(__file__), "r_file_data.json")
@@ -16,7 +18,7 @@ def setup_module(workspace):
)


class TestReadFile(Challenge):
class TestReadFile(BasicChallenge):
    """Testing if LLM can read a file"""

    @pytest.mark.parametrize(
@@ -24,11 +26,9 @@ class TestReadFile(Challenge):
        [(data.task, data.mock_func)],
        indirect=True,
    )
    @pytest.mark.basic
    @pytest.mark.dependency(depends=["write_file"])
    def test_retrieval(
        self, workspace
    ):  # create_file simply there for the function to depend on the fixture
    def test_read_file(self, request, workspace):
        depends(request, data.dependencies)

        file = self.open_file(workspace, data.ground.files[0])

        score = self.scoring(file, data.ground)

@@ -1,6 +1,7 @@
{
    "category": ["basic"],
    "task": "What is the capital of America?",
    "dependencies": [],
    "ground": {
        "answer": "Washington",
        "should_contain": ["Washington"],

@@ -1,6 +1,6 @@
import pytest
from agbenchmark.challenges.define_task_types import ChallengeData
from agbenchmark.Challenge import Challenge
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os

data = ChallengeData.deserialize(
@@ -8,7 +8,7 @@ data = ChallengeData.deserialize(
)


class TestWriteFile(Challenge):
class TestWriteFile(BasicChallenge):
    """Testing if LLM can write to a file"""

    @pytest.mark.parametrize(
@@ -16,9 +16,8 @@ class TestWriteFile(Challenge):
        [(data.task, data.mock_func)],
        indirect=True,
    )
    @pytest.mark.basic
    @pytest.mark.dependency(name="write_file")
    def test_retrieval(self, workspace):
    @pytest.mark.dependency()
    def test_write_file(self, workspace):
        file = self.open_file(workspace, data.ground.files[0])

        score = self.scoring(file, data.ground)

@@ -1 +1,3 @@
agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0]
agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0]
agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0]

17 poetry.lock generated
@@ -608,6 +608,21 @@ files = [
[package.dependencies]
pytest = ">=3.6.0"

[[package]]
name = "pytest-ordering"
version = "0.6"
description = "pytest plugin to run your tests in a specific order"
optional = false
python-versions = "*"
files = [
    {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"},
    {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"},
    {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"},
]

[package.dependencies]
pytest = "*"

[[package]]
name = "requests"
version = "2.31.0"
@@ -778,4 +793,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d"
content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7"

@@ -15,6 +15,7 @@ requests = "^2.31.0"
openai = "^0.27.8"
pydantic = "^1.10.9"
pytest-dependency = "^0.5.1"
pytest-ordering = "^0.6"


[build-system]