Mirror of https://github.com/aljazceru/Auto-GPT.git, synced 2025-12-26 02:14:27 +01:00
basic challenges, more ChallengeData structure
@@ -1,5 +1,6 @@
 import os
 from typing import Optional
+from agbenchmark.challenges.define_task_types import Ground


 class Challenge:
@@ -30,3 +31,24 @@ class Challenge:
             for filename in os.listdir(workspace)
             if os.path.isfile(os.path.join(workspace, filename))
         ]
+
+    def scoring(self, content: str, ground: Ground):
+        if ground.should_contain:
+            for should_contain_word in ground.should_contain:
+                if should_contain_word not in content:
+                    return 0.0
+                else:
+                    print(
+                        f"Word that should exist: {should_contain_word} exists in the content"
+                    )
+
+        if ground.should_not_contain:
+            for should_not_contain_word in ground.should_not_contain:
+                if should_not_contain_word in content:
+                    return 0.0
+                else:
+                    print(
+                        f"Word that should not exist: {should_not_contain_word} does not exist in the content"
+                    )
+
+        return 1.0
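The new scoring is all-or-nothing: any missing should_contain word, or any present should_not_contain word, short-circuits to 0.0; otherwise the challenge scores 1.0. A minimal, self-contained sketch of those semantics, using a stand-in dataclass instead of the real pydantic Ground (all values invented for illustration):

# Illustrative only: standalone re-statement of the all-or-nothing scoring.
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class GroundStub:
    should_contain: Optional[List[str]] = None
    should_not_contain: Optional[List[str]] = None

def score(content: str, ground: GroundStub) -> float:
    if ground.should_contain:
        for word in ground.should_contain:
            if word not in content:
                return 0.0  # a required word is missing
    if ground.should_not_contain:
        for word in ground.should_not_contain:
            if word in content:
                return 0.0  # a forbidden word is present
    return 1.0

g = GroundStub(should_contain=["Washington"], should_not_contain=["New York"])
print(score("Washington is the capital", g))      # 1.0
print(score("Washington, but also New York", g))  # 0.0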
@@ -4,6 +4,12 @@ import json
 import os


+class Info(BaseModel):
+    difficulty: str
+    description: str
+    side_effects: List[str]
+
+
 class Ground(BaseModel):
     answer: str
     should_contain: Optional[List[str]]
@@ -11,20 +17,20 @@ class Ground(BaseModel):
     files: List[str]


-class Challenge(BaseModel):
-    category: str
+class ChallengeData(BaseModel):
+    category: List[str]
     task: str
     ground: Ground
-    difficulty: str
     mock_func: Optional[str] = None
+    info: Info

     def serialize(self, path: str) -> None:
         with open(path, "w") as file:
             file.write(self.json())

     @staticmethod
-    def deserialize(path: str) -> "Challenge":
+    def deserialize(path: str) -> "ChallengeData":
+        print("Deserializing", path)
         with open(path, "r") as file:
             data = json.load(file)
-        return Challenge(**data)
+        return ChallengeData(**data)
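A quick round-trip sketch of the renamed model, assuming the package is importable and the working directory is the repo root; the output path is hypothetical:

# Illustrative only: load a challenge definition added in this commit, write a copy back out.
from agbenchmark.challenges.define_task_types import ChallengeData

data = ChallengeData.deserialize(
    "agbenchmark/tests/basic_abilities/read_file/r_file_data.json"
)
print(data.category)         # ['basic']
print(data.info.difficulty)  # 'basic'
data.serialize("/tmp/challenge_copy.json")  # hypothetical output path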
@@ -1,27 +1,7 @@
 from agbenchmark.Challenge import Challenge
-from agbenchmark.challenges.define_task_types import Ground


 class RetrievalChallenge(Challenge):
     """Challenge for information-retrieval"""

-    def scoring(self, content: str, ground: Ground):
-        if ground.should_contain:
-            for should_contain_word in ground.should_contain:
-                if should_contain_word not in content:
-                    return 0.0
-                else:
-                    print(
-                        f"Word that should exist: {should_contain_word} exists in the content"
-                    )
-
-        if ground.should_not_contain:
-            for should_not_contain_word in ground.should_not_contain:
-                if should_not_contain_word in content:
-                    return 0.0
-                else:
-                    print(
-                        f"Word that should not exist: {should_not_contain_word} does not exist in the content"
-                    )
-
-        return 1.0
+    pass
@@ -1,5 +1,5 @@
 {
-    "category": "retrieval",
+    "category": ["basic"],
     "task": "What is the capital of America?",
     "ground": {
         "answer": "Washington",
@@ -7,6 +7,10 @@
         "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
         "files": ["file_to_check.txt"]
     },
-    "difficulty": "easy",
-    "mock_func": "retrieval_1_mock"
+    "mock_func": "write_file_mock",
+    "info": {
+        "difficulty": "easy",
+        "description": "Tests the writing to file",
+        "side_effects": ["tests if there is in fact an LLM attached"]
+    }
 }
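Because category is now List[str] and difficulty moved inside info, the old JSON shape no longer validates against ChallengeData. A sketch of the failure mode (assuming pydantic v1 behavior, which does not coerce a bare string into a list):

# Illustrative only: the pre-commit scalar "category" shape now fails validation.
from pydantic import ValidationError
from agbenchmark.challenges.define_task_types import ChallengeData

old_shape = {
    "category": "retrieval",  # old shape: plain string
    "task": "What is the capital of America?",
    "ground": {
        "answer": "Washington",
        "should_contain": ["Washington"],
        "should_not_contain": [],
        "files": ["file_to_check.txt"],
    },
    "mock_func": "write_file_mock",
    "info": {"difficulty": "easy", "description": "placeholder", "side_effects": []},
}

try:
    ChallengeData(**old_shape)
except ValidationError as err:
    print(err)  # category: value is not a valid list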
@@ -1,9 +1,11 @@
 import pytest
 from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge
-from agbenchmark.challenges.define_task_types import Challenge, Ground
+from agbenchmark.challenges.define_task_types import ChallengeData, Ground
 import os

-data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json"))
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "r1_data.json")
+)


 class TestRetrieval1(RetrievalChallenge):
@@ -0,0 +1,28 @@
+from agbenchmark.Challenge import Challenge
+from ..basic_gpt_agent import basic_gpt_agent
+
+
+def basic_read_file_mock(task: str, workspace: str):
+    """
+    This mock reads a file and returns its content.
+    """
+
+    Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing")
+
+    file_contents = Challenge.open_file(workspace, "file_to_check.txt")
+
+    Challenge.write_to_file(
+        workspace, "file_to_check.txt", f"random string: {file_contents}"
+    )
+
+
+def basic_write_file_mock(task: str, workspace: str):
+    """
+    This mock writes to a file (creates one if it doesn't exist)
+    """
+
+    # Call the basic_gpt_agent to get a response.
+    response = basic_gpt_agent(task)
+
+    # Open the file in write mode.
+    Challenge.write_to_file(workspace, "file_to_check.txt", response)
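basic_gpt_agent is imported from a sibling module but not shown in this commit; from its call site it takes the task string and returns the agent's text reply. A rough sketch under that assumption (model name and client usage are guesses for the openai<1.0 API of this era):

# Hypothetical reconstruction -- basic_gpt_agent is not part of this diff.
import openai

def basic_gpt_agent(task: str) -> str:
    # Send the challenge task as a single user message and return the reply text.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": task}],
    )
    return completion.choices[0].message.content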
@@ -1,4 +1,3 @@
-from ..basic_gpt_agent import basic_gpt_agent
 from agbenchmark.Challenge import Challenge


@@ -6,8 +5,4 @@ from agbenchmark.Challenge import Challenge
 # Prerequisites here would be writing to a file (basic_abilities test).
 # Should also check if prerequisites exists in regression file
 def retrieval_1_mock(task: str, workspace: str):
-    # Call the basic_gpt_agent to get a response.
-    response = basic_gpt_agent(task)
-
-    # Open the file in write mode.
-    Challenge.write_to_file(workspace, "file_to_check.txt", response)
+    pass
agbenchmark/tests/basic_abilities/read_file/r_file_data.json (new file, +15)
@@ -0,0 +1,15 @@
+{
+    "category": ["basic"],
+    "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
+    "ground": {
+        "answer": "random string: this is how we're doing",
+        "should_contain": ["random string: this is how we're doing"],
+        "files": ["file_to_check.txt"]
+    },
+    "mock_func": "basic_read_file_mock",
+    "info": {
+        "description": "This reads the file quickly",
+        "difficulty": "basic",
+        "side_effects": [""]
+    }
+}
@@ -0,0 +1,29 @@
+import pytest
+from agbenchmark.challenges.define_task_types import ChallengeData
+from agbenchmark.Challenge import Challenge
+import os
+
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "r_file_data.json")
+)
+
+
+class TestReadFile(Challenge):
+    """Testing if LLM can read a file"""
+
+    @pytest.mark.parametrize(
+        "server_response",
+        [(data.task, data.mock_func)],
+        indirect=True,
+    )
+    @pytest.mark.basic
+    def test_retrieval(
+        self, workspace
+    ):  # create_file simply there for the function to depend on the fixture
+        file = self.open_file(workspace, data.ground.files[0])
+
+        score = self.scoring(file, data.ground)
+
+        print("Your score is:", score)
+
+        assert score
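The server_response fixture that receives (data.task, data.mock_func) via indirect parametrization lives in a conftest.py outside this diff, so its exact wiring is unknown. One plausible sketch, with the module path and dispatch purely assumed:

# Hypothetical conftest.py sketch -- the real fixture is not in this commit.
import pytest

@pytest.fixture
def server_response(request, workspace):
    # With indirect=True, request.param is the (task, mock_func) tuple that
    # @pytest.mark.parametrize supplies from the challenge's JSON data.
    task, mock_func = request.param
    # Presumably the fixture resolves mock_func by name and runs it against
    # the workspace so the asserted file exists before the test body executes.
    import agbenchmark.mocks.basic_mocks as mocks  # module path assumed
    getattr(mocks, mock_func)(task, workspace)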
@@ -0,0 +1,16 @@
+{
+    "category": ["basic"],
+    "task": "What is the capital of America?",
+    "ground": {
+        "answer": "Washington",
+        "should_contain": ["Washington"],
+        "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
+        "files": ["file_to_check.txt"]
+    },
+    "mock_func": "basic_write_file_mock",
+    "info": {
+        "difficulty": "easy",
+        "description": "Tests the writing to file",
+        "side_effects": ["tests if there is in fact an LLM attached"]
+    }
+}
@@ -0,0 +1,27 @@
+import pytest
+from agbenchmark.challenges.define_task_types import ChallengeData
+from agbenchmark.Challenge import Challenge
+import os
+
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "w_file_data.json")
+)
+
+
+class TestWriteFile(Challenge):
+    """Testing if LLM can write to a file"""
+
+    @pytest.mark.parametrize(
+        "server_response",
+        [(data.task, data.mock_func)],
+        indirect=True,
+    )
+    @pytest.mark.basic
+    def test_retrieval(self, workspace):
+        file = self.open_file(workspace, data.ground.files[0])
+
+        score = self.scoring(file, data.ground)
+
+        print("Your score is:", score)
+
+        assert score
@@ -28,7 +28,8 @@ testpaths = [
 ]
 markers = [
     "retrieval",
-    "regression"
+    "regression",
+    "basic"
 ]

 [tool.poetry.scripts]
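With the new marker registered alongside retrieval and regression, the basic-ability tests can be selected on their own via the standard pytest marker workflow, e.g. `pytest -m basic` to run only them, or `pytest -m "not basic"` to skip them.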