From 8c44b9eddf7c566d5e39f7e11149772b96e23a5f Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 09:42:36 -0400 Subject: [PATCH] basic challenges, more ChallengeData structure --- agbenchmark/Challenge.py | 22 ++++++++++++++ agbenchmark/challenges/define_task_types.py | 16 ++++++---- agbenchmark/challenges/retrieval/Retrieval.py | 22 +------------- .../challenges/retrieval/r1/r1_data.json | 10 +++++-- .../challenges/retrieval/r1/r1_test.py | 6 ++-- agbenchmark/mocks/tests/basic_mocks.py | 28 ++++++++++++++++++ agbenchmark/mocks/tests/retrieval_mocks.py | 7 +---- .../read_file/r_file_data.json | 15 ++++++++++ .../read_file/read_file_test.py | 29 +++++++++++++++++++ .../tests/basic_abilities/read_file_test.py | 0 .../write_file/w_file_data.json | 16 ++++++++++ .../write_file/write_file_test.py | 27 +++++++++++++++++ .../tests/basic_abilities/write_file_test.py | 0 pyproject.toml | 3 +- 14 files changed, 163 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/read_file/r_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/read_file/read_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/read_file_test.py create mode 100644 agbenchmark/tests/basic_abilities/write_file/w_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/write_file/write_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/write_file_test.py diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 20bf5585..9828a0e9 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,6 @@ import os from typing import Optional +from agbenchmark.challenges.define_task_types import Ground class Challenge: @@ -30,3 +31,24 @@ class Challenge: for filename in os.listdir(workspace) if os.path.isfile(os.path.join(workspace, filename)) ] + + def scoring(self, content: str, ground: Ground): + if ground.should_contain: + for should_contain_word in ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f1a841b5..879a46af 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,12 @@ import json import os +class Info(BaseModel): + difficulty: str + description: str + side_effects: List[str] + + class Ground(BaseModel): answer: str should_contain: Optional[List[str]] @@ -11,20 +17,20 @@ class Ground(BaseModel): files: List[str] -class Challenge(BaseModel): - category: str +class ChallengeData(BaseModel): + category: List[str] task: str ground: Ground - difficulty: str mock_func: Optional[str] = None + info: Info def serialize(self, path: str) -> None: with open(path, "w") as file: file.write(self.json()) @staticmethod - def deserialize(path: str) -> "Challenge": + def deserialize(path: str) -> "ChallengeData": print("Deserializing", path) with open(path, "r") as file: data = json.load(file) - return Challenge(**data) + return ChallengeData(**data) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 2db22ae4..9434d69c 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,27 +1,7 @@ from agbenchmark.Challenge import Challenge -from agbenchmark.challenges.define_task_types import Ground class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" - def scoring(self, content: str, ground: Ground): - if ground.should_contain: - for should_contain_word in ground.should_contain: - if should_contain_word not in content: - return 0.0 - else: - print( - f"Word that should exist: {should_contain_word} exists in the content" - ) - - if ground.should_not_contain: - for should_not_contain_word in ground.should_not_contain: - if should_not_contain_word in content: - return 0.0 - else: - print( - f"Word that should not exist: {should_not_contain_word} does not exist in the content" - ) - - return 1.0 + pass diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index c7cc3100..08b74d1b 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,5 @@ { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -7,6 +7,10 @@ "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy", - "mock_func": "retrieval_1_mock" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index e20c9f7b..d37c5e79 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,9 +1,11 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge -from agbenchmark.challenges.define_task_types import Challenge, Ground +from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json")) +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r1_data.json") +) class TestRetrieval1(RetrievalChallenge): diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index e69de29b..eb7b9654 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -0,0 +1,28 @@ +from agbenchmark.Challenge import Challenge +from ..basic_gpt_agent import basic_gpt_agent + + +def basic_read_file_mock(task: str, workspace: str): + """ + This mock reads a file and returns its content. + """ + + Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + + file_contents = Challenge.open_file(workspace, "file_to_check.txt") + + Challenge.write_to_file( + workspace, "file_to_check.txt", f"random string: {file_contents}" + ) + + +def basic_write_file_mock(task: str, workspace: str): + """ + This mock writes to a file (creates one if it doesn't exist) + """ + + # Call the basic_gpt_agent to get a response. + response = basic_gpt_agent(task) + + # Open the file in write mode. + Challenge.write_to_file(workspace, "file_to_check.txt", response) diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py index 23f4bde1..2481de06 100644 --- a/agbenchmark/mocks/tests/retrieval_mocks.py +++ b/agbenchmark/mocks/tests/retrieval_mocks.py @@ -1,4 +1,3 @@ -from ..basic_gpt_agent import basic_gpt_agent from agbenchmark.Challenge import Challenge @@ -6,8 +5,4 @@ from agbenchmark.Challenge import Challenge # Prerequisites here would be writing to a file (basic_abilities test). # Should also check if prerequisites exists in regression file def retrieval_1_mock(task: str, workspace: str): - # Call the basic_gpt_agent to get a response. - response = basic_gpt_agent(task) - - # Open the file in write mode. - Challenge.write_to_file(workspace, "file_to_check.txt", response) + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json new file mode 100644 index 00000000..55319ddf --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -0,0 +1,15 @@ +{ + "category": ["basic"], + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "ground": { + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_read_file_mock", + "info": { + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] + } +} diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py new file mode 100644 index 00000000..610ccdab --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -0,0 +1,29 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") +) + + +class TestReadFile(Challenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval( + self, workspace + ): # create_file simply there for the function to depend on the fixture + file = self.open_file(workspace, data.ground.files[0]) + + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file_test.py deleted file mode 100644 index e69de29b..00000000 diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json new file mode 100644 index 00000000..4aaa1347 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -0,0 +1,16 @@ +{ + "category": ["basic"], + "task": "What is the capital of America?", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py new file mode 100644 index 00000000..ccb10fe7 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -0,0 +1,27 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "w_file_data.json") +) + + +class TestWriteFile(Challenge): + """Testing if LLM can write to a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval(self, workspace): + file = self.open_file(workspace, data.ground.files[0]) + + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file_test.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pyproject.toml b/pyproject.toml index 5498381a..6f79e75c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,8 @@ testpaths = [ ] markers = [ "retrieval", - "regression" + "regression", + "basic" ] [tool.poetry.scripts]