Mirror of https://github.com/aljazceru/Auto-GPT.git, synced 2025-12-26 02:14:27 +01:00
basic challenges, more ChallengeData structure
@@ -1,5 +1,6 @@
 import os
 from typing import Optional
+from agbenchmark.challenges.define_task_types import Ground


 class Challenge:
@@ -30,3 +31,24 @@ class Challenge:
             for filename in os.listdir(workspace)
             if os.path.isfile(os.path.join(workspace, filename))
         ]
+
+    def scoring(self, content: str, ground: Ground):
+        if ground.should_contain:
+            for should_contain_word in ground.should_contain:
+                if should_contain_word not in content:
+                    return 0.0
+                else:
+                    print(
+                        f"Word that should exist: {should_contain_word} exists in the content"
+                    )
+
+        if ground.should_not_contain:
+            for should_not_contain_word in ground.should_not_contain:
+                if should_not_contain_word in content:
+                    return 0.0
+                else:
+                    print(
+                        f"Word that should not exist: {should_not_contain_word} does not exist in the content"
+                    )
+
+        return 1.0
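The new scoring is all-or-nothing: any missing should_contain word, or any present should_not_contain word, short-circuits to 0.0; otherwise the challenge scores 1.0. A minimal, self-contained sketch of those semantics, using a stand-in dataclass instead of the real pydantic Ground (all values invented for illustration):

# Illustrative only: standalone re-statement of the all-or-nothing scoring.
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class GroundStub:
    should_contain: Optional[List[str]] = None
    should_not_contain: Optional[List[str]] = None

def score(content: str, ground: GroundStub) -> float:
    if ground.should_contain:
        for word in ground.should_contain:
            if word not in content:
                return 0.0  # a required word is missing
    if ground.should_not_contain:
        for word in ground.should_not_contain:
            if word in content:
                return 0.0  # a forbidden word is present
    return 1.0

g = GroundStub(should_contain=["Washington"], should_not_contain=["New York"])
print(score("Washington is the capital", g))      # 1.0
print(score("Washington, but also New York", g))  # 0.0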
@@ -4,6 +4,12 @@ import json
 import os


+class Info(BaseModel):
+    difficulty: str
+    description: str
+    side_effects: List[str]
+
+
 class Ground(BaseModel):
     answer: str
     should_contain: Optional[List[str]]
@@ -11,20 +17,20 @@ class Ground(BaseModel):
     files: List[str]


-class Challenge(BaseModel):
-    category: str
+class ChallengeData(BaseModel):
+    category: List[str]
     task: str
     ground: Ground
-    difficulty: str
     mock_func: Optional[str] = None
+    info: Info

     def serialize(self, path: str) -> None:
         with open(path, "w") as file:
             file.write(self.json())

     @staticmethod
-    def deserialize(path: str) -> "Challenge":
+    def deserialize(path: str) -> "ChallengeData":
+        print("Deserializing", path)
         with open(path, "r") as file:
             data = json.load(file)
-        return Challenge(**data)
+        return ChallengeData(**data)
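A quick round-trip sketch of the renamed model, assuming the package is importable and the working directory is the repo root; the output path is hypothetical:

# Illustrative only: load a challenge definition added in this commit, write a copy back out.
from agbenchmark.challenges.define_task_types import ChallengeData

data = ChallengeData.deserialize(
    "agbenchmark/tests/basic_abilities/read_file/r_file_data.json"
)
print(data.category)         # ['basic']
print(data.info.difficulty)  # 'basic'
data.serialize("/tmp/challenge_copy.json")  # hypothetical output path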
@@ -1,27 +1,7 @@
 from agbenchmark.Challenge import Challenge
-from agbenchmark.challenges.define_task_types import Ground


 class RetrievalChallenge(Challenge):
     """Challenge for information-retrieval"""

-    def scoring(self, content: str, ground: Ground):
-        if ground.should_contain:
-            for should_contain_word in ground.should_contain:
-                if should_contain_word not in content:
-                    return 0.0
-                else:
-                    print(
-                        f"Word that should exist: {should_contain_word} exists in the content"
-                    )
-
-        if ground.should_not_contain:
-            for should_not_contain_word in ground.should_not_contain:
-                if should_not_contain_word in content:
-                    return 0.0
-                else:
-                    print(
-                        f"Word that should not exist: {should_not_contain_word} does not exist in the content"
-                    )
-
-        return 1.0
+    pass
@@ -1,5 +1,5 @@
 {
-    "category": "retrieval",
+    "category": ["basic"],
     "task": "What is the capital of America?",
     "ground": {
         "answer": "Washington",
@@ -7,6 +7,10 @@
         "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
         "files": ["file_to_check.txt"]
     },
-    "difficulty": "easy",
-    "mock_func": "retrieval_1_mock"
+    "mock_func": "write_file_mock",
+    "info": {
+        "difficulty": "easy",
+        "description": "Tests the writing to file",
+        "side_effects": ["tests if there is in fact an LLM attached"]
+    }
 }
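Because category is now List[str] and difficulty moved inside info, the old JSON shape no longer validates against ChallengeData. A sketch of the failure mode (assuming pydantic v1 behavior, which does not coerce a bare string into a list):

# Illustrative only: the pre-commit scalar "category" shape now fails validation.
from pydantic import ValidationError
from agbenchmark.challenges.define_task_types import ChallengeData

old_shape = {
    "category": "retrieval",  # old shape: plain string
    "task": "What is the capital of America?",
    "ground": {
        "answer": "Washington",
        "should_contain": ["Washington"],
        "should_not_contain": [],
        "files": ["file_to_check.txt"],
    },
    "mock_func": "write_file_mock",
    "info": {"difficulty": "easy", "description": "placeholder", "side_effects": []},
}

try:
    ChallengeData(**old_shape)
except ValidationError as err:
    print(err)  # category: value is not a valid list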
@@ -1,9 +1,11 @@
 import pytest
 from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge
-from agbenchmark.challenges.define_task_types import Challenge, Ground
+from agbenchmark.challenges.define_task_types import ChallengeData, Ground
 import os

-data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json"))
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "r1_data.json")
+)


 class TestRetrieval1(RetrievalChallenge):
@@ -0,0 +1,28 @@
+from agbenchmark.Challenge import Challenge
+from ..basic_gpt_agent import basic_gpt_agent
+
+
+def basic_read_file_mock(task: str, workspace: str):
+    """
+    This mock reads a file and returns its content.
+    """
+
+    Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing")
+
+    file_contents = Challenge.open_file(workspace, "file_to_check.txt")
+
+    Challenge.write_to_file(
+        workspace, "file_to_check.txt", f"random string: {file_contents}"
+    )
+
+
+def basic_write_file_mock(task: str, workspace: str):
+    """
+    This mock writes to a file (creates one if it doesn't exist)
+    """
+
+    # Call the basic_gpt_agent to get a response.
+    response = basic_gpt_agent(task)
+
+    # Open the file in write mode.
+    Challenge.write_to_file(workspace, "file_to_check.txt", response)
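basic_gpt_agent is imported from a sibling module but not shown in this commit; from its call site it takes the task string and returns the agent's text reply. A rough sketch under that assumption (model name and client usage are guesses for the openai<1.0 API of this era):

# Hypothetical reconstruction -- basic_gpt_agent is not part of this diff.
import openai

def basic_gpt_agent(task: str) -> str:
    # Send the challenge task as a single user message and return the reply text.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": task}],
    )
    return completion.choices[0].message.content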
@@ -1,4 +1,3 @@
-from ..basic_gpt_agent import basic_gpt_agent
 from agbenchmark.Challenge import Challenge


@@ -6,8 +5,4 @@ from agbenchmark.Challenge import Challenge
 # Prerequisites here would be writing to a file (basic_abilities test).
 # Should also check if prerequisites exists in regression file
 def retrieval_1_mock(task: str, workspace: str):
-    # Call the basic_gpt_agent to get a response.
-    response = basic_gpt_agent(task)
-
-    # Open the file in write mode.
-    Challenge.write_to_file(workspace, "file_to_check.txt", response)
+    pass
agbenchmark/tests/basic_abilities/read_file/r_file_data.json (new file, +15)
@@ -0,0 +1,15 @@
+{
+    "category": ["basic"],
+    "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
+    "ground": {
+        "answer": "random string: this is how we're doing",
+        "should_contain": ["random string: this is how we're doing"],
+        "files": ["file_to_check.txt"]
+    },
+    "mock_func": "basic_read_file_mock",
+    "info": {
+        "description": "This reads the file quickly",
+        "difficulty": "basic",
+        "side_effects": [""]
+    }
+}
@@ -0,0 +1,29 @@
+import pytest
+from agbenchmark.challenges.define_task_types import ChallengeData
+from agbenchmark.Challenge import Challenge
+import os
+
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "r_file_data.json")
+)
+
+
+class TestReadFile(Challenge):
+    """Testing if LLM can read a file"""
+
+    @pytest.mark.parametrize(
+        "server_response",
+        [(data.task, data.mock_func)],
+        indirect=True,
+    )
+    @pytest.mark.basic
+    def test_retrieval(
+        self, workspace
+    ):  # create_file simply there for the function to depend on the fixture
+        file = self.open_file(workspace, data.ground.files[0])
+
+        score = self.scoring(file, data.ground)
+
+        print("Your score is:", score)
+
+        assert score
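The server_response fixture that receives (data.task, data.mock_func) via indirect parametrization lives in a conftest.py outside this diff, so its exact wiring is unknown. One plausible sketch, with the module path and dispatch purely assumed:

# Hypothetical conftest.py sketch -- the real fixture is not in this commit.
import pytest

@pytest.fixture
def server_response(request, workspace):
    # With indirect=True, request.param is the (task, mock_func) tuple that
    # @pytest.mark.parametrize supplies from the challenge's JSON data.
    task, mock_func = request.param
    # Presumably the fixture resolves mock_func by name and runs it against
    # the workspace so the asserted file exists before the test body executes.
    import agbenchmark.mocks.basic_mocks as mocks  # module path assumed
    getattr(mocks, mock_func)(task, workspace)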
@@ -0,0 +1,16 @@
+{
+    "category": ["basic"],
+    "task": "What is the capital of America?",
+    "ground": {
+        "answer": "Washington",
+        "should_contain": ["Washington"],
+        "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
+        "files": ["file_to_check.txt"]
+    },
+    "mock_func": "basic_write_file_mock",
+    "info": {
+        "difficulty": "easy",
+        "description": "Tests the writing to file",
+        "side_effects": ["tests if there is in fact an LLM attached"]
+    }
+}
@@ -0,0 +1,27 @@
+import pytest
+from agbenchmark.challenges.define_task_types import ChallengeData
+from agbenchmark.Challenge import Challenge
+import os
+
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "w_file_data.json")
+)
+
+
+class TestWriteFile(Challenge):
+    """Testing if LLM can write to a file"""
+
+    @pytest.mark.parametrize(
+        "server_response",
+        [(data.task, data.mock_func)],
+        indirect=True,
+    )
+    @pytest.mark.basic
+    def test_retrieval(self, workspace):
+        file = self.open_file(workspace, data.ground.files[0])
+
+        score = self.scoring(file, data.ground)
+
+        print("Your score is:", score)
+
+        assert score
@@ -28,7 +28,8 @@ testpaths = [
 ]
 markers = [
     "retrieval",
-    "regression"
+    "regression",
+    "basic"
 ]

 [tool.poetry.scripts]
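With the new marker registered alongside retrieval and regression, the basic-ability tests can be selected on their own via the standard pytest marker workflow, e.g. `pytest -m basic` to run only them, or `pytest -m "not basic"` to skip them.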