Quality of life improvements & fixes (#75)

Author: Silen Naihin
Date: 2023-07-08 21:43:38 -04:00
Committed by: GitHub
Parent: db86ccdcb4
Commit: 69bd41f741
50 changed files with 220 additions and 462 deletions

.gitignore (2 changes)
View File

@@ -1,4 +1,4 @@
agbenchmark/mocks/workspace/
agbenchmark/workspace/
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@@ -53,8 +53,7 @@ import os
class TestWriteFile(BasicChallenge):
"""Testing if LLM can write to a file"""
@pytest.mark.depends(on=[], name="basic_write_file")
def test_method(self, workspace):
def test_method(self, config):
# implement scoring logic by looking at workspace
```
@@ -82,7 +81,7 @@ Add the below to create a file in the workspace prior to running a challenge. On
## Workspace
If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. Otherwise for mini-agi it is at `C:/Users/<name>/miniagi` - it will be automatically set on config
If `--mock` flag is used it is at `agbenchmark/workspace`. Otherwise for mini-agi it is at `C:/Users/<name>/miniagi` - it will be automatically set on config
#### Dataset

View File

@@ -3,37 +3,27 @@ import shutil
import subprocess
import sys
import time
from typing import Any, Dict, Optional
from typing import Any, Dict
from dotenv import load_dotenv
from agbenchmark.mocks.mock_manager import MockManager
load_dotenv()
MOCK_FLAG = os.getenv("MOCK_TEST")
mock_test_str = os.getenv("MOCK_TEST")
MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False
def run_agent(
task: str,
mock_func: Optional[str],
config: Dict[str, Any],
challenge_location: str,
) -> None:
"""Calling to get a response"""
if MOCK_FLAG == "True":
if MOCK_FLAG:
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
if mock_func is None:
print("No mock provided")
return
mock_manager = MockManager(
task, config
) # workspace doesn't need to be passed in, stays the same
print("Server unavailable, using mock", mock_func)
mock_manager.delegate(mock_func)
else:
timeout = config["cutoff"]
print(
@@ -99,6 +89,3 @@ def copy_artifacts_into_workspace(
full_file_name = os.path.join(source_dir, file_name)
if os.path.isfile(full_file_name):
shutil.copy(full_file_name, workspace)
ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
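
A minimal sketch of the `MOCK_TEST` handling this hunk introduces, assuming the same environment variable and python-dotenv setup as above: the raw string is normalised to a boolean once, so the old `MOCK_FLAG == "True"` comparison is no longer needed. The branch bodies below are placeholders, not the real agent invocation.

```python
# Minimal sketch, assuming the MOCK_TEST variable and dotenv usage shown above;
# the branch bodies are placeholders rather than the real agent invocation.
import os

from dotenv import load_dotenv

load_dotenv()

mock_test_str = os.getenv("MOCK_TEST")
MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False

if MOCK_FLAG:
    # real code: copy_artifacts_into_workspace(workspace, "artifacts_out", location)
    print("mock run: copy artifacts_out into the workspace")
else:
    # real code: launch the agent with config["cutoff"] as the timeout
    print("real run: invoke the agent with the configured cutoff")
```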

View File

@@ -4,9 +4,8 @@ import os
import subprocess
import types
from abc import ABC, ABCMeta
from typing import Any, Dict, List, Optional, Tuple, Type, cast
from typing import Any, Dict, List, Tuple, Type, cast
import pytest
from dotenv import load_dotenv
from agbenchmark.challenges.define_task_types import ChallengeData, Ground
@@ -19,7 +18,6 @@ MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
class ChallengeMeta(ABCMeta):
def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:
super().__init__(name, bases, dct)
try:
frame = cast(types.FrameType, inspect.currentframe())
@@ -40,18 +38,13 @@ class Challenge(ABC, metaclass=ChallengeMeta):
@property
def data(self) -> ChallengeData:
file_path = f"{self.CHALLENGE_LOCATION}/data.json"
Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path)
if file_path not in Challenge._data_cache:
Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path)
return Challenge._data_cache[file_path]
@property
def mock(self) -> Optional[str]:
return self.data.mock.mock_func if self.data.mock else None
@property
def task(self) -> str:
return str(
self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task
)
return self.data.task
@property
def dependencies(self) -> list:
@@ -64,17 +57,8 @@ class Challenge(ABC, metaclass=ChallengeMeta):
config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION
)
run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION)
run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION)
@property
def name(self) -> str:
return self.data.name
@pytest.mark.parametrize(
"challenge_data",
[data],
indirect=True,
)
def test_method(self, config: Dict[str, Any]) -> None:
raise NotImplementedError
@@ -151,3 +135,16 @@ class Challenge(ABC, metaclass=ChallengeMeta):
)
return 1.0
def get_scores(self, config: Dict[str, Any]) -> List[float]:
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
return scores
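
The new `get_scores` helper lets every challenge test collapse to the same three lines. A sketch of the resulting test shape, using a hypothetical class name; the real tests later in this diff follow exactly this pattern.

```python
# Sketch of the post-refactor test shape; TestExampleChallenge is hypothetical,
# the real tests added in this commit follow the same three lines.
from typing import Any, Dict

from agbenchmark.challenge import Challenge


class TestExampleChallenge(Challenge):
    """Example of the post-refactor challenge test."""

    def test_method(self, config: Dict[str, Any]) -> None:
        self.setup_challenge(config)      # copies artifacts_in, runs the agent
        scores = self.get_scores(config)  # reads artifacts_out and scores each file
        assert 1 in scores                # at least one ground file must score 1.0
```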

View File

@@ -25,10 +25,9 @@ Example:
```python
{
"name": "basic_write_file",
"category": ["basic"],
"task": "Print the the capital of America to a .txt file",
"dependencies": [],
"dependencies": ["TestWriteFile"], # the class name of the test
"ground": {
"answer": "Washington",
"should_contain": ["Washington"],
@@ -36,10 +35,6 @@ Example:
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_write_file_mock",
"mock_task": "What is the capital of America?"
},
"info": {
"difficulty": "basic",
"description": "Tests the writing to file",

View File

@@ -1,8 +0,0 @@
import pytest
from agbenchmark.challenge import Challenge
@pytest.mark.code
class CodeChallenge(Challenge):
"""Challenge for memory"""

View File

@@ -1,8 +1,7 @@
{
"name": "debug_simple_typo_with_guidance",
"category": ["code"],
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"dependencies": [],
"dependencies": ["TestReadFile", "TestWriteFile"],
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
@@ -10,10 +9,6 @@
"files": ["test.py"],
"type": "execute_python_code"
},
"mock": {
"mock_func": null,
"mock_task": null
},
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",

View File

@@ -1,25 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.code.code import CodeChallenge
from agbenchmark.challenge import Challenge
class TestDebugSimpleTypoWithGuidance(CodeChallenge):
class TestDebugSimpleTypoWithGuidance(Challenge):
"""The first memory challenge"""
@pytest.mark.depends(name="test_debug_simple_typo_with_guidance")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,28 +1,14 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.code.code import CodeChallenge
from agbenchmark.challenge import Challenge
class TestDebugSimpleTypoWithoutGuidance(CodeChallenge):
class TestDebugSimpleTypoWithoutGuidance(Challenge):
"""The first memory challenge"""
@pytest.mark.depends(
name="test_debug_simple_typo_without_guidance",
depends=["test_debug_simple_typo_with_guidance"],
)
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +1,7 @@
{
"name": "debug_simple_typo_without_guidance",
"category": ["code"],
"task": "Make test.py run without errors.",
"dependencies": [],
"dependencies": ["TestDebugSimpleTypoWithGuidance"],
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
@@ -10,12 +9,8 @@
"files": ["test.py"],
"type": "execute_python_code"
},
"mock": {
"mock_func": null,
"mock_task": null
},
"info": {
"difficulty": "basic",
"difficulty": "medium",
"description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
"side_effects": ["tests if there is in fact an LLM attached"]
}

View File

@@ -4,11 +4,6 @@ from typing import List, Optional
from pydantic import BaseModel
class Mock(BaseModel):
mock_func: Optional[str] = None
mock_task: Optional[str] = None
class Info(BaseModel):
difficulty: str
description: str
@@ -24,12 +19,10 @@ class Ground(BaseModel):
class ChallengeData(BaseModel):
name: str
category: List[str]
task: str
dependencies: List[str]
ground: Ground
mock: Optional[Mock] = None
info: Info
def serialize(self, path: str) -> None:
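
With the `Mock` model gone, the schema reduces to the fields visible in this hunk. A consolidated sketch of the resulting models; the `Ground` and `Info` fields not shown here are inferred from the `data.json` files elsewhere in this diff, so treat them as an approximation.

```python
# Consolidated sketch of the schema after the Mock removal; Ground/Info fields
# not visible in this hunk are inferred from the data.json files and may differ.
from typing import List, Optional

from pydantic import BaseModel


class Info(BaseModel):
    difficulty: str
    description: str
    side_effects: List[str]  # assumed from the data.json examples


class Ground(BaseModel):
    answer: str
    should_contain: Optional[List[str]] = None
    should_not_contain: Optional[List[str]] = None
    files: List[str]
    type: str


class ChallengeData(BaseModel):
    name: str
    category: List[str]
    task: str
    dependencies: List[str]
    ground: Ground
    info: Info
```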

View File

@@ -1,17 +1,14 @@
{
"name": "basic_read_file",
"category": ["basic"],
"name": "ReadFile",
"category": ["interface"],
"task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
"dependencies": ["basic_write_file"],
"dependencies": ["TestWriteFile"],
"ground": {
"answer": "random string Hello World!",
"should_contain": ["random string", "Hello World!"],
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_read_file_mock"
},
"info": {
"description": "This reads the file quickly",
"difficulty": "basic",

View File

@@ -0,0 +1,12 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestReadFile(Challenge):
"""Testing if LLM can read a file"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,6 +1,6 @@
{
"name": "basic_write_file",
"category": ["basic"],
"name": "WriteFile",
"category": ["interface"],
"task": "Print the the capital of America to a .txt file",
"dependencies": [],
"ground": {
@@ -10,10 +10,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_write_file_mock",
"mock_task": "What is the capital of America?"
},
"info": {
"difficulty": "basic",
"description": "Tests the writing to file",

View File

@@ -0,0 +1,13 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestWriteFile(Challenge):
"""Testing if LLM can write to a file"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +1,7 @@
{
"name": "basic_memory",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": [],
"dependencies": ["TestReadFile", "TestWriteFile"],
"ground": {
"answer": "2314",
"should_contain": ["2314"],

View File

@@ -1,25 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.memory.memory import MemoryChallenge
from agbenchmark.challenge import Challenge
class TestBasicMemory(MemoryChallenge):
class TestBasicMemory(Challenge):
"""The first memory challenge"""
@pytest.mark.depends(name="test_basic_memory")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +1,7 @@
{
"name": "remember_multiple_ids",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": [],
"dependencies": ["TestBasicMemory"],
"ground": {
"answer": "3145\n3791\n9317\n9471",
"should_contain": ["3145", "3791", "9317", "9471"],
@@ -10,10 +9,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "remember_multiple_ids_mock",
"mock_task": "Follow the instructions in the instructions_1.txt file"
},
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.",

View File

@@ -1,27 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.memory.memory import MemoryChallenge
from agbenchmark.challenge import Challenge
class TestRememberMultipleIds(MemoryChallenge):
class TestRememberMultipleIds(Challenge):
"""The first memory challenge"""
@pytest.mark.depends(
name="test_remember_multiple_ids", depends=["test_basic_memory"]
)
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +1,7 @@
{
"name": "remember_multiple_ids_with_noise_mock",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": [],
"dependencies": ["TestRememberMultipleIds"],
"ground": {
"answer": "3145\n3791\n9317\n9471",
"should_contain": ["3145", "3791", "9317", "9471"],
@@ -10,10 +9,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "remember_multiple_ids_mock",
"mock_task": "Follow the instructions in the instructions_1.txt file"
},
"info": {
"difficulty": "medium",
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",

View File

@@ -1,28 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.memory.memory import MemoryChallenge
from agbenchmark.challenge import Challenge
class TestRememberMultipleIdsWithNoise(MemoryChallenge):
class TestRememberMultipleIdsWithNoise(Challenge):
"""The first memory challenge"""
@pytest.mark.depends(
name="test_remember_multiple_ids_with_noise",
depends=["test_remember_multiple_ids"],
)
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +1,7 @@
{
"name": "remember_multiple_phrases_with_noise_mock",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": [],
"dependencies": ["TestRememberMultipleIdsWithNoise"],
"ground": {
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
"should_contain": [
@@ -15,10 +14,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "remember_multiple_phrases_with_noise_mock",
"mock_task": "Follow the instructions in the instructions_1.txt file"
},
"info": {
"difficulty": "medium",
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",

View File

@@ -1,28 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.memory.memory import MemoryChallenge
from agbenchmark.challenge import Challenge
class TestRememberMultiplePhrasesWithNoise(MemoryChallenge):
class TestRememberMultiplePhrasesWithNoise(Challenge):
"""The first memory challenge"""
@pytest.mark.depends(
name="test_remember_multiple_phrases_with_noise",
depends=["test_remember_multiple_ids_with_noise"],
)
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +0,0 @@
import pytest
from agbenchmark.challenge import Challenge
@pytest.mark.memory
class MemoryChallenge(Challenge):
"""Challenge for memory"""

View File

@@ -1,8 +1,7 @@
{
"name": "basic_information_retrieval",
"category": ["retrieval"],
"task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.",
"dependencies": [],
"dependencies": ["TestWriteFile"],
"ground": {
"answer": "£25.89",
"should_contain": ["25.89"],
@@ -10,10 +9,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_retrieval_mock",
"mock_task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file."
},
"info": {
"difficulty": "basic",
"description": "Tests ability to retrieve information from a website.",

View File

@@ -1,25 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge
from agbenchmark.challenge import Challenge
class TestRetrieval(RetrievalChallenge):
class TestRetrieval(Challenge):
"""The first information-retrieval challenge"""
@pytest.mark.depends(name="test_retrieval")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +1,7 @@
{
"name": "basic_information_retrieval",
"category": ["retrieval"],
"task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
"dependencies": [],
"dependencies": ["TestRetrieval"],
"ground": {
"answer": "81,462",
"should_contain": ["81,462"],
@@ -10,10 +9,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_retrieval_2_mock",
"mock_task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)."
},
"info": {
"difficulty": "basic",
"description": "Tests ability to retrieve information.",

View File

@@ -1,25 +1,13 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge
from agbenchmark.challenge import Challenge
class TestRetrieval2(RetrievalChallenge):
class TestRetrieval2(Challenge):
"""The first information-retrieval challenge"""
@pytest.mark.depends(on=["test_retrieval"], name="test_retrieval_2")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,19 +1,30 @@
{
"name": "basic_information_retrieval",
"category": ["retrieval"],
"task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
"dependencies": [],
"dependencies": ["TestRetrieval2"],
"ground": {
"answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
"should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"],
"should_contain": [
"15",
"112",
"117",
"204",
"413",
"2,014",
"3,198",
"4,046",
"7,000",
"11,759",
"21,461",
"24,578",
"31,536",
"53,823",
"81,462"
],
"should_not_contain": [],
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_retrieval_3_mock",
"mock_task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)."
},
"info": {
"difficulty": "basic",
"description": "Tests ability to retrieve information.",

View File

@@ -1,25 +1,14 @@
from typing import Any, Dict
import pytest
from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge
from agbenchmark.challenge import Challenge
class TestRetrieval3(RetrievalChallenge):
class TestRetrieval3(Challenge):
"""The first information-retrieval challenge"""
@pytest.mark.depends(on=["test_retrieval_2"], name="test_retrieval_3")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,8 +0,0 @@
import pytest
from agbenchmark.challenge import Challenge
@pytest.mark.retrieval
class RetrievalChallenge(Challenge):
"""Challenge for information-retrieval"""

View File

@@ -2,12 +2,16 @@ import json
import os
import shutil
from pathlib import Path # noqa
from typing import Any, Dict, Generator, List
from typing import Any, Dict, Generator
import pytest
from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH
from agbenchmark.tests.regression.RegressionManager import RegressionManager
from agbenchmark.RegressionManager import RegressionManager
from agbenchmark.start_benchmark import (
CONFIG_PATH,
REGRESSION_TESTS_PATH,
get_regression_data,
)
def resolve_workspace(config: Dict[str, Any]) -> str:
@@ -37,7 +41,7 @@ def config(request: Any) -> None:
config = json.load(f)
if request.config.getoption("--mock"):
config["workspace"] = "agbenchmark/mocks/workspace"
config["workspace"] = "agbenchmark/workspace"
elif isinstance(config["workspace"], str):
config["workspace"] = resolve_workspace(config)
else:  # it's an input/output dict
@@ -77,9 +81,22 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
def pytest_addoption(parser: Any) -> None:
parser.addoption("--mock", action="store_true", default=False)
parser.addoption("--improve", action="store_true", default=False)
parser.addoption("--maintain", action="store_true", default=False)
regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
@pytest.fixture(autouse=True)
def check_regression(request: Any) -> None:
test_name = request.node.parent.name
data = get_regression_data()
# Check if the test name exists in the regression tests
if request.config.getoption("--improve") and data.get(test_name, None):
pytest.skip("Skipping test because it's a regression test and --improve is set")
elif request.config.getoption("--maintain") and not data.get(test_name, None):
pytest.skip(
"Skipping test because it's not a regression test and --maintain is set"
)
# this is to get the challenge_data from every test
@@ -88,6 +105,9 @@ def challenge_data(request: Any) -> None:
return request.param
regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
def pytest_runtest_makereport(item: Any, call: Any) -> None:
if call.when == "call":
challenge_data = item.funcargs.get("challenge_data", None)
@@ -109,16 +129,6 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
regression_manager.remove_test(item.nodeid.split("::")[1])
def pytest_collection_modifyitems(items: List[Any]) -> None:
"""Called once all test items are collected. Used
to add regression and depends markers to collected test items."""
for item in items:
# regression add
if item.nodeid.split("::")[1] in regression_manager.tests:
print(regression_manager.tests)
item.add_marker(pytest.mark.regression)
def pytest_sessionfinish() -> None:
"""Called at the end of the session to save regression tests"""
regression_manager.save()
@@ -135,3 +145,29 @@ def pytest_generate_tests(metafunc: Any) -> None:
# Add the parameters to the test function
metafunc.parametrize("challenge_data", [params], indirect=True)
# this is adding the dependency marker and category markers automatically from the json
def pytest_collection_modifyitems(items: Any, config: Any) -> None:
data = get_regression_data()
for item in items:
# Assuming item.cls is your test class
test_class_instance = item.cls()
# Then you can access your properties
name = item.parent.cls.__name__
dependencies = test_class_instance.data.dependencies
# Filter out dependencies that are already in the regression data if it's an improvement run
if config.getoption("--improve"):
dependencies = [dep for dep in dependencies if not data.get(dep, None)]
categories = test_class_instance.data.category
# Add depends marker dynamically
item.add_marker(pytest.mark.depends(on=dependencies, name=name))
# Add category marker dynamically
for category in categories:
item.add_marker(getattr(pytest.mark, category))
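
The new `pytest_collection_modifyitems` hook applies the dependency and category markers that each test previously declared by hand. For one collected test it is roughly equivalent to writing the following; class name and data values are illustrative.

```python
# Roughly what the dynamically added markers amount to for one collected test,
# if written by hand; class name and dependency list are illustrative.
import pytest


@pytest.mark.interface  # one marker per entry in data.category
@pytest.mark.depends(on=["TestWriteFile"], name="TestReadFile")  # data.dependencies
class TestReadFileByHand:
    def test_method(self) -> None:
        ...
```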

View File

@@ -1,28 +0,0 @@
from typing import Any, Dict, Optional
import agbenchmark.mocks.tests.basic_mocks as basic_mocks
class MockManager:
def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None:
self.task = task
self.workspace = config["workspace"]
self.modules = [basic_mocks]
def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None:
if hasattr(self, mock_function_name):
# Check if the mock function is an attribute of this class
getattr(self, mock_function_name)(*args, **kwargs)
elif mock_function_name in globals():
# Check if the function is imported in the file
func = globals()[mock_function_name]
func(self.task, self.workspace, *args, **kwargs)
elif len(self.modules) > 0:
# checks if function is in imported modules
for module in self.modules:
if hasattr(module, mock_function_name):
func = getattr(module, mock_function_name)
func(self.task, self.workspace, *args, **kwargs)
return
else:
raise ValueError(f"No such mock: {mock_function_name}")

View File

@@ -1,12 +0,0 @@
from agbenchmark.challenge import Challenge
def example_mock(task: str, workspace: str) -> None:
"""
This mock writes to a file (creates one if it doesn't exist)
"""
Challenge.write_to_file(
workspace,
"file_to_check.txt",
"This is an example showing how you can use mocks but here you can use artifacts_out folder instead of a mock.",
)

View File

@@ -2,11 +2,11 @@ import json
import os
import sys
from pathlib import Path
from typing import List
from typing import Any
import click
import pytest
from dotenv import load_dotenv, set_key
from dotenv import load_dotenv
load_dotenv()
@@ -26,10 +26,17 @@ def cli() -> None:
@cli.command()
@click.option("--category", default=None, help="Specific category to run")
@click.option("--maintain", is_flag=True, help="Runs only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option("--mock", is_flag=True, help="Run with mock")
def start(category: str, maintain: bool, mock: bool) -> int:
def start(category: str, maintain: bool, improve: bool, mock: bool) -> int:
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
# Check if configuration file exists and is not empty
if maintain and improve:
print(
"Error: You can't use both --maintain and --improve at the same time. Please choose one."
)
return 1
if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
config = {}
@@ -55,7 +62,7 @@ def start(category: str, maintain: bool, mock: bool) -> int:
with open(CONFIG_PATH, "r") as f:
config = json.load(f)
set_key(".env", "MOCK_TEST", "True" if mock else "False")
os.environ["MOCK_TEST"] = "True" if mock else "False"
if not os.path.exists(REGRESSION_TESTS_PATH):
with open(REGRESSION_TESTS_PATH, "a"):
@@ -65,42 +72,31 @@ def start(category: str, maintain: bool, mock: bool) -> int:
for key, value in config.items():
print(f"{key}: {value}")
print("Starting benchmark tests...", category)
tests_to_run = []
pytest_args = ["-vs"]
if category:
pytest_args.extend(["-m", category])
print("Starting benchmark tests ", category)
else:
if maintain:
print("Running all regression tests")
tests_to_run = get_regression_tests()
else:
print("Running all categories")
print("Running all categories")
if maintain:
print("Running only regression tests")
pytest_args.append("--maintain")
elif improve:
print("Running only non-regression tests")
pytest_args.append("--improve")
if mock:
pytest_args.append("--mock")
# Run pytest with the constructed arguments
if not tests_to_run:
tests_to_run = [str(CURRENT_DIRECTORY)]
pytest_args.extend(tests_to_run)
return sys.exit(pytest.main(pytest_args))
def get_regression_tests() -> List[str]:
if not Path(REGRESSION_TESTS_PATH).exists():
with open(REGRESSION_TESTS_PATH, "w") as file:
json.dump({}, file)
def get_regression_data() -> Any:
with open(REGRESSION_TESTS_PATH, "r") as file:
data = json.load(file)
regression_tests = [
str(CURRENT_DIRECTORY / ".." / value["test"]) for key, value in data.items()
]
return regression_tests
return data
if __name__ == "__main__":
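
Putting the CLI changes together, the `start` command now just assembles pytest arguments and delegates; a condensed sketch mirroring the flag handling above, with hard-coded values standing in for the click options.

```python
# Condensed sketch of the reworked start command; hard-coded values stand in
# for the click options, and "agbenchmark" stands in for str(CURRENT_DIRECTORY).
import os
import sys

import pytest

category, maintain, improve, mock = "interface", False, True, True

os.environ["MOCK_TEST"] = "True" if mock else "False"

pytest_args = ["-vs"]
if category:
    pytest_args.extend(["-m", category])
if maintain:
    pytest_args.append("--maintain")  # only regression tests
elif improve:
    pytest_args.append("--improve")   # only non-regression tests
if mock:
    pytest_args.append("--mock")

pytest_args.append("agbenchmark")
sys.exit(pytest.main(pytest_args))
```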

View File

@@ -1,8 +0,0 @@
import pytest
from agbenchmark.challenge import Challenge
@pytest.mark.basic
class BasicChallenge(Challenge):
pass

View File

@@ -1,24 +0,0 @@
from typing import Any, Dict
import pytest
from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge
class TestReadFile(BasicChallenge):
"""Testing if LLM can read a file"""
@pytest.mark.depends(on=["basic_write_file"], name="basic_read_file")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
assert 1 in scores

View File

@@ -1,25 +0,0 @@
from typing import Any, Dict
import pytest
from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge
class TestWriteFile(BasicChallenge):
"""Testing if LLM can write to a file"""
@pytest.mark.depends(name="basic_write_file")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
assert 1 in scores

View File

@@ -1,6 +1,6 @@
{
"workspace": "projects/my-new-project/workspace",
"entry_path": "agent/gpt-engineer/benchmarks.py",
"home_path": "agent/gpt-engineer",
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "benchmarks.py",
"home_path": "agent/mini-agi",
"cutoff": 60
}
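
The new `workspace` value is a `${...}` expression rather than a literal path; `resolve_workspace` in conftest.py expands it, but its body is not part of this diff. A hypothetical sketch of how such an expression could be expanded, for illustration only.

```python
# Hypothetical expansion of a "${...}" workspace value; the real
# resolve_workspace in conftest.py is not shown in this diff.
import os  # noqa: F401  (available to the evaluated expression)
import re
from pathlib import Path  # noqa: F401


def resolve_workspace_sketch(workspace: str) -> str:
    match = re.match(r"^\$\{(.+)\}$", workspace)
    if match:
        # e.g. "${os.path.join(Path.home(), 'miniagi')}" -> /home/<user>/miniagi
        return str(eval(match.group(1)))
    return workspace


print(resolve_workspace_sketch("${os.path.join(Path.home(), 'miniagi')}"))
```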

View File

@@ -37,7 +37,7 @@ testpaths = [
markers = [
"retrieval",
"regression",
"basic",
"interface",
"code",
"memory"
]
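
With `basic` renamed to `interface`, category selection picks up the relocated read/write-file tests; a minimal selection example, equivalent to what the `start` command builds for `--category interface` after this commit.

```python
# Select only the challenges tagged with the renamed "interface" marker.
import pytest

pytest.main(["-vs", "-m", "interface"])
```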

View File

@@ -1,14 +1,4 @@
{
"TestDebugSimpleTypoWithGuidance": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py"
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/code/d2/d2_test.py"
},
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [],
@@ -16,44 +6,54 @@
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [],
"dependencies": [
"TestBasicMemory"
],
"test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py"
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [],
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
},
"TestRetrieval": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval"
],
"test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
},
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
"test": "agbenchmark/challenges/interface/write_file/write_file_test.py"
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"basic_write_file"
"TestWriteFile"
],
"test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py"
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
"test": "agbenchmark/challenges/interface/read_file/read_file_test.py"
}
}
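
conftest.py relies only on a small `RegressionManager` surface (`tests`, `remove_test`, `save`) keyed by entries like the JSON above; the class itself is not in this diff, so the sketch below is an assumed minimal version, not the real implementation.

```python
# Assumed minimal RegressionManager matching the calls visible in conftest.py
# (tests, remove_test, save); the real class is not part of this diff.
import json
from typing import Any, Dict


class RegressionManagerSketch:
    def __init__(self, filename: str) -> None:
        self.filename = filename
        try:
            with open(filename) as f:
                self.tests: Dict[str, Any] = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            self.tests = {}

    def remove_test(self, test_name: str) -> None:
        self.tests.pop(test_name, None)

    def save(self) -> None:
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)
```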