Abandon tests/challenges in favor of benchmark

This commit is contained in:
Reinier van der Leer
2023-10-17 17:05:57 -07:00
parent 9dc059fa3a
commit 03992b3232
28 changed files with 0 additions and 1499 deletions

View File

@@ -1,10 +0,0 @@
If the goal oriented task pipeline fails, it means one of two things:
- you somehow changed the way the system prompt is generated,
- or you broke AutoGPT.
To find out which, run the following command:
```bash
pytest -s tests/integration/goal_oriented
```
If the test is successful, it records new VCR cassettes. Push these to your branch and the pipeline will pass.
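Once the run succeeds, committing the refreshed cassettes looks roughly like this (a minimal sketch; the cassette path is an assumption and depends on where VCR is configured to write in your checkout):
```bash
# Stage and push the newly recorded cassettes.
# Hypothetical path: adjust to wherever VCR writes cassettes in your checkout.
git add tests/
git commit -m "Update VCR cassettes for goal-oriented tests"
git push
```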

View File

@@ -1,34 +0,0 @@
import pytest
from autogpt.file_workspace import FileWorkspace
from tests.challenges.challenge_decorator.challenge_decorator import challenge
from tests.challenges.utils import run_challenge
CYCLE_COUNT = 2
USER_INPUTS = [
"Use the browse_website command to visit http://books.toscrape.com/catalogue/meditations_33/index.html and answer the question 'What is the price of the book?'\nWrite the price of the book to a file named 'browse_website.txt'.'\nUse the task_complete command to complete the task.\nDo not use any other commands."
]
@challenge()
def test_browse_website(
patched_api_requestor: None,
monkeypatch: pytest.MonkeyPatch,
level_to_run: int,
challenge_name: str,
workspace: FileWorkspace,
patched_make_workspace: pytest.fixture,
) -> None:
run_challenge(
challenge_name,
level_to_run,
monkeypatch,
USER_INPUTS[level_to_run - 1],
CYCLE_COUNT,
)
file_path = workspace.get_path("browse_website.txt")
with open(file_path, "r") as file:
content = file.read()
assert "£25.89" in content, f"Expected £25.89, got {content}"

View File

@@ -1,47 +0,0 @@
import pytest
from autogpt.config import Config
from autogpt.file_workspace import FileWorkspace
from tests.challenges.challenge_decorator.challenge_decorator import challenge
from tests.challenges.utils import get_workspace_path, run_challenge
CYCLE_COUNT_PER_LEVEL = [1, 1]
EXPECTED_OUTPUTS_PER_LEVEL = [
{"hello_world.txt": ["Hello World"]},
{"hello_world_1.txt": ["Hello World"], "hello_world_2.txt": ["Hello World"]},
]
USER_INPUTS = [
"Write 'Hello World' into a file named \"hello_world.txt\".",
'Write \'Hello World\' into 2 files named "hello_world_1.txt" and "hello_world_2.txt".',
]
@challenge()
def test_write_file(
config: Config,
patched_api_requestor: None,
monkeypatch: pytest.MonkeyPatch,
level_to_run: int,
challenge_name: str,
workspace: FileWorkspace,
patched_make_workspace: pytest.fixture,
) -> None:
run_challenge(
challenge_name,
level_to_run,
monkeypatch,
USER_INPUTS[level_to_run - 1],
CYCLE_COUNT_PER_LEVEL[level_to_run - 1],
)
expected_outputs = EXPECTED_OUTPUTS_PER_LEVEL[level_to_run - 1]
for file_name, expected_lines in expected_outputs.items():
file_path = get_workspace_path(workspace, file_name)
with open(file_path, "r") as file:
content = file.read()
for expected_line in expected_lines:
assert (
expected_line in content
), f"Expected '{expected_line}' in file {file_name}, but it was not found"

View File

@@ -1,24 +0,0 @@
from typing import Optional
class Challenge:
BEAT_CHALLENGES = False
DEFAULT_CHALLENGE_NAME = "default_challenge_name"
def __init__(
self,
name: str,
category: str,
max_level: int,
is_new_challenge: bool,
max_level_beaten: Optional[int] = None,
level_to_run: Optional[int] = None,
) -> None:
self.name = name
self.category = category
self.max_level_beaten = max_level_beaten
self.max_level = max_level
self.succeeded = False
self.skipped = False
self.level_to_run = level_to_run
self.is_new_challenge = is_new_challenge

View File

@@ -1,84 +0,0 @@
import os
from functools import wraps
from typing import Any, Callable, Optional
import pytest
from tests.challenges.challenge_decorator.challenge import Challenge
from tests.challenges.challenge_decorator.challenge_utils import create_challenge
from tests.challenges.challenge_decorator.score_utils import (
get_scores,
update_new_score,
)
MAX_LEVEL_TO_IMPROVE_ON = (
1 # we will attempt to beat 1 level above the current level for now.
)
CHALLENGE_FAILED_MESSAGE = "Challenges can sometimes fail randomly, please run this test again and if it fails reach out to us on https://discord.gg/autogpt in the 'challenges' channel to let us know the challenge you're struggling with."
def challenge() -> Callable[[Callable[..., Any]], Callable[..., None]]:
def decorator(func: Callable[..., Any]) -> Callable[..., None]:
@pytest.mark.requires_openai_api_key
@pytest.mark.vcr
@wraps(func)
def wrapper(*args: Any, **kwargs: Any) -> None:
run_remaining = MAX_LEVEL_TO_IMPROVE_ON if Challenge.BEAT_CHALLENGES else 1
original_error: Optional[Exception] = None
while run_remaining > 0:
current_score, new_score, new_score_location = get_scores()
level_to_run = (
kwargs["level_to_run"] if "level_to_run" in kwargs else None
)
challenge = create_challenge(
func, current_score, Challenge.BEAT_CHALLENGES, level_to_run
)
if challenge.level_to_run is not None:
kwargs["level_to_run"] = challenge.level_to_run
kwargs["challenge_name"] = challenge.name
try:
func(*args, **kwargs)
challenge.succeeded = True
except AssertionError as err:
original_error = AssertionError(
f"{CHALLENGE_FAILED_MESSAGE}\n{err}"
)
challenge.succeeded = False
except Exception as err:
original_error = err
challenge.succeeded = False
else:
challenge.skipped = True
if os.environ.get("CI") == "true":
new_max_level_beaten = get_new_max_level_beaten(
challenge, Challenge.BEAT_CHALLENGES
)
update_new_score(
new_score_location, new_score, challenge, new_max_level_beaten
)
if challenge.level_to_run is None:
pytest.skip("This test has not been unlocked yet.")
if not challenge.succeeded:
if Challenge.BEAT_CHALLENGES or challenge.is_new_challenge:
pytest.xfail(str(original_error))
if original_error:
raise original_error
run_remaining -= 1
return wrapper
return decorator
def get_new_max_level_beaten(
challenge: Challenge, beat_challenges: bool
) -> Optional[int]:
if challenge.succeeded:
return challenge.level_to_run
if challenge.skipped:
return challenge.max_level_beaten
# Challenge failed
return challenge.max_level_beaten if beat_challenges else None

View File

@@ -1,85 +0,0 @@
import os
from typing import Any, Callable, Dict, Optional, Tuple
from tests.challenges.challenge_decorator.challenge import Challenge
CHALLENGE_PREFIX = "test_"
def create_challenge(
func: Callable[..., Any],
current_score: Dict[str, Any],
is_beat_challenges: bool,
level_to_run: Optional[int] = None,
) -> Challenge:
challenge_category, challenge_name = get_challenge_identifiers(func)
is_new_challenge = challenge_name not in current_score.get(challenge_category, {})
max_level = get_max_level(current_score, challenge_category, challenge_name)
max_level_beaten = get_max_level_beaten(
current_score, challenge_category, challenge_name
)
level_to_run = get_level_to_run(
is_beat_challenges, level_to_run, max_level, max_level_beaten, is_new_challenge
)
return Challenge(
name=challenge_name,
category=challenge_category,
max_level=max_level,
max_level_beaten=max_level_beaten,
level_to_run=level_to_run,
is_new_challenge=is_new_challenge,
)
def get_level_to_run(
is_beat_challenges: bool,
level_to_run: Optional[int],
max_level: int,
max_level_beaten: Optional[int],
is_new_challenge: bool,
) -> Optional[int]:
if is_new_challenge:
return 1
if level_to_run is not None:
if level_to_run > max_level:
raise ValueError(
f"Level to run ({level_to_run}) is greater than max level ({max_level})"
)
return level_to_run
if is_beat_challenges:
if max_level_beaten == max_level:
return None
return 1 if max_level_beaten is None else max_level_beaten + 1
return max_level_beaten
def get_challenge_identifiers(func: Callable[..., Any]) -> Tuple[str, str]:
full_path = os.path.dirname(os.path.abspath(func.__code__.co_filename))
challenge_category = os.path.basename(full_path)
challenge_name = func.__name__.replace(CHALLENGE_PREFIX, "")
return challenge_category, challenge_name
def get_max_level(
current_score: Dict[str, Any],
challenge_category: str,
challenge_name: str,
) -> int:
return (
current_score.get(challenge_category, {})
.get(challenge_name, {})
.get("max_level", 1)
)
def get_max_level_beaten(
current_score: Dict[str, Any],
challenge_category: str,
challenge_name: str,
) -> Optional[int]:
return (
current_score.get(challenge_category, {})
.get(challenge_name, {})
.get("max_level_beaten", None)
)

View File

@@ -1,59 +0,0 @@
import json
import os
from typing import Any, Dict, Optional, Tuple
from tests.challenges.challenge_decorator.challenge import Challenge
CURRENT_SCORE_LOCATION = "../current_score"
NEW_SCORE_LOCATION = "../new_score"
def update_new_score(
filename_new_score: str,
new_score: Dict[str, Any],
challenge: Challenge,
new_max_level_beaten: Optional[int],
) -> None:
write_new_score(new_score, challenge, new_max_level_beaten)
write_new_score_to_file(new_score, filename_new_score)
def write_new_score(
new_score: Dict[str, Any], challenge: Challenge, new_max_level_beaten: Optional[int]
) -> Dict[str, Any]:
new_score.setdefault(challenge.category, {})
new_score[challenge.category][challenge.name] = {
"max_level_beaten": new_max_level_beaten,
"max_level": challenge.max_level,
}
return new_score
def write_new_score_to_file(new_score: Dict[str, Any], filename: str) -> None:
with open(filename, "w") as file:
json.dump(new_score, file, indent=4)
def get_scores() -> Tuple[Dict[str, Any], Dict[str, Any], str]:
filename_current_score, filename_new_score = get_score_locations()
current_score = load_json(filename_current_score)
new_score = load_json(filename_new_score)
return current_score, new_score, filename_new_score
def load_json(filename: str) -> Dict[str, Any]:
if os.path.isfile(filename):
with open(filename, "r") as file:
return json.load(file)
else:
return {}
def get_score_locations() -> Tuple[str, str]:
pid = os.getpid()
project_root = os.path.dirname(os.path.abspath(__file__))
filename_current_score = os.path.join(
project_root, f"{CURRENT_SCORE_LOCATION}.json"
)
filename_new_score = os.path.join(project_root, f"{NEW_SCORE_LOCATION}_{pid}.json")
return filename_current_score, filename_new_score

View File

@@ -1,79 +0,0 @@
from typing import Any, Dict, Generator, Optional
import pytest
from _pytest.config import Config
from _pytest.config.argparsing import Parser
from _pytest.fixtures import FixtureRequest
from pytest_mock import MockerFixture
from autogpt.file_workspace import FileWorkspace
from tests.challenges.challenge_decorator.challenge import Challenge
from tests.vcr import before_record_response
def before_record_response_filter_errors(
response: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
"""In challenges we don't want to record errors (See issue #4461)"""
if response["status"]["code"] >= 400:
return None
return before_record_response(response)
@pytest.fixture(scope="module")
def vcr_config(get_base_vcr_config: Dict[str, Any]) -> Dict[str, Any]:
# this fixture is called by the pytest-recording vcr decorator.
return get_base_vcr_config | {
"before_record_response": before_record_response_filter_errors,
}
def pytest_addoption(parser: Parser) -> None:
parser.addoption(
"--level", action="store", default=None, type=int, help="Specify test level"
)
parser.addoption(
"--beat-challenges",
action="store_true",
help="Spepcifies whether the test suite should attempt to beat challenges",
)
def pytest_configure(config: Config) -> None:
level = config.getoption("--level", default=None)
config.option.level = level
beat_challenges = config.getoption("--beat-challenges", default=False)
config.option.beat_challenges = beat_challenges
@pytest.fixture
def level_to_run(request: FixtureRequest) -> int:
# Used for challenges in the goal-oriented tests.
return request.config.option.level
@pytest.fixture
def challenge_name() -> str:
return Challenge.DEFAULT_CHALLENGE_NAME
@pytest.fixture(autouse=True)
def check_beat_challenges(request: FixtureRequest) -> None:
Challenge.BEAT_CHALLENGES = request.config.getoption("--beat-challenges")
@pytest.fixture
def patched_make_workspace(
mocker: MockerFixture, workspace: FileWorkspace
) -> Generator:
def patched_make_workspace(*args: Any, **kwargs: Any) -> str:
return workspace.root
mocker.patch.object(
FileWorkspace,
"make_workspace",
new=patched_make_workspace,
)
yield

View File

@@ -1,56 +0,0 @@
{
"basic_abilities": {
"browse_website": {
"max_level": 1,
"max_level_beaten": null
},
"write_file": {
"max_level": 2,
"max_level_beaten": 1
}
},
"debug_code": {
"debug_code_challenge_a": {
"max_level": 2,
"max_level_beaten": 1
}
},
"information_retrieval": {
"information_retrieval_challenge_a": {
"max_level": 3,
"max_level_beaten": null
},
"information_retrieval_challenge_b": {
"max_level": 1,
"max_level_beaten": null
},
"information_retrieval_challenge_c": {
"max_level": 3,
"max_level_beaten": null
}
},
"kubernetes": {
"kubernetes_template_challenge_a": {
"max_level": 1,
"max_level_beaten": null
}
},
"memory": {
"memory_challenge_a": {
"max_level": 3,
"max_level_beaten": 3
},
"memory_challenge_b": {
"max_level": 5,
"max_level_beaten": null
},
"memory_challenge_c": {
"max_level": 5,
"max_level_beaten": null
},
"memory_challenge_d": {
"max_level": 5,
"max_level_beaten": null
}
}
}

View File

@@ -1,13 +0,0 @@
# mypy: ignore-errors
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
seen = {}
for i, num in enumerate(nums):
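# Intentional bug: this stray line is the error the debug_code challenge expects the agent to fix.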
typo
complement = target - num
if complement in seen:
return [seen[complement], i]
seen[num] = i
return None

View File

@@ -1,31 +0,0 @@
# mypy: ignore-errors
from code import two_sum
from typing import List
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
result = two_sum(nums, target)
print(result)
assert (
result == expected_result
), f"AssertionError: Expected the output to be {expected_result}"
if __name__ == "__main__":
# test the trivial case with the first two numbers
nums = [2, 7, 11, 15]
target = 9
expected_result = [0, 1]
test_two_sum(nums, target, expected_result)
# test for ability to use zero and the same number twice
nums = [2, 7, 0, 15, 12, 0]
target = 0
expected_result = [2, 5]
test_two_sum(nums, target, expected_result)
# test for first and last index usage and negative numbers
nums = [-6, 7, 11, 4]
target = -2
expected_result = [0, 3]
test_two_sum(nums, target, expected_result)

View File

@@ -1,68 +0,0 @@
from pathlib import Path
import pytest
from pytest_mock import MockerFixture
from autogpt.agents import Agent
from autogpt.commands.execute_code import execute_python_file
from autogpt.file_workspace import FileWorkspace
from tests.challenges.challenge_decorator.challenge_decorator import challenge
from tests.challenges.utils import (
copy_file_into_workspace,
get_workspace_path,
run_challenge,
)
CYCLE_COUNT = 5
EXPECTED_VALUES = ["[0, 1]", "[2, 5]", "[0, 3]"]
DIRECTORY_PATH = Path(__file__).parent / "data"
CODE_FILE_PATH = "code.py"
TEST_FILE_PATH = "test.py"
USER_INPUTS = [
"1- Run test.py using the execute_python_file command.\n2- Read code.py using the read_file command.\n3- Modify code.py using the write_to_file command.\nRepeat step 1, 2 and 3 until test.py runs without errors. Do not modify the test.py file.",
"1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"Make test.py run without errors.",
]
@challenge()
def test_debug_code_challenge_a(
dummy_agent: Agent,
monkeypatch: pytest.MonkeyPatch,
patched_api_requestor: MockerFixture,
level_to_run: int,
challenge_name: str,
workspace: FileWorkspace,
patched_make_workspace: pytest.fixture,
) -> None:
"""
Test whether the agent can debug a simple code snippet.
:param dummy_agent: The agent used to execute the test file.
:param monkeypatch: pytest's monkeypatch utility for modifying builtins.
:param patched_api_requestor: Sends API requests to our API CI pipeline.
:param level_to_run: The level to run.
"""
copy_file_into_workspace(workspace, DIRECTORY_PATH, CODE_FILE_PATH)
copy_file_into_workspace(workspace, DIRECTORY_PATH, TEST_FILE_PATH)
run_challenge(
challenge_name,
level_to_run,
monkeypatch,
USER_INPUTS[level_to_run - 1],
CYCLE_COUNT,
)
output = execute_python_file(
get_workspace_path(workspace, TEST_FILE_PATH),
agent=dummy_agent,
)
assert "error" not in output.lower(), f"Errors found in output: {output}!"
for expected_value in EXPECTED_VALUES:
assert (
expected_value in output
), f"Expected output to contain {expected_value}, but it was not found in {output}!"

View File

@@ -1,50 +0,0 @@
import pytest
from pytest_mock import MockerFixture
from autogpt.file_workspace import FileWorkspace
from tests.challenges.challenge_decorator.challenge_decorator import challenge
from tests.challenges.utils import get_workspace_path, run_challenge
CYCLE_COUNT = 3
EXPECTED_REVENUES = [["81"], ["81"], ["81", "53", "24", "21", "11", "7", "4", "3", "2"]]
OUTPUT_LOCATION = "output.txt"
USER_INPUTS = [
"Write to a file called output.txt containing tesla's revenue in 2022 after searching for 'tesla revenue 2022'.",
"Write to a file called output.txt containing tesla's revenue in 2022.",
"Write to a file called output.txt containing tesla's revenue every year since its creation.",
]
@challenge()
def test_information_retrieval_challenge_a(
monkeypatch: pytest.MonkeyPatch,
patched_api_requestor: MockerFixture,
level_to_run: int,
challenge_name: str,
workspace: FileWorkspace,
patched_make_workspace: pytest.fixture,
) -> None:
"""
Test the challenge_a function in a given agent by mocking user inputs and checking the output file content.
:param monkeypatch: pytest's monkeypatch utility for modifying builtins.
"""
run_challenge(
challenge_name,
level_to_run,
monkeypatch,
USER_INPUTS[level_to_run - 1],
CYCLE_COUNT,
)
file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
with open(file_path, "r") as file:
content = file.read()
expected_revenues = EXPECTED_REVENUES[level_to_run - 1]
for revenue in expected_revenues:
assert (
f"{revenue}." in content or f"{revenue}," in content
), f"Expected the file to contain {revenue}"

View File

@@ -1,52 +0,0 @@
import pytest
from pytest_mock import MockerFixture
from autogpt.file_workspace import FileWorkspace
from tests.challenges.challenge_decorator.challenge_decorator import challenge
from tests.challenges.utils import get_workspace_path, run_challenge
CYCLE_COUNT = 3
OUTPUT_LOCATION = "2010_nobel_prize_winners.txt"
USER_INPUTS = [
"Write to file the winner's name(s), affiliated university, and discovery of the 2010 nobel prize in physics. Write your final answer to 2010_nobel_prize_winners.txt."
]
@challenge()
def test_information_retrieval_challenge_b(
monkeypatch: pytest.MonkeyPatch,
patched_api_requestor: MockerFixture,
level_to_run: int,
challenge_name: str,
workspace: FileWorkspace,
patched_make_workspace: pytest.fixture,
) -> None:
"""
Test the challenge_b function in a given agent by mocking user inputs and checking the output file content.
:param monkeypatch: pytest's monkeypatch utility for modifying builtins.
:param patched_api_requestor: APIRequestor Patch to override the openai.api_requestor module for testing.
:param level_to_run: The level to run.
"""
run_challenge(
challenge_name,
level_to_run,
monkeypatch,
USER_INPUTS[level_to_run - 1],
CYCLE_COUNT,
)
file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
with open(file_path, "r") as file:
content = file.read()
assert "Andre Geim" in content, "Expected the file to contain Andre Geim"
assert (
"Konstantin Novoselov" in content
), "Expected the file to contain Konstantin Novoselov"
assert (
"University of Manchester" in content
), "Expected the file to contain University of Manchester"
assert "graphene" in content, "Expected the file to contain graphene"

View File

@@ -1,47 +0,0 @@
import pytest
from pytest_mock import MockerFixture
from autogpt.file_workspace import FileWorkspace
from tests.challenges.challenge_decorator.challenge_decorator import challenge
from tests.challenges.utils import get_workspace_path, run_challenge
CYCLE_COUNT = 3
COO = [["Luke Lafreniere"], ["Luke Lafreniere"], ["Luke Lafreniere 2017"]]
OUTPUT_LOCATION = "output.txt"
USER_INPUTS = [
"Write to a file called output.txt containing the name and title of the current Chief Operating Officer of Floatplane Media.",
"Write to a file called output.txt containing the name and title of the current Chief Operating Officer of https://www.floatplane.com.",
"Write to a file called output.txt containing the name and title of the current Chief Operating Officer of https://www.floatplane.com and the year it was formed.",
]
@challenge()
def test_information_retrieval_challenge_c(
monkeypatch: pytest.MonkeyPatch,
patched_api_requestor: MockerFixture,
level_to_run: int,
challenge_name: str,
workspace: FileWorkspace,
patched_make_workspace: pytest.fixture,
) -> None:
"""
Test the challenge_c function in a given agent by mocking user inputs and checking the output file content.
:param monkeypatch: pytest's monkeypatch utility for modifying builtins.
"""
run_challenge(
challenge_name,
level_to_run,
monkeypatch,
USER_INPUTS[level_to_run - 1],
CYCLE_COUNT,
)
file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
with open(file_path, "r") as file:
content = file.read()
coo_name = COO[level_to_run - 1]
for chief in coo_name:
assert chief in content, f"Expected the file to contain {chief}"

View File

@@ -1,53 +0,0 @@
from typing import Any, Dict
import pytest
import yaml
from pytest_mock import MockerFixture
from autogpt.file_workspace import FileWorkspace
from tests.challenges.challenge_decorator.challenge_decorator import challenge
from tests.challenges.utils import get_workspace_path, run_challenge
CYCLE_COUNT = 3
OUTPUT_LOCATION = "kube.yaml"
USER_INPUTS = ["Write a simple kubernetes deployment file and save it as a kube.yaml."]
@challenge()
def test_kubernetes_template_challenge_a(
monkeypatch: pytest.MonkeyPatch,
patched_api_requestor: MockerFixture,
level_to_run: int,
challenge_name: str,
workspace: FileWorkspace,
patched_make_workspace: pytest.fixture,
) -> None:
"""
Test the challenge_a function in a given agent by mocking user inputs
and checking the output file content.
Args:
monkeypatch (pytest.MonkeyPatch)
level_to_run (int)
"""
run_challenge(
challenge_name,
level_to_run,
monkeypatch,
USER_INPUTS[level_to_run - 1],
CYCLE_COUNT,
)
file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
with open(file_path, "r") as file:
content_string = file.read()
for word in ["apiVersion", "kind", "metadata", "spec"]:
assert word in content_string, f"Expected the file to contain {word}"
yaml_as_dict: Dict[str, Any] = yaml.safe_load(content_string)
for word in ["Service", "Deployment", "Pod"]:
assert word in yaml_as_dict.get(
"kind", ""
), f"Expected the file to contain {word}"

View File

@@ -1,86 +0,0 @@
import pytest
from pytest_mock import MockerFixture
from autogpt.config import Config
from autogpt.file_workspace import FileWorkspace
from tests.challenges.challenge_decorator.challenge_decorator import challenge
from tests.challenges.utils import get_workspace_path, run_challenge
OUTPUT_LOCATION = "output.txt"
USER_INPUT = "Use the command read_file to read the instructions_1.txt file\nFollow the instructions in the instructions_1.txt file"
@challenge()
def test_memory_challenge_a(
config: Config,
patched_api_requestor: MockerFixture,
monkeypatch: pytest.MonkeyPatch,
level_to_run: int,
challenge_name: str,
workspace: FileWorkspace,
patched_make_workspace: pytest.fixture,
) -> None:
"""
The agent reads a file containing a task_id. Then, it reads a series of other files.
After reading 'n' files, the agent must write the task_id into a new file.
Args:
workspace (Workspace)
patched_api_requestor (MockerFixture)
monkeypatch (pytest.MonkeyPatch)
level_to_run (int)
"""
task_id = "2314"
create_instructions_files(workspace, level_to_run, task_id)
run_challenge(
challenge_name, level_to_run, monkeypatch, USER_INPUT, level_to_run + 2
)
file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
with open(file_path, "r") as file:
content = file.read()
assert task_id in content, f"Expected the file to contain {task_id}"
def create_instructions_files(
workspace: FileWorkspace,
num_files: int,
task_id: str,
base_filename: str = "instructions_",
) -> None:
"""
Creates a series of instructions files for the memory challenge.
Args:
workspace (Workspace)
num_files (int)
task_id (str)
base_filename (str, optional)
"""
for i in range(1, num_files + 1):
content = generate_content(i, task_id, base_filename, num_files)
file_name = f"{base_filename}{i}.txt"
file_path = get_workspace_path(workspace, file_name)
with open(file_path, "w") as file:
file.write(content)
def generate_content(
index: int, task_id: str, base_filename: str, num_files: int
) -> str:
"""
Args:
index: int
task_id: str
base_filename: str
num_files: int
Returns: str
"""
if index == 1:
return (
f"This task_id is {task_id}\nRead the file {base_filename}{index + 1}.txt"
)
if index != num_files:
return f"Read the file {base_filename}{index + 1}.txt"
return "Write the task_id into the file output.txt\nShutdown"

View File

@@ -1,86 +0,0 @@
import pytest
from pytest_mock import MockerFixture
from autogpt.file_workspace import FileWorkspace
from tests.challenges.challenge_decorator.challenge_decorator import challenge
from tests.challenges.utils import generate_noise, get_workspace_path, run_challenge
NOISE = 1000
OUTPUT_LOCATION = "output.txt"
USER_INPUT = "Use the command read_file to read the instructions_1.txt file\nFollow the instructions in the instructions_1.txt file"
@challenge()
def test_memory_challenge_b(
patched_api_requestor: MockerFixture,
monkeypatch: pytest.MonkeyPatch,
level_to_run: int,
challenge_name: str,
workspace: FileWorkspace,
patched_make_workspace: pytest.fixture,
) -> None:
"""
The agent reads a series of files, each containing a task_id and noise. After reading 'n' files,
the agent must write all the task_ids into a new file, filtering out the noise.
Args:
workspace (Workspace)
patched_api_requestor (MockerFixture)
monkeypatch (pytest.MonkeyPatch)
level_to_run (int)
"""
task_ids = [str(i * 1111) for i in range(1, level_to_run + 1)]
create_instructions_files(workspace, level_to_run, task_ids)
run_challenge(
challenge_name, level_to_run, monkeypatch, USER_INPUT, level_to_run + 2
)
file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
with open(file_path, "r") as file:
content = file.read()
for task_id in task_ids:
assert task_id in content, f"Expected the file to contain {task_id}"
def create_instructions_files(
workspace: FileWorkspace,
level: int,
task_ids: list,
base_filename: str = "instructions_",
) -> None:
"""
Creates a series of instructions files for the memory challenge.
Args:
workspace (Workspace)
level (int)
task_ids (list)
base_filename (str, optional)
"""
for i in range(1, level + 1):
content = generate_content(i, task_ids, base_filename, level)
file_name = f"{base_filename}{i}.txt"
file_path = get_workspace_path(workspace, file_name)
with open(file_path, "w") as file:
file.write(content)
def generate_content(index: int, task_ids: list, base_filename: str, level: int) -> str:
"""
Args:
index: int
task_ids: list
base_filename: str
level: int
Returns: str
"""
task_id = task_ids[index - 1]
noise = generate_noise(NOISE)
if index != level:
if level == 1:
return f"{noise}\nThe current task_id is {task_id}.\n{noise}\nWrite all the task_ids into the file output.txt. The file has not been created yet. After that, use the task_complete command."
return f"{noise}\nThe current task_id is {task_id}.\n{noise}\nRead the file {base_filename}{index + 1}.txt using the read_file command."
return f"{noise}\nThis task_id is {task_id}\n{noise}\nWrite all the task_ids into the file output.txt. The file has not been created yet. After that, use the task_complete command.\n"

View File

@@ -1,105 +0,0 @@
import pytest
from pytest_mock import MockerFixture
from autogpt.commands.file_operations import read_file
from autogpt.file_workspace import FileWorkspace
from tests.challenges.challenge_decorator.challenge_decorator import challenge
from tests.challenges.utils import generate_noise, get_workspace_path, run_challenge
NOISE = 1200
OUTPUT_LOCATION = "output.txt"
USER_INPUT = "Use the command read_file to read the instructions_1.txt file\nFollow the instructions in the instructions_1.txt file"
@challenge()
def test_memory_challenge_c(
patched_api_requestor: MockerFixture,
monkeypatch: pytest.MonkeyPatch,
level_to_run: int,
challenge_name: str,
workspace: FileWorkspace,
patched_make_workspace: pytest.fixture,
) -> None:
"""
Instead of reading task IDs from files as in the previous challenges, the agent must now remember
phrases that may have semantically similar meanings, and write those phrases to a file
after seeing several of them.
Args:
workspace (Workspace)
patched_api_requestor (MockerFixture)
monkeypatch (pytest.MonkeyPatch)
level_to_run (int)
"""
silly_phrases = [
"The purple elephant danced on a rainbow while eating a taco",
"The sneaky toaster stole my socks and ran away to Hawaii",
"My pet rock sings better than Beyoncé on Tuesdays",
"The giant hamster rode a unicycle through the crowded mall",
"The talking tree gave me a high-five and then flew away",
"I have a collection of invisible hats that I wear on special occasions",
"The flying spaghetti monster stole my sandwich and left a note saying 'thanks for the snack'",
"My imaginary friend is a dragon who loves to play video games",
"I once saw a cloud shaped like a giant chicken eating a pizza",
"The ninja unicorn disguised itself as a potted plant and infiltrated the office",
]
level_silly_phrases = silly_phrases[:level_to_run]
create_instructions_files(
workspace,
level_to_run,
level_silly_phrases,
)
run_challenge(
challenge_name, level_to_run, monkeypatch, USER_INPUT, level_to_run + 2
)
file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
content = read_file(file_path, agent=workspace)
for phrase in level_silly_phrases:
assert phrase in content, f"Expected the file to contain {phrase}"
def create_instructions_files(
workspace: FileWorkspace,
level: int,
task_ids: list,
base_filename: str = "instructions_",
) -> None:
"""
Creates a series of instructions files for the memory challenge.
Args:
workspace (Workspace)
level (int)
task_ids (list)
base_filename (str, optional)
"""
for i in range(1, level + 1):
content = generate_content(i, task_ids, base_filename, level)
file_name = f"{base_filename}{i}.txt"
file_path = get_workspace_path(workspace, file_name)
with open(file_path, "w") as file:
file.write(content)
def generate_content(
index: int, silly_phrases: list, base_filename: str, level: int
) -> str:
"""
Args:
index: int
silly_phrases: list
base_filename: str
level: int
Returns: str
"""
phrase = silly_phrases[index - 1]
noise = generate_noise(NOISE)
if index != level:
if level == 1:
return f"{noise}\nThe current phrase to remember is '{phrase}'.\n{noise}\nWrite all the phrases into the file output.txt. The file has not been created yet. After that, use the task_complete command."
return f"{noise}\nThe current phrase is '{phrase}'.\n{noise}\nRead the file {base_filename}{index + 1}.txt using the read_file command."
return f"{noise}\nThis phrase is '{phrase}'\n{noise}\nWrite all the phrases into the file output.txt. The file has not been created yet. After that, use the task_complete command.\n"

View File

@@ -1,239 +0,0 @@
import json
from typing import Dict
import pytest
from pytest_mock import MockerFixture
from autogpt.commands.file_operations import read_file
from autogpt.file_workspace import FileWorkspace
from tests.challenges.challenge_decorator.challenge_decorator import challenge
from tests.challenges.utils import get_workspace_path, run_challenge
LEVEL_CURRENTLY_BEATEN = 1
MAX_LEVEL = 5
OUTPUT_LOCATION = "output.txt"
USER_INPUT = "Use the command read_file to read the instructions_1.txt file\nFollow the instructions in the instructions_1.txt file"
@challenge()
def test_memory_challenge_d(
patched_api_requestor: MockerFixture,
monkeypatch: pytest.MonkeyPatch,
level_to_run: int,
challenge_name: str,
workspace: FileWorkspace,
patched_make_workspace: pytest.fixture,
) -> None:
"""
The agent is given a series of events and must remember the respective beliefs of the characters.
Args:
workspace (Workspace)
level_to_run (int)
"""
sally_anne_test_phrases = [
"Sally has a marble (marble A) and she puts it in her basket (basket S), then leaves the room. Anne moves marble A from Sally's basket (basket S) to her own basket (basket A).",
"Sally gives a new marble (marble B) to Bob who is outside with her. Bob goes into the room and places marble B into Anne's basket (basket A). Anne tells Bob to tell Sally that he lost the marble b. Bob leaves the room and speaks to Sally about the marble B. Meanwhile, after Bob left the room, Anne moves marble A into the green box, but tells Charlie to tell Sally that marble A is under the sofa. Charlie leaves the room and speaks to Sally about the marble A as instructed by Anne.",
"Sally gives a new marble (marble C) to Charlie who is outside with her. Charlie enters the room and exchanges marble C with marble B in Anne's basket (basket A). Anne tells Charlie to tell Sally that he put marble C into the red box. Charlie leaves the room and speak to Sally about marble C as instructed by Anne. Meanwhile, after Charlie leaves the room, Bob enters into the room and moves marble A from the green box to under the sofa, but tells Anne to tell Sally that marble A is in the green box. Anne leaves the room and speak to Sally about the marble A as instructed by Bob",
"Sally gives a new marble (marble D) to Anne. Anne gives the marble to Charlie. Charlie enters the room and gives marble D to Bob. Bob tells Charlie to tell Sally that he put marble D under the sofa. Bob put marble D under the sofa Charlie leaves the room and speaks to Sally about marble D. Meanwhile, after Charlie leaves the room, Bob takes marble A from under the sofa and places it in the blue box.",
"Sally gives a new marble (marble E) to Charlie who is outside with her. Charlie enters the room and places marble E in the red box. Anne, who is already in the room, takes marble E from the red box, and hides it under the sofa. Then Anne leaves the room and tells Sally that marble E is in the green box. Meanwhile, after Anne leaves the room, Charlie who re-enters the room takes marble D from under the sofa and places it in his own basket (basket C).",
]
level_sally_anne_test_phrases = sally_anne_test_phrases[:level_to_run]
create_instructions_files(workspace, level_to_run, level_sally_anne_test_phrases)
run_challenge(
challenge_name, level_to_run, monkeypatch, USER_INPUT, level_to_run + 2
)
file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
content = read_file(file_path, workspace)
check_beliefs(content, level_to_run)
def check_beliefs(content: str, level: int) -> None:
# Define the expected beliefs for each level
expected_beliefs = {
1: {
"Sally": {
"marble A": "basket S",
},
"Anne": {
"marble A": "basket A",
},
},
2: {
"Sally": {
"marble A": "sofa", # Because Charlie told her
"marble B": "lost", # Because Bob told her
},
"Anne": {
"marble A": "green box", # Because she moved it there
"marble B": "basket A", # Because Bob put it there and she was in the room
},
"Bob": {
"marble B": "basket A", # Last place he put it
},
"Charlie": {
"marble A": "sofa", # Because Anne told him to tell Sally so
},
},
3: {
"Sally": {
"marble A": "green box", # Because Anne told her
"marble C": "red box", # Because Charlie told her
},
"Anne": {
"marble A": "sofa", # Because Bob moved it there and told her
"marble B": "basket A", # Because Charlie exchanged marble C with marble B in her basket
"marble C": "basket A", # Because Charlie exchanged marble C with marble B in her basket
},
"Bob": {
"marble A": "sofa", # Because he moved it there
"marble B": "basket A",
# Because Charlie exchanged marble C with marble B in Anne's basket, and he was in the room
"marble C": "basket A",
# Because Charlie exchanged marble C with marble B in Anne's basket, and he was in the room
},
"Charlie": {
"marble A": "sofa", # Last place he knew it was
"marble B": "basket A", # Because he exchanged marble C with marble B in Anne's basket
"marble C": "red box", # Because Anne told him to tell Sally so
},
},
4: {
"Sally": {
"marble A": "green box", # Because Anne told her in the last conversation
"marble C": "red box", # Because Charlie told her
"marble D": "sofa", # Because Charlie told her
},
"Anne": {
"marble A": "blue box", # Because Bob moved it there, and she was not in the room to see
"marble B": "basket A", # Last place she knew it was
"marble C": "basket A", # Last place she knew it was
"marble D": "sofa", # Because Bob moved it there, and she was in the room to see
},
"Bob": {
"marble A": "blue box", # Because he moved it there
"marble B": "basket A", # Last place he knew it was
"marble C": "basket A", # Last place he knew it was
"marble D": "sofa", # Because he moved it there
},
"Charlie": {
"marble A": "sofa", # Last place he knew it was
"marble B": "basket A", # Last place he knew it was
"marble C": "red box", # Last place he knew it was
"marble D": "sofa", # Because Bob told him to tell Sally so
},
},
5: {
"Sally": {
"marble A": "green box", # Because Anne told her in the last level
"marble C": "red box", # Because Charlie told her
"marble D": "sofa", # Because Charlie told her
"marble E": "green box", # Because Anne told her
},
"Anne": {
"marble A": "blue box", # Last place she knew it was
"marble B": "basket A", # Last place she knew it was
"marble C": "basket A", # Last place she knew it was
"marble D": "basket C", # Last place she knew it was
"marble E": "sofa", # Because she moved it there
},
"Charlie": {
"marble A": "blue box", # Last place he knew it was
"marble B": "basket A", # Last place he knew it was
"marble C": "basket A", # Last place he knew it was
"marble D": "basket C", # Because he moved it there
"marble E": "red box", # Last place he knew it was
},
"Bob": {
"marble A": "blue box", # Last place he knew it was
"marble C": "red box", # Last place he knew it was
"marble D": "sofa", # Last place he knew it was
},
},
}
# Extract the beliefs from the AI's response
ai_beliefs = extract_beliefs(content)
# Check the AI's beliefs against the expected beliefs
for character, belief in expected_beliefs[level].items():
for marble, location in belief.items():
ai_belief = ai_beliefs.get(character, {}).get(marble, "")
assert (
location in ai_belief
), f"For {character}'s {marble}, expected '{location}' to be in '{ai_belief}'"
def extract_beliefs(content: str) -> Dict[str, Dict[str, str]]:
"""Extract the beliefs of each character from the AI's output."""
# Parse the JSON content
content_dict = json.loads(content)
beliefs = content_dict.get("beliefs", {})
return beliefs
def create_instructions_files(
workspace: FileWorkspace,
level: int,
test_phrases: list,
base_filename: str = "instructions_",
) -> None:
"""
Creates a series of instructions files for the memory challenge.
Args:
level:
workspace (Workspace)
test_phrases (list)
base_filename (str, optional)
"""
for i in range(1, level + 1):
content = generate_content(i, test_phrases, base_filename, level)
file_name = f"{base_filename}{i}.txt"
file_path = get_workspace_path(workspace, file_name)
with open(file_path, "w") as file:
file.write(content)
def generate_content(
index: int, test_phrases: list, base_filename: str, level: int
) -> str:
"""
Args:
index: int
test_phrases: list
base_filename: str
level: int
Returns: str
"""
phrase = test_phrases[index - 1]
if index != level:
return f"The current event is '{phrase}'.\nRead the file {base_filename}{index + 1}.txt using the read_file command."
else:
return f"""
This event is '{phrase}'
rules:
1. Everyone inside the room sees and knows the actions of the people inside the room.
2. Everyone outside the room does not see and does not know the actions of the people inside the room.
3. Only write about the marbles that are present in the level.
4. Marble names are marble A, marble B, marble C, marble D, ...
5. Believed_location_of_the_specific_marble describes where the marble is, e.g. drawer, basket S, sofa, ...
6. Do not use expressions like <Anne's basket>; use <basket A> instead.
7. All characters believe what they're being told by other characters.
Instructions:
I) Write the following information in the file output.txt in JSON format:
1. The respective beliefs of the characters (i.e. where every marble is according to each character; each character should state where it believes every marble it is aware of is located)
The format should be as follows:
{{
"beliefs": {{
"<character_name>": {{
"<marble_name>": "<believed_location_of_the_specific_marble>",
...
}},
...
}},
}}
Example of output (only use this to understand and learn how to use the format. Nothing else):
{{"beliefs": {{"Sally": {{"marble A": "basket A"}}, "Bob": {{"marble B": "basket S"}}, "Anne": {{"marble A": "green box"}}, "Charlie": {{"marble B": "sofa"}}}}
II) The file output.txt has not been created yet. You need to create it. After that, use the task_complete command.
"""

View File

@@ -1,7 +0,0 @@
from pydantic import BaseModel
class Task(BaseModel):
"""Jsonifiable representation of a task"""
user_input: str

View File

@@ -1,59 +0,0 @@
import importlib.util
import inspect
import os
from types import ModuleType
from typing import List
# Path to the challenges folder
CHALLENGES_DIR = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "../challenges"
)
def get_python_files(directory: str, exclude_file: str) -> List[str]:
"""Recursively get all python files in a directory and subdirectories."""
python_files: List[str] = []
for root, dirs, files in os.walk(directory):
for file in files:
if (
file.endswith(".py")
and file.startswith("test_")
and file != exclude_file
):
python_files.append(os.path.join(root, file))
return python_files
def load_module_from_file(test_file: str) -> ModuleType:
spec = importlib.util.spec_from_file_location("module.name", test_file)
assert spec is not None, f"Unable to get spec for module in file {test_file}"
module = importlib.util.module_from_spec(spec)
assert (
spec.loader is not None
), f"Unable to get loader for module in file {test_file}"
spec.loader.exec_module(module)
return module
def get_test_functions(module: ModuleType) -> List:
return [
o
for o in inspect.getmembers(module)
if inspect.isfunction(o[1]) and o[0].startswith("test_")
]
def assert_single_test_function(functions_list: List, test_file: str) -> None:
assert len(functions_list) == 1, f"{test_file} should contain only one function"
assert (
functions_list[0][0][5:] == os.path.basename(test_file)[5:-3]
), f"The function in {test_file} should have the same name as the file without 'test_' prefix"
def test_method_name_and_count() -> None:
current_file: str = os.path.basename(__file__)
test_files: List[str] = get_python_files(CHALLENGES_DIR, current_file)
for test_file in test_files:
module = load_module_from_file(test_file)
functions_list = get_test_functions(module)
assert_single_test_function(functions_list, test_file)

View File

@@ -1,81 +0,0 @@
import contextlib
import random
import shutil
from pathlib import Path
from typing import Any, AsyncIterator
import pytest
from agbenchmark_config.benchmarks import run_specific_agent
from autogpt.file_workspace import FileWorkspace
from autogpt.logs import LogCycleHandler
from tests.challenges.schema import Task
def generate_noise(noise_size: int) -> str:
random.seed(42)
return "".join(
random.choices(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
k=noise_size,
)
)
def setup_mock_input(monkeypatch: pytest.MonkeyPatch, cycle_count: int) -> None:
"""
Sets up the mock input for testing.
:param monkeypatch: pytest's monkeypatch utility for modifying builtins.
:param cycle_count: The number of cycles to mock.
"""
input_sequence = ["y"] * (cycle_count) + ["EXIT"]
async def input_generator() -> AsyncIterator[str]:
"""
Creates a generator that yields input strings from the given sequence.
"""
for input in input_sequence:
yield input
gen = input_generator()
monkeypatch.setattr(
"autogpt.app.utils.session.prompt_async", lambda _, **kwargs: anext(gen)
)
def setup_mock_log_cycle_agent_name(
monkeypatch: pytest.MonkeyPatch, challenge_name: str, level_to_run: int
) -> None:
def mock_get_agent_short_name(*args: Any, **kwargs: Any) -> str:
return f"{challenge_name}_level_{level_to_run}"
monkeypatch.setattr(
LogCycleHandler, "get_agent_short_name", mock_get_agent_short_name
)
def get_workspace_path(workspace: FileWorkspace, file_name: str) -> str:
return str(workspace.get_path(file_name))
def copy_file_into_workspace(
workspace: FileWorkspace, directory_path: Path, file_path: str
) -> None:
workspace_code_file_path = get_workspace_path(workspace, file_path)
code_file_path = directory_path / file_path
shutil.copy(code_file_path, workspace_code_file_path)
def run_challenge(
challenge_name: str,
level_to_run: int,
monkeypatch: pytest.MonkeyPatch,
user_input: str,
cycle_count: int,
) -> None:
setup_mock_input(monkeypatch, cycle_count)
setup_mock_log_cycle_agent_name(monkeypatch, challenge_name, level_to_run)
task = Task(user_input=user_input)
with contextlib.suppress(SystemExit):
run_specific_agent(task.user_input)

View File

@@ -1,44 +0,0 @@
import collections
import glob
import json
import os
from typing import Any, Dict
def deep_merge(source: Dict[Any, Any], dest: Dict[Any, Any]) -> Dict[Any, Any]:
for key, value in source.items():
if isinstance(value, Dict):
dest[key] = deep_merge(value, dest.get(key, {}))
else:
dest[key] = value
return dest
def recursive_sort_dict(data: dict) -> dict:
for key, value in data.items():
if isinstance(value, dict):
data[key] = recursive_sort_dict(value)
return collections.OrderedDict(sorted(data.items()))
# setup
cwd = os.getcwd() # get current working directory
new_score_filename_pattern = os.path.join(cwd, "tests/challenges/new_score_*.json")
current_score_filename = os.path.join(cwd, "tests/challenges/current_score.json")
merged_data: Dict[str, Any] = {}
for filename in glob.glob(new_score_filename_pattern):
with open(filename, "r") as f_new:
data = json.load(f_new)
merged_data = deep_merge(
data, merged_data
) # deep merge the new data with the merged data
os.remove(filename) # remove the individual file
sorted_data = recursive_sort_dict(merged_data)
with open(current_score_filename, "w") as f_current:
json_data = json.dumps(sorted_data, indent=4)
f_current.write(json_data + "\n")