Link all challenges to benchmark python hook (#4786)

2025-12-22 08:24:26 +01:00 · 2023-06-24 06:20:58 -07:00
parent 307f6e50ad
commit cfdb24efac
13 changed files with 218 additions and 402 deletions
--- a/benchmarks.py
+++ b/benchmarks.py
@@ -1,9 +1,10 @@
 from autogpt.agent import Agent
 from autogpt.config import AIConfig, Config
+from autogpt.main import COMMAND_CATEGORIES
 from autogpt.memory.vector import get_memory
+from autogpt.models.command_registry import CommandRegistry
 from autogpt.prompts.prompt import DEFAULT_TRIGGERING_PROMPT
 from autogpt.workspace import Workspace
-from tests.integration.agent_factory import get_command_registry


 def run_task(task) -> None:
@@ -39,3 +40,13 @@ def bootstrap_agent(task):
        triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
        workspace_directory=str(workspace_directory_path),
    )
+
+
+def get_command_registry(config):
+    """Return a CommandRegistry loaded with each non-disabled command category."""
+    registry = CommandRegistry()
+    disabled = config.disabled_command_categories
+    enabled = [name for name in COMMAND_CATEGORIES if name not in disabled]
+    for name in enabled:  # filtering is finished before the first import runs
+        registry.import_commands(name)
+    return registry
--- a/tests/challenges/basic_abilities/test_browse_website.py
+++ b/tests/challenges/basic_abilities/test_browse_website.py
@@ -1,25 +1,34 @@
 import pytest

-from autogpt.agent import Agent
+from autogpt.workspace import Workspace
 from tests.challenges.challenge_decorator.challenge_decorator import challenge
-from tests.challenges.utils import run_interaction_loop
+from tests.challenges.utils import run_challenge

 CYCLE_COUNT = 2
+USER_INPUTS = [
+    "Use the browse_website command to visit http://books.toscrape.com/catalogue/meditations_33/index.html and answer the question 'What is the price of the book?'\nWrite the price of the book to a file named 'browse_website.txt'.'\nUse the task_complete command to complete the task.\nDo not use any other commands."
+]


@challenge()
 def test_browse_website(
-    browser_agent: Agent,
    patched_api_requestor: None,
    monkeypatch: pytest.MonkeyPatch,
    level_to_run: int,
    challenge_name: str,
+    workspace: Workspace,
+    patched_make_workspace: pytest.fixture,
 ) -> None:
-    file_path = browser_agent.workspace.get_path("browse_website.txt")
-    run_interaction_loop(
-        monkeypatch, browser_agent, CYCLE_COUNT, challenge_name, level_to_run
+    run_challenge(
+        challenge_name,
+        level_to_run,
+        monkeypatch,
+        USER_INPUTS[level_to_run - 1],
+        CYCLE_COUNT,
    )

-    # content = read_file(file_path, config)
-    content = open(file_path, encoding="utf-8").read()
+    file_path = workspace.get_path("browse_website.txt")
+
+    with open(file_path, "r") as file:
+        content = file.read()
    assert "£25.89" in content, f"Expected £25.89, got {content}"
--- a/tests/challenges/basic_abilities/test_write_file.py
+++ b/tests/challenges/basic_abilities/test_write_file.py
@@ -1,14 +1,8 @@
 import pytest

 from autogpt.workspace import Workspace
-from benchmarks import run_task
 from tests.challenges.challenge_decorator.challenge_decorator import challenge
-from tests.challenges.schema import Task
-from tests.challenges.utils import (
-    get_workspace_path,
-    setup_mock_input,
-    setup_mock_log_cycle_agent_name,
-)
+from tests.challenges.utils import get_workspace_path, run_challenge

 CYCLE_COUNT_PER_LEVEL = [1, 1]
 EXPECTED_OUTPUTS_PER_LEVEL = [
@@ -23,18 +17,20 @@ USER_INPUTS = [

@challenge()
 def test_write_file(
-    workspace: Workspace,
    patched_api_requestor: None,
    monkeypatch: pytest.MonkeyPatch,
    level_to_run: int,
    challenge_name: str,
+    workspace: Workspace,
    patched_make_workspace: pytest.fixture,
 ) -> None:
-    user_input = USER_INPUTS[level_to_run - 1]
-    setup_mock_input(monkeypatch, CYCLE_COUNT_PER_LEVEL[level_to_run - 1])
-    setup_mock_log_cycle_agent_name(monkeypatch, challenge_name, level_to_run)
-    task = Task(user_input=user_input)
-    run_task(task)
+    run_challenge(
+        challenge_name,
+        level_to_run,
+        monkeypatch,
+        USER_INPUTS[level_to_run - 1],
+        CYCLE_COUNT_PER_LEVEL[level_to_run - 1],
+    )

    expected_outputs = EXPECTED_OUTPUTS_PER_LEVEL[level_to_run - 1]

--- a/tests/challenges/debug_code/test_debug_code_challenge_a.py
+++ b/tests/challenges/debug_code/test_debug_code_challenge_a.py
@@ -5,11 +5,12 @@ from pytest_mock import MockerFixture

 from autogpt.agent import Agent
 from autogpt.commands.execute_code import execute_python_file
+from autogpt.workspace import Workspace
 from tests.challenges.challenge_decorator.challenge_decorator import challenge
 from tests.challenges.utils import (
    copy_file_into_workspace,
-    get_workspace_path_from_agent,
-    run_interaction_loop,
+    get_workspace_path,
+    run_challenge,
 )

 CYCLE_COUNT = 5
@@ -17,15 +18,22 @@ EXPECTED_VALUES = ["[0, 1]", "[2, 5]", "[0, 3]"]
 DIRECTORY_PATH = Path(__file__).parent / "data"
 CODE_FILE_PATH = "code.py"
 TEST_FILE_PATH = "test.py"
+USER_INPUTS = [
+    "1- Run test.py using the execute_python_file command.\n2- Read code.py using the read_file command.\n3- Modify code.py using the write_to_file command.\nRepeat step 1, 2 and 3 until test.py runs without errors. Do not modify the test.py file.",
+    "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
+    "Make test.py run without errors.",
+]


@challenge()
 def test_debug_code_challenge_a(
-    debug_code_agents: Agent,
+    dummy_agent: Agent,
    monkeypatch: pytest.MonkeyPatch,
    patched_api_requestor: MockerFixture,
    level_to_run: int,
    challenge_name: str,
+    workspace: Workspace,
+    patched_make_workspace: pytest.fixture,
 ) -> None:
    """
    Test whether the agent can debug a simple code snippet.
@@ -35,18 +43,21 @@ def test_debug_code_challenge_a(
    :patched_api_requestor: Sends api requests to our API CI pipeline
    :level_to_run: The level to run.
    """
-    debug_code_agent = debug_code_agents[level_to_run - 1]

-    copy_file_into_workspace(debug_code_agent, DIRECTORY_PATH, CODE_FILE_PATH)
-    copy_file_into_workspace(debug_code_agent, DIRECTORY_PATH, TEST_FILE_PATH)
+    copy_file_into_workspace(workspace, DIRECTORY_PATH, CODE_FILE_PATH)
+    copy_file_into_workspace(workspace, DIRECTORY_PATH, TEST_FILE_PATH)

-    run_interaction_loop(
-        monkeypatch, debug_code_agent, CYCLE_COUNT, challenge_name, level_to_run
+    run_challenge(
+        challenge_name,
+        level_to_run,
+        monkeypatch,
+        USER_INPUTS[level_to_run - 1],
+        CYCLE_COUNT,
    )

    output = execute_python_file(
-        get_workspace_path_from_agent(debug_code_agent, TEST_FILE_PATH),
-        debug_code_agent,
+        get_workspace_path(workspace, TEST_FILE_PATH),
+        dummy_agent,
    )

    assert "error" not in output.lower(), f"Errors found in output: {output}!"
--- a/tests/challenges/information_retrieval/test_information_retrieval_challenge_a.py
+++ b/tests/challenges/information_retrieval/test_information_retrieval_challenge_a.py
@@ -1,24 +1,30 @@
 import pytest
 from pytest_mock import MockerFixture

-from autogpt.commands.file_operations import read_file
+from autogpt.workspace import Workspace
+from tests.challenges.basic_abilities.test_browse_website import USER_INPUTS
 from tests.challenges.challenge_decorator.challenge_decorator import challenge
-from tests.challenges.utils import get_workspace_path_from_agent, run_interaction_loop
+from tests.challenges.utils import get_workspace_path, run_challenge

 CYCLE_COUNT = 3
 EXPECTED_REVENUES = [["81"], ["81"], ["81", "53", "24", "21", "11", "7", "4", "3", "2"]]
-from autogpt.agent import Agent

 OUTPUT_LOCATION = "output.txt"
+USER_INPUTS = [
+    "Write to a file called output.txt containing tesla's revenue in 2022 after searching for 'tesla revenue 2022'.",
+    "Write to a file called output.txt containing tesla's revenue in 2022.",
+    "Write to a file called output.txt containing tesla's revenue every year since its creation.",
+]


@challenge()
 def test_information_retrieval_challenge_a(
-    information_retrieval_agents: Agent,
    monkeypatch: pytest.MonkeyPatch,
    patched_api_requestor: MockerFixture,
    level_to_run: int,
    challenge_name: str,
+    workspace: Workspace,
+    patched_make_workspace: pytest.fixture,
 ) -> None:
    """
    Test the challenge_a function in a given agent by mocking user inputs and checking the output file content.
@@ -26,19 +32,17 @@ def test_information_retrieval_challenge_a(
    :param get_company_revenue_agent: The agent to test.
    :param monkeypatch: pytest's monkeypatch utility for modifying builtins.
    """
-    information_retrieval_agent = information_retrieval_agents[level_to_run - 1]
-    run_interaction_loop(
-        monkeypatch,
-        information_retrieval_agent,
-        CYCLE_COUNT,
+    run_challenge(
        challenge_name,
        level_to_run,
+        monkeypatch,
+        USER_INPUTS[level_to_run - 1],
+        CYCLE_COUNT,
    )

-    file_path = get_workspace_path_from_agent(
-        information_retrieval_agent, OUTPUT_LOCATION
-    )
-    content = read_file(file_path, information_retrieval_agent)
+    file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
+    with open(file_path, "r") as file:
+        content = file.read()
    expected_revenues = EXPECTED_REVENUES[level_to_run - 1]
    for revenue in expected_revenues:
        assert (
--- a/tests/challenges/information_retrieval/test_information_retrieval_challenge_b.py
+++ b/tests/challenges/information_retrieval/test_information_retrieval_challenge_b.py
@@ -1,24 +1,25 @@
-import contextlib
-
 import pytest
 from pytest_mock import MockerFixture

-from autogpt.agent import Agent
-from autogpt.commands.file_operations import read_file
+from autogpt.workspace import Workspace
 from tests.challenges.challenge_decorator.challenge_decorator import challenge
-from tests.challenges.utils import get_workspace_path_from_agent, run_interaction_loop
+from tests.challenges.utils import get_workspace_path, run_challenge

 CYCLE_COUNT = 3
 OUTPUT_LOCATION = "2010_nobel_prize_winners.txt"
+USER_INPUTS = [
+    "Write to file the winner's name(s), affiliated university, and discovery of the 2010 nobel prize in physics. Write your final answer to 2010_nobel_prize_winners.txt."
+]


@challenge()
 def test_information_retrieval_challenge_b(
-    get_nobel_prize_agent: Agent,
    monkeypatch: pytest.MonkeyPatch,
    patched_api_requestor: MockerFixture,
    level_to_run: int,
    challenge_name: str,
+    workspace: Workspace,
+    patched_make_workspace: pytest.fixture,
 ) -> None:
    """
    Test the challenge_b function in a given agent by mocking user inputs and checking the output file content.
@@ -29,17 +30,18 @@ def test_information_retrieval_challenge_b(
    :param level_to_run: The level to run.
    """

-    with contextlib.suppress(SystemExit):
-        run_interaction_loop(
-            monkeypatch,
-            get_nobel_prize_agent,
-            CYCLE_COUNT,
-            challenge_name,
-            level_to_run,
-        )
-    file_path = get_workspace_path_from_agent(get_nobel_prize_agent, OUTPUT_LOCATION)
+    run_challenge(
+        challenge_name,
+        level_to_run,
+        monkeypatch,
+        USER_INPUTS[level_to_run - 1],
+        CYCLE_COUNT,
+    )

-    content = read_file(file_path, get_nobel_prize_agent)
+    file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
+
+    with open(file_path, "r") as file:
+        content = file.read()
    assert "Andre Geim" in content, "Expected the file to contain Andre Geim"
    assert (
        "Konstantin Novoselov" in content
--- a/tests/challenges/kubernetes/test_kubernetes_template_challenge_a.py
+++ b/tests/challenges/kubernetes/test_kubernetes_template_challenge_a.py
@@ -1,23 +1,26 @@
+from typing import Any, Dict
+
 import pytest
 import yaml
 from pytest_mock import MockerFixture

-from autogpt.agent import Agent
-from autogpt.commands.file_operations import read_file
+from autogpt.workspace import Workspace
 from tests.challenges.challenge_decorator.challenge_decorator import challenge
-from tests.challenges.utils import get_workspace_path_from_agent, run_interaction_loop
+from tests.challenges.utils import get_workspace_path, run_challenge

 CYCLE_COUNT = 3
 OUTPUT_LOCATION = "kube.yaml"
+USER_INPUTS = ["Write a simple kubernetes deployment file and save it as a kube.yaml."]


@challenge()
 def test_kubernetes_template_challenge_a(
-    kubernetes_agent: Agent,
    monkeypatch: pytest.MonkeyPatch,
    patched_api_requestor: MockerFixture,
    level_to_run: int,
    challenge_name: str,
+    workspace: Workspace,
+    patched_make_workspace: pytest.fixture,
 ) -> None:
    """
    Test the challenge_a function in a given agent by mocking user inputs
@@ -28,16 +31,23 @@ def test_kubernetes_template_challenge_a(
        monkeypatch (pytest.MonkeyPatch)
        level_to_run (int)
    """
-    run_interaction_loop(
-        monkeypatch, kubernetes_agent, CYCLE_COUNT, challenge_name, level_to_run
+    run_challenge(
+        challenge_name,
+        level_to_run,
+        monkeypatch,
+        USER_INPUTS[level_to_run - 1],
+        CYCLE_COUNT,
    )

-    file_path = get_workspace_path_from_agent(kubernetes_agent, OUTPUT_LOCATION)
-    content = read_file(file_path, kubernetes_agent)
+    file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
+    with open(file_path, "r") as file:
+        content_string = file.read()

    for word in ["apiVersion", "kind", "metadata", "spec"]:
-        assert word in content, f"Expected the file to contain {word}"
+        assert word in content_string, f"Expected the file to contain {word}"

-    content = yaml.safe_load(content)
+    yaml_as_dict: Dict[str, Any] = yaml.safe_load(content_string)
    for word in ["Service", "Deployment", "Pod"]:
-        assert word in content["kind"], f"Expected the file to contain {word}"
+        assert word in yaml_as_dict.get(
+            "kind", ""
+        ), f"Expected the file to contain {word}"
--- a/tests/challenges/memory/test_memory_challenge_a.py
+++ b/tests/challenges/memory/test_memory_challenge_a.py
@@ -1,49 +1,48 @@
 import pytest
 from pytest_mock import MockerFixture

-from autogpt.agent import Agent
-from autogpt.commands.file_operations import read_file, write_to_file
+from autogpt.workspace import Workspace
 from tests.challenges.challenge_decorator.challenge_decorator import challenge
-from tests.challenges.utils import get_workspace_path_from_agent, run_interaction_loop
+from tests.challenges.utils import get_workspace_path, run_challenge

 OUTPUT_LOCATION = "output.txt"

+USER_INPUT = "Use the command read_file to read the instructions_1.txt file\nFollow the instructions in the instructions_1.txt file"
+

@challenge()
 def test_memory_challenge_a(
-    memory_management_agent: Agent,
    patched_api_requestor: MockerFixture,
    monkeypatch: pytest.MonkeyPatch,
    level_to_run: int,
    challenge_name: str,
+    workspace: Workspace,
+    patched_make_workspace: pytest.fixture,
 ) -> None:
    """
    The agent reads a file containing a task_id. Then, it reads a series of other files.
    After reading 'n' files, the agent must write the task_id into a new file.
    Args:
-        memory_management_agent (Agent)
+        workspace (Workspace)
        patched_api_requestor (MockerFixture)
        monkeypatch (pytest.MonkeyPatch)
        level_to_run (int)
    """
    task_id = "2314"
-    create_instructions_files(memory_management_agent, level_to_run, task_id)
+    create_instructions_files(workspace, level_to_run, task_id)

-    run_interaction_loop(
-        monkeypatch,
-        memory_management_agent,
-        level_to_run + 2,
-        challenge_name,
-        level_to_run,
+    run_challenge(
+        challenge_name, level_to_run, monkeypatch, USER_INPUT, level_to_run + 2
    )

-    file_path = get_workspace_path_from_agent(memory_management_agent, OUTPUT_LOCATION)
-    content = read_file(file_path, memory_management_agent)
+    file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
+    with open(file_path, "r") as file:
+        content = file.read()
    assert task_id in content, f"Expected the file to contain {task_id}"


 def create_instructions_files(
-    memory_management_agent: Agent,
+    workspace: Workspace,
    num_files: int,
    task_id: str,
    base_filename: str = "instructions_",
@@ -51,7 +50,7 @@ def create_instructions_files(
    """
    Creates a series of instructions files for the memory challenge.
    Args:
-        memory_management_agent (Agent)
+        workspace (Workspace)
        num_files (int)
        task_id (str)
        base_filename (str, optional)
@@ -59,8 +58,9 @@ def create_instructions_files(
    for i in range(1, num_files + 1):
        content = generate_content(i, task_id, base_filename, num_files)
        file_name = f"{base_filename}{i}.txt"
-        file_path = get_workspace_path_from_agent(memory_management_agent, file_name)
-        write_to_file(file_path, content, memory_management_agent)
+        file_path = get_workspace_path(workspace, file_name)
+        with open(file_path, "w") as file:
+            file.write(content)


 def generate_content(
--- a/tests/challenges/memory/test_memory_challenge_b.py
+++ b/tests/challenges/memory/test_memory_challenge_b.py
@@ -1,56 +1,50 @@
 import pytest
 from pytest_mock import MockerFixture

-from autogpt.agent import Agent
-from autogpt.commands.file_operations import read_file, write_to_file
+from autogpt.workspace import Workspace
 from tests.challenges.challenge_decorator.challenge_decorator import challenge
-from tests.challenges.utils import (
-    generate_noise,
-    get_workspace_path_from_agent,
-    run_interaction_loop,
-)
+from tests.challenges.utils import generate_noise, get_workspace_path, run_challenge

 NOISE = 1000
 OUTPUT_LOCATION = "output.txt"
+USER_INPUT = "Use the command read_file to read the instructions_1.txt file\nFollow the instructions in the instructions_1.txt file"


@challenge()
 def test_memory_challenge_b(
-    memory_management_agent: Agent,
    patched_api_requestor: MockerFixture,
    monkeypatch: pytest.MonkeyPatch,
    level_to_run: int,
    challenge_name: str,
+    workspace: Workspace,
+    patched_make_workspace: pytest.fixture,
 ) -> None:
    """
    The agent reads a series of files, each containing a task_id and noise. After reading 'n' files,
    the agent must write all the task_ids into a new file, filtering out the noise.

    Args:
-        memory_management_agent (Agent)
+        workspace (Workspace)
        patched_api_requestor (MockerFixture)
        monkeypatch (pytest.MonkeyPatch)
        level_to_run (int)
    """
    task_ids = [str(i * 1111) for i in range(1, level_to_run + 1)]
-    create_instructions_files(memory_management_agent, level_to_run, task_ids)
+    create_instructions_files(workspace, level_to_run, task_ids)

-    run_interaction_loop(
-        monkeypatch,
-        memory_management_agent,
-        level_to_run + 2,
-        challenge_name,
-        level_to_run,
+    run_challenge(
+        challenge_name, level_to_run, monkeypatch, USER_INPUT, level_to_run + 2
    )

-    file_path = get_workspace_path_from_agent(memory_management_agent, OUTPUT_LOCATION)
-    content = read_file(file_path, memory_management_agent)
+    file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
+    with open(file_path, "r") as file:
+        content = file.read()
    for task_id in task_ids:
        assert task_id in content, f"Expected the file to contain {task_id}"


 def create_instructions_files(
-    memory_management_agent: Agent,
+    workspace: Workspace,
    level: int,
    task_ids: list,
    base_filename: str = "instructions_",
@@ -59,7 +53,7 @@ def create_instructions_files(
    Creates a series of instructions files for the memory challenge.
    Args:
        level:
-        memory_management_agent (Agent)
+        workspace (Workspace)
        num_files (int)
        task_ids (list)
        base_filename (str, optional)
@@ -67,9 +61,10 @@ def create_instructions_files(
    for i in range(1, level + 1):
        content = generate_content(i, task_ids, base_filename, level)
        file_name = f"{base_filename}{i}.txt"
-        file_path = get_workspace_path_from_agent(memory_management_agent, file_name)
+        file_path = get_workspace_path(workspace, file_name)

-        write_to_file(file_path, content, memory_management_agent)
+        with open(file_path, "w") as file:
+            file.write(content)


 def generate_content(index: int, task_ids: list, base_filename: str, level: int) -> str:
--- a/tests/challenges/memory/test_memory_challenge_c.py
+++ b/tests/challenges/memory/test_memory_challenge_c.py
@@ -1,26 +1,24 @@
 import pytest
 from pytest_mock import MockerFixture

-from autogpt.agent import Agent
-from autogpt.commands.file_operations import read_file, write_to_file
+from autogpt.commands.file_operations import read_file
+from autogpt.workspace import Workspace
 from tests.challenges.challenge_decorator.challenge_decorator import challenge
-from tests.challenges.utils import (
-    generate_noise,
-    get_workspace_path_from_agent,
-    run_interaction_loop,
-)
+from tests.challenges.utils import generate_noise, get_workspace_path, run_challenge

 NOISE = 1200
 OUTPUT_LOCATION = "output.txt"
+USER_INPUT = "Use the command read_file to read the instructions_1.txt file\nFollow the instructions in the instructions_1.txt file"


@challenge()
 def test_memory_challenge_c(
-    memory_management_agent: Agent,
    patched_api_requestor: MockerFixture,
    monkeypatch: pytest.MonkeyPatch,
    level_to_run: int,
    challenge_name: str,
+    workspace: Workspace,
+    patched_make_workspace: pytest.fixture,
 ) -> None:
    """
    Instead of reading task Ids from files as with the previous challenges, the agent now must remember
@@ -28,7 +26,7 @@ def test_memory_challenge_c(
    after seeing several of them.

    Args:
-        memory_management_agent (Agent)
+        workspace (Workspace)
        patched_api_requestor (MockerFixture)
        monkeypatch (pytest.MonkeyPatch)
        level_to_run (int)
@@ -48,26 +46,23 @@ def test_memory_challenge_c(

    level_silly_phrases = silly_phrases[:level_to_run]
    create_instructions_files(
-        memory_management_agent,
+        workspace,
        level_to_run,
        level_silly_phrases,
    )

-    run_interaction_loop(
-        monkeypatch,
-        memory_management_agent,
-        level_to_run + 2,
-        challenge_name,
-        level_to_run,
+    run_challenge(
+        challenge_name, level_to_run, monkeypatch, USER_INPUT, level_to_run + 2
    )
-    file_path = get_workspace_path_from_agent(memory_management_agent, OUTPUT_LOCATION)
-    content = read_file(file_path, agent=memory_management_agent)
+
+    file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
+    content = read_file(file_path, agent=workspace)
    for phrase in level_silly_phrases:
        assert phrase in content, f"Expected the file to contain {phrase}"


 def create_instructions_files(
-    memory_management_agent: Agent,
+    workspace: Workspace,
    level: int,
    task_ids: list,
    base_filename: str = "instructions_",
@@ -76,7 +71,7 @@ def create_instructions_files(
    Creates a series of instructions files for the memory challenge.
    Args:
        level:
-        memory_management_agent (Agent)
+        workspace (Workspace)
        num_files (int)
        task_ids (list)
        base_filename (str, optional)
@@ -84,8 +79,9 @@ def create_instructions_files(
    for i in range(1, level + 1):
        content = generate_content(i, task_ids, base_filename, level)
        file_name = f"{base_filename}{i}.txt"
-        file_path = get_workspace_path_from_agent(memory_management_agent, file_name)
-        write_to_file(file_path, content, memory_management_agent)
+        file_path = get_workspace_path(workspace, file_name)
+        with open(file_path, "w") as file:
+            file.write(content)


 def generate_content(
--- a/tests/challenges/memory/test_memory_challenge_d.py
+++ b/tests/challenges/memory/test_memory_challenge_d.py
@@ -4,28 +4,30 @@ from typing import Dict
 import pytest
 from pytest_mock import MockerFixture

-from autogpt.agent import Agent
-from autogpt.commands.file_operations import read_file, write_to_file
+from autogpt.commands.file_operations import read_file
+from autogpt.workspace import Workspace
 from tests.challenges.challenge_decorator.challenge_decorator import challenge
-from tests.challenges.utils import get_workspace_path_from_agent, run_interaction_loop
+from tests.challenges.utils import get_workspace_path, run_challenge

 LEVEL_CURRENTLY_BEATEN = 1
 MAX_LEVEL = 5
 OUTPUT_LOCATION = "output.txt"
+USER_INPUT = "Use the command read_file to read the instructions_1.txt file\nFollow the instructions in the instructions_1.txt file"


@challenge()
 def test_memory_challenge_d(
-    memory_management_agent: Agent,
    patched_api_requestor: MockerFixture,
    monkeypatch: pytest.MonkeyPatch,
    level_to_run: int,
    challenge_name: str,
+    workspace: Workspace,
+    patched_make_workspace: pytest.fixture,
 ) -> None:
    """
    The agent is given a series of events and must remember the respective beliefs of the characters.
    Args:
-        memory_management_agent (Agent)
+        workspace (Workspace)
        user_selected_level (int)
    """
    sally_anne_test_phrases = [
@@ -36,19 +38,14 @@ def test_memory_challenge_d(
        "Sally gives a new marble (marble E) to Charlie who is outside with her. Charlie enters the room and places marble E in the red box. Anne, who is already in the room, takes marble E from the red box, and hides it under the sofa. Then Anne leaves the room and tells Sally that marble E is in the green box. Meanwhile, after Anne leaves the room, Charlie who re-enters the room takes marble D from under the sofa and places it in his own basket (basket C).",
    ]
    level_sally_anne_test_phrases = sally_anne_test_phrases[:level_to_run]
-    create_instructions_files(
-        memory_management_agent, level_to_run, level_sally_anne_test_phrases
+    create_instructions_files(workspace, level_to_run, level_sally_anne_test_phrases)
+    run_challenge(
+        challenge_name, level_to_run, monkeypatch, USER_INPUT, level_to_run + 2
    )
-    run_interaction_loop(
-        monkeypatch,
-        memory_management_agent,
-        level_to_run + 2,
-        challenge_name,
-        level_to_run,
-    )
-    file_path = get_workspace_path_from_agent(memory_management_agent, OUTPUT_LOCATION)

-    content = read_file(file_path, memory_management_agent)
+    file_path = get_workspace_path(workspace, OUTPUT_LOCATION)
+
+    content = read_file(file_path, workspace)
    check_beliefs(content, level_to_run)


@@ -176,7 +173,7 @@ def extract_beliefs(content: str) -> Dict[str, Dict[str, str]]:


 def create_instructions_files(
-    memory_management_agent: Agent,
+    workspace: Workspace,
    level: int,
    test_phrases: list,
    base_filename: str = "instructions_",
@@ -185,15 +182,16 @@ def create_instructions_files(
    Creates a series of instructions files for the memory challenge.
    Args:
        level:
-        memory_management_agent (Agent)
+        workspace (Workspace)
        test_phrases (list)
        base_filename (str, optional)
    """
    for i in range(1, level + 1):
        content = generate_content(i, test_phrases, base_filename, level)
        file_name = f"{base_filename}{i}.txt"
-        file_path = get_workspace_path_from_agent(memory_management_agent, file_name)
-        write_to_file(file_path, content, memory_management_agent)
+        file_path = get_workspace_path(workspace, file_name)
+        with open(file_path, "w") as file:
+            file.write(content)


 def generate_content(
--- a/tests/challenges/utils.py
+++ b/tests/challenges/utils.py
@@ -6,9 +6,10 @@ from typing import Any, Generator

 import pytest

-from autogpt.agent import Agent
 from autogpt.log_cycle.log_cycle import LogCycleHandler
 from autogpt.workspace import Workspace
+from benchmarks import run_task
+from tests.challenges.schema import Task


 def generate_noise(noise_size: int) -> str:
@@ -40,20 +41,6 @@ def setup_mock_input(monkeypatch: pytest.MonkeyPatch, cycle_count: int) -> None:
    monkeypatch.setattr("autogpt.utils.session.prompt", lambda _: next(gen))


-def run_interaction_loop(
-    monkeypatch: pytest.MonkeyPatch,
-    agent: Agent,
-    cycle_count: int,
-    challenge_name: str,
-    level_to_run: int,
-) -> None:
-    setup_mock_input(monkeypatch, cycle_count)
-
-    setup_mock_log_cycle_agent_name(monkeypatch, challenge_name, level_to_run)
-    with contextlib.suppress(SystemExit):
-        agent.start_interaction_loop()
-
-
 def setup_mock_log_cycle_agent_name(
    monkeypatch: pytest.MonkeyPatch, challenge_name: str, level_to_run: int
 ) -> None:
@@ -69,13 +56,23 @@ def get_workspace_path(workspace: Workspace, file_name: str) -> str:
    return str(workspace.get_path(file_name))


-def get_workspace_path_from_agent(agent: Agent, file_name: str) -> str:
-    return str(agent.workspace.get_path(file_name))
-
-
 def copy_file_into_workspace(
-    agent: Agent, directory_path: Path, file_path: str
+    workspace: Workspace, directory_path: Path, file_path: str
 ) -> None:
-    workspace_code_file_path = get_workspace_path_from_agent(agent, file_path)
+    workspace_code_file_path = get_workspace_path(workspace, file_path)
    code_file_path = directory_path / file_path
    shutil.copy(code_file_path, workspace_code_file_path)
+
+
+def run_challenge(
+    challenge_name: str,
+    level_to_run: int,
+    monkeypatch: pytest.MonkeyPatch,
+    user_input: str,
+    cycle_count: int,
+) -> None:
+    """Install the input and log-cycle mocks, then run ``user_input`` as a Task."""
+    setup_mock_input(monkeypatch, cycle_count)
+    setup_mock_log_cycle_agent_name(monkeypatch, challenge_name, level_to_run)
+    with contextlib.suppress(SystemExit):  # a SystemExit from the run is ignored
+        run_task(Task(user_input=user_input))
--- a/tests/integration/agent_factory.py
+++ b/tests/integration/agent_factory.py
@@ -2,259 +2,46 @@ import pytest

 from autogpt.agent import Agent
 from autogpt.config import AIConfig, Config
-from autogpt.main import COMMAND_CATEGORIES
-from autogpt.memory.vector import NoMemory, get_memory
+from autogpt.memory.vector import get_memory
 from autogpt.models.command_registry import CommandRegistry
-from autogpt.prompts.prompt import DEFAULT_TRIGGERING_PROMPT
 from autogpt.workspace import Workspace


@pytest.fixture
-def agent_test_config(config: Config):
-    config.set_continuous_mode(False)
-    config.set_temperature(0)
-    config.plain_output = True
-    return config
+def memory_json_file(config: Config):
+    was_memory_backend = config.memory_backend

-
-@pytest.fixture
-def memory_json_file(agent_test_config: Config):
-    was_memory_backend = agent_test_config.memory_backend
-
-    agent_test_config.set_memory_backend("json_file")
-    memory = get_memory(agent_test_config)
+    config.set_memory_backend("json_file")
+    memory = get_memory(config)
    memory.clear()
    yield memory

-    agent_test_config.set_memory_backend(was_memory_backend)
+    config.set_memory_backend(was_memory_backend)


@pytest.fixture
-def browser_agent(agent_test_config, memory_none: NoMemory, workspace: Workspace):
+def dummy_agent(config: Config, memory_json_file, workspace: Workspace):
    command_registry = CommandRegistry()
-    command_registry.import_commands("autogpt.commands.file_operations")
-    command_registry.import_commands("autogpt.commands.web_selenium")
-    command_registry.import_commands("autogpt.app")
-    command_registry.import_commands("autogpt.commands.task_statuses")

    ai_config = AIConfig(
-        ai_name="browse_website-GPT",
-        ai_role="an AI designed to use the browse_website command to visit http://books.toscrape.com/catalogue/meditations_33/index.html, answer the question 'What is the price of the book?' and write the price to a file named \"browse_website.txt\", and use the task_complete command to complete the task.",
+        ai_name="Dummy Agent",
+        ai_role="Dummy Role",
        ai_goals=[
-            "Use the browse_website command to visit http://books.toscrape.com/catalogue/meditations_33/index.html and answer the question 'What is the price of the book?'",
-            'Write the price of the book to a file named "browse_website.txt".',
-            "Use the task_complete command to complete the task.",
-            "Do not use any other commands.",
+            "Dummy Task",
        ],
    )
    ai_config.command_registry = command_registry

-    system_prompt = ai_config.construct_full_prompt(agent_test_config)
-
    agent = Agent(
-        ai_name="",
-        memory=memory_none,
-        command_registry=command_registry,
-        ai_config=ai_config,
-        config=agent_test_config,
-        next_action_count=0,
-        system_prompt=system_prompt,
-        triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
-        workspace_directory=workspace.root,
-    )
-
-    return agent
-
-
-@pytest.fixture
-def memory_management_agent(agent_test_config, memory_json_file, workspace: Workspace):
-    command_registry = get_command_registry(agent_test_config)
-
-    ai_config = AIConfig(
-        ai_name="Follow-Instructions-GPT",
-        ai_role="an AI designed to read the instructions_1.txt file using the read_file method and follow the instructions in the file.",
-        ai_goals=[
-            "Use the command read_file to read the instructions_1.txt file",
-            "Follow the instructions in the instructions_1.txt file",
-        ],
-    )
-    ai_config.command_registry = command_registry
-
-    system_prompt = ai_config.construct_full_prompt(agent_test_config)
-
-    agent = Agent(
-        ai_name="Follow-Instructions-GPT",
+        ai_name="Dummy Agent",
        memory=memory_json_file,
        command_registry=command_registry,
        ai_config=ai_config,
-        config=agent_test_config,
+        config=config,
        next_action_count=0,
-        system_prompt=system_prompt,
-        triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
+        system_prompt="dummy_prompt",
+        triggering_prompt="dummy triggering prompt",
        workspace_directory=workspace.root,
    )

    return agent
-
-
-@pytest.fixture
-def information_retrieval_agents(
-    agent_test_config, memory_json_file, workspace: Workspace
-):
-    agents = []
-    command_registry = get_command_registry(agent_test_config)
-
-    ai_goals = [
-        "Write to a file called output.txt containing tesla's revenue in 2022 after searching for 'tesla revenue 2022'.",
-        "Write to a file called output.txt containing tesla's revenue in 2022.",
-        "Write to a file called output.txt containing tesla's revenue every year since its creation.",
-    ]
-    for ai_goal in ai_goals:
-        ai_config = AIConfig(
-            ai_name="Information Retrieval Agent",
-            ai_role="an autonomous agent that specializes in retrieving information.",
-            ai_goals=[ai_goal],
-        )
-        ai_config.command_registry = command_registry
-        system_prompt = ai_config.construct_full_prompt(agent_test_config)
-        agent_test_config.set_continuous_mode(False)
-        agents.append(
-            Agent(
-                ai_name="Information Retrieval Agent",
-                memory=memory_json_file,
-                command_registry=command_registry,
-                ai_config=ai_config,
-                config=agent_test_config,
-                next_action_count=0,
-                system_prompt=system_prompt,
-                triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
-                workspace_directory=workspace.root,
-            )
-        )
-    return agents
-
-
-@pytest.fixture
-def kubernetes_agent(
-    agent_test_config: Config, memory_json_file: NoMemory, workspace: Workspace
-) -> Agent:
-    command_registry = CommandRegistry()
-    command_registry.import_commands("autogpt.commands.file_operations")
-    command_registry.import_commands("autogpt.app")
-
-    ai_config = AIConfig(
-        ai_name="Kubernetes",
-        ai_role="an autonomous agent that specializes in creating Kubernetes deployment templates.",
-        ai_goals=[
-            "Write a simple kubernetes deployment file and save it as a kube.yaml.",
-            # You should make a simple nginx web server that uses docker and exposes the port 80.
-        ],
-    )
-    ai_config.command_registry = command_registry
-
-    system_prompt = ai_config.construct_full_prompt(agent_test_config)
-    agent_test_config.set_continuous_mode(False)
-    agent = Agent(
-        ai_name="Kubernetes-Demo",
-        memory=memory_json_file,
-        command_registry=command_registry,
-        ai_config=ai_config,
-        config=agent_test_config,
-        next_action_count=0,
-        system_prompt=system_prompt,
-        triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
-        workspace_directory=workspace.root,
-    )
-
-    return agent
-
-
-@pytest.fixture
-def get_nobel_prize_agent(agent_test_config, memory_json_file, workspace: Workspace):
-    command_registry = CommandRegistry()
-    command_registry.import_commands("autogpt.commands.file_operations")
-    command_registry.import_commands("autogpt.app")
-    command_registry.import_commands("autogpt.commands.web_selenium")
-
-    ai_config = AIConfig(
-        ai_name="Get-PhysicsNobelPrize",
-        ai_role="An autonomous agent that specializes in physics history.",
-        ai_goals=[
-            "Write to file the winner's name(s), affiliated university, and discovery of the 2010 nobel prize in physics. Write your final answer to 2010_nobel_prize_winners.txt.",
-        ],
-    )
-    ai_config.command_registry = command_registry
-
-    system_prompt = ai_config.construct_full_prompt(agent_test_config)
-    agent_test_config.set_continuous_mode(False)
-
-    agent = Agent(
-        ai_name="Get-PhysicsNobelPrize",
-        memory=memory_json_file,
-        command_registry=command_registry,
-        ai_config=ai_config,
-        config=agent_test_config,
-        next_action_count=0,
-        system_prompt=system_prompt,
-        triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
-        workspace_directory=workspace.root,
-    )
-
-    return agent
-
-
-@pytest.fixture
-def debug_code_agents(agent_test_config, memory_json_file, workspace: Workspace):
-    agents = []
-    goals = [
-        [
-            "1- Run test.py using the execute_python_file command.",
-            "2- Read code.py using the read_file command.",
-            "3- Modify code.py using the write_to_file command."
-            "Repeat step 1, 2 and 3 until test.py runs without errors. Do not modify the test.py file.",
-        ],
-        [
-            "1- Run test.py.",
-            "2- Read code.py.",
-            "3- Modify code.py."
-            "Repeat step 1, 2 and 3 until test.py runs without errors.",
-        ],
-        ["1- Make test.py run without errors."],
-    ]
-
-    for goal in goals:
-        ai_config = AIConfig(
-            ai_name="Debug Code Agent",
-            ai_role="an autonomous agent that specializes in debugging python code",
-            ai_goals=goal,
-        )
-        command_registry = get_command_registry(agent_test_config)
-        ai_config.command_registry = command_registry
-        system_prompt = ai_config.construct_full_prompt(agent_test_config)
-        agent_test_config.set_continuous_mode(False)
-        agents.append(
-            Agent(
-                ai_name="Debug Code Agent",
-                memory=memory_json_file,
-                command_registry=command_registry,
-                ai_config=ai_config,
-                config=agent_test_config,
-                next_action_count=0,
-                system_prompt=system_prompt,
-                triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
-                workspace_directory=workspace.root,
-            )
-        )
-    return agents
-
-
-def get_command_registry(agent_test_config):
-    command_registry = CommandRegistry()
-    enabled_command_categories = [
-        x
-        for x in COMMAND_CATEGORIES
-        if x not in agent_test_config.disabled_command_categories
-    ]
-    for command_category in enabled_command_categories:
-        command_registry.import_commands(command_category)
-    return command_registry