diff --git a/autogpt/core/runner/cli_web_app/client/__init__.py b/agbenchmark/__init__.py
similarity index 100%
rename from autogpt/core/runner/cli_web_app/client/__init__.py
rename to agbenchmark/__init__.py
diff --git a/agbenchmark/benchmarks.py b/agbenchmark/benchmarks.py
new file mode 100644
index 00000000..b3df8020
--- /dev/null
+++ b/agbenchmark/benchmarks.py
@@ -0,0 +1,63 @@
+import sys
+from pathlib import Path
+from typing import Tuple
+
+from autogpt.agents import Agent
+from autogpt.app.main import run_interaction_loop
+from autogpt.commands import COMMAND_CATEGORIES
+from autogpt.config import AIConfig, Config, ConfigBuilder
+from autogpt.memory.vector import get_memory
+from autogpt.models.command_registry import CommandRegistry
+from autogpt.prompts.prompt import DEFAULT_TRIGGERING_PROMPT
+from autogpt.workspace import Workspace
+
+PROJECT_DIR = Path().resolve()
+
+
+def run_specific_agent(task, continuous_mode=False) -> Tuple[str, int]:
+    agent = bootstrap_agent(task, continuous_mode)
+    run_interaction_loop(agent)
+
+
+def bootstrap_agent(task, continuous_mode) -> Agent:
+    config = ConfigBuilder.build_config_from_env(workdir=PROJECT_DIR)
+    config.debug_mode = True
+    config.continuous_mode = continuous_mode
+    config.temperature = 0
+    config.plain_output = True
+    command_registry = get_command_registry(config)
+    config.memory_backend = "no_memory"
+    config.workspace_path = Workspace.init_workspace_directory(config)
+    config.file_logger_path = Workspace.build_file_logger_path(config.workspace_path)
+    ai_config = AIConfig(
+        ai_name="Auto-GPT",
+        ai_role="a multi-purpose AI assistant.",
+        ai_goals=[task],
+    )
+    ai_config.command_registry = command_registry
+    return Agent(
+        memory=get_memory(config),
+        command_registry=command_registry,
+        ai_config=ai_config,
+        config=config,
+        triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
+    )
+
+
+def get_command_registry(config: Config):
+    command_registry = CommandRegistry()
+    enabled_command_categories = [
+        x for x in COMMAND_CATEGORIES if x not in config.disabled_command_categories
+    ]
+    for command_category in enabled_command_categories:
+        command_registry.import_commands(command_category)
+    return command_registry
+
+
+if __name__ == "__main__":
+    # The first argument is the script name itself, second is the task
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <task>")
+        sys.exit(1)
+    task = sys.argv[1]
+    run_specific_agent(task, continuous_mode=True)
diff --git a/agbenchmark/config.json b/agbenchmark/config.json
new file mode 100644
index 00000000..47785864
--- /dev/null
+++ b/agbenchmark/config.json
@@ -0,0 +1,4 @@
+{
+  "workspace": "auto_gpt_workspace",
+  "entry_path": "agbenchmark.benchmarks"
+}
diff --git a/agbenchmark/regression_tests.json b/agbenchmark/regression_tests.json
new file mode 100644
index 00000000..8d59b1a4
--- /dev/null
+++ b/agbenchmark/regression_tests.json
@@ -0,0 +1,24 @@
+{
+  "TestBasicCodeGeneration": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestWriteFile"
+    ],
+    "data_path": "agbenchmark/challenges/code/d3"
+  },
+  "TestBasicMemory": {
+    "difficulty": "basic",
+    "data_path": "agbenchmark/challenges/memory/m1"
+  },
+  "TestReadFile": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestWriteFile"
+    ],
+    "data_path": "agbenchmark/challenges/interface/read_file"
+  },
+  "TestWriteFile": {
+    "dependencies": [],
+    "data_path": "agbenchmark/challenges/interface/write_file"
+  }
+}
diff --git a/autogpt/agents/agent.py b/autogpt/agents/agent.py
index 93d3de86..563c6823 100644
--- a/autogpt/agents/agent.py
+++ b/autogpt/agents/agent.py
@@ -17,6 +17,7 @@ from autogpt.llm.base import Message
 from autogpt.llm.utils import count_string_tokens
 from autogpt.logs import logger
 from autogpt.logs.log_cycle import (
+    CURRENT_CONTEXT_FILE_NAME,
     FULL_MESSAGE_HISTORY_FILE_NAME,
     NEXT_ACTION_FILE_NAME,
     USER_INPUT_FILE_NAME,
@@ -109,6 +110,13 @@ class Agent(BaseAgent):
             self.history.raw(),
             FULL_MESSAGE_HISTORY_FILE_NAME,
         )
+        self.log_cycle_handler.log_cycle(
+            self.ai_config.ai_name,
+            self.created_at,
+            self.cycle_count,
+            prompt.raw(),
+            CURRENT_CONTEXT_FILE_NAME,
+        )
         return prompt
 
     def execute(
diff --git a/autogpt/agents/base.py b/autogpt/agents/base.py
index c0133ea7..e6b24be1 100644
--- a/autogpt/agents/base.py
+++ b/autogpt/agents/base.py
@@ -105,7 +105,6 @@ class BaseAgent(metaclass=ABCMeta):
 
         prompt: ChatSequence = self.construct_prompt(instruction)
         prompt = self.on_before_think(prompt, instruction)
-
         raw_response = create_chat_completion(
             prompt,
             self.config,
diff --git a/autogpt/app/main.py b/autogpt/app/main.py
index 5abaaac8..d73a511d 100644
--- a/autogpt/app/main.py
+++ b/autogpt/app/main.py
@@ -126,10 +126,12 @@ def run_auto_gpt(
     # TODO: have this directory live outside the repository (e.g. in a user's
     # home directory) and have it come in as a command line argument or part of
     # the env file.
-    Workspace.set_workspace_directory(config, workspace_directory)
+    config.workspace_path = Workspace.init_workspace_directory(
+        config, workspace_directory
+    )
 
     # HACK: doing this here to collect some globals that depend on the workspace.
-    Workspace.set_file_logger_path(config, config.workspace_path)
+    config.file_logger_path = Workspace.build_file_logger_path(config.workspace_path)
 
     config.plugins = scan_plugins(config, config.debug_mode)
     # Create a CommandRegistry instance and scan default folder
diff --git a/autogpt/commands/file_operations.py b/autogpt/commands/file_operations.py
index 939b7dc1..adafe14e 100644
--- a/autogpt/commands/file_operations.py
+++ b/autogpt/commands/file_operations.py
@@ -25,7 +25,7 @@ def text_checksum(text: str) -> str:
 
 
 def operations_from_log(
-    log_path: str,
+    log_path: str | Path,
 ) -> Generator[tuple[Operation, str, str | None], None, None]:
     """Parse the file operations log and return a tuple containing the log entries"""
     try:
@@ -52,7 +52,7 @@ def operations_from_log(
         log.close()
 
 
-def file_operations_state(log_path: str) -> dict[str, str]:
+def file_operations_state(log_path: str | Path) -> dict[str, str]:
     """Iterates over the operations log and returns the expected state.
 
     Parses a log file at config.file_logger_path to construct a dictionary that maps
@@ -274,37 +274,6 @@ def append_to_file(
         return f"Error: {err}"
 
 
-@command(
-    "delete_file",
-    "Deletes a file",
-    {
-        "filename": {
-            "type": "string",
-            "description": "The name of the file to delete",
-            "required": True,
-        }
-    },
-)
-@sanitize_path_arg("filename")
-def delete_file(filename: str, agent: Agent) -> str:
-    """Delete a file
-
-    Args:
-        filename (str): The name of the file to delete
-
-    Returns:
-        str: A message indicating success or failure
-    """
-    if is_duplicate_operation("delete", filename, agent):
-        return "Error: File has already been deleted."
-    try:
-        os.remove(filename)
-        log_operation("delete", filename, agent)
-        return "File deleted successfully."
-    except Exception as err:
-        return f"Error: {err}"
-
-
 @command(
     "list_files",
     "Lists Files in a Directory",
diff --git a/autogpt/config/config.py b/autogpt/config/config.py
index 8fba182c..66f2e871 100644
--- a/autogpt/config/config.py
+++ b/autogpt/config/config.py
@@ -51,7 +51,7 @@ class Config(SystemSettings, arbitrary_types_allowed=True):
     prompt_settings_file: str = PROMPT_SETTINGS_FILE
     workdir: Path = None
     workspace_path: Optional[Path] = None
-    file_logger_path: Optional[str] = None
+    file_logger_path: Optional[Path] = None
     # Model configuration
     fast_llm: str = "gpt-3.5-turbo"
     smart_llm: str = "gpt-4"
diff --git a/autogpt/core/planning/templates.py b/autogpt/core/planning/templates.py
index e28f2ed7..59792f65 100644
--- a/autogpt/core/planning/templates.py
+++ b/autogpt/core/planning/templates.py
@@ -17,7 +17,6 @@ ABILITIES = (
     'analyze_code: Analyze Code, args: "code": "<full_code_string>"',
     'execute_python_file: Execute Python File, args: "filename": "<file>"',
     'append_to_file: Append to file, args: "filename": "<file>", "text": "<text>"',
-    'delete_file: Delete file, args: "filename": "<file>"',
     'list_files: List Files in Directory, args: "directory": "<directory>"',
     'read_file: Read a file, args: "filename": "<file>"',
     'write_to_file: Write to file, args: "filename": "<file>", "text": "<text>"',
diff --git a/autogpt/core/runner/cli_web_app/cli.py b/autogpt/core/runner/cli_web_app/cli.py
index 6600b8e1..e933739b 100644
--- a/autogpt/core/runner/cli_web_app/cli.py
+++ b/autogpt/core/runner/cli_web_app/cli.py
@@ -1,19 +1,13 @@
-import contextlib
 import pathlib
-import shlex
-import subprocess
-import sys
-import time
 
 import click
-import requests
-import uvicorn
 import yaml
+from agent_protocol import Agent as AgentProtocol
 
+from autogpt.core.runner.cli_web_app.server.api import task_handler
 from autogpt.core.runner.client_lib.shared_click_commands import (
     DEFAULT_SETTINGS_FILE,
     make_settings,
-    status,
 )
 from autogpt.core.runner.client_lib.utils import coroutine
 
@@ -25,17 +19,9 @@ def autogpt():
 
 
 autogpt.add_command(make_settings)
-autogpt.add_command(status)
 
 
 @autogpt.command()
-@click.option(
-    "host",
-    "--host",
-    default="localhost",
-    help="The host for the webserver.",
-    type=click.STRING,
-)
 @click.option(
     "port",
     "--port",
@@ -43,16 +29,10 @@ autogpt.add_command(status)
     help="The port of the webserver.",
     type=click.INT,
 )
-def server(host: str, port: int) -> None:
+def server(port: int) -> None:
     """Run the Auto-GPT runner httpserver."""
     click.echo("Running Auto-GPT runner httpserver...")
-    uvicorn.run(
-        "autogpt.core.runner.cli_web_app.server.api:app",
-        workers=1,
-        host=host,
-        port=port,
-        reload=True,
-    )
+    AgentProtocol.handle_task(task_handler).start(port)
 
 
 @autogpt.command()
@@ -69,32 +49,7 @@ async def client(settings_file) -> None:
 
     if settings_file.exists():
         settings = yaml.safe_load(settings_file.read_text())
-    from autogpt.core.runner.cli_web_app.client.client import run
-
-    with autogpt_server():
-        run()
-
-
-@contextlib.contextmanager
-def autogpt_server():
-    host = "localhost"
-    port = 8080
-    cmd = shlex.split(
-        f"{sys.executable} autogpt/core/runner/cli_web_app/cli.py server --host {host} --port {port}"
-    )
-    server_process = subprocess.Popen(
-        args=cmd,
-    )
-    started = False
-
-    while not started:
-        try:
-            requests.get(f"http://{host}:{port}")
-            started = True
-        except requests.exceptions.ConnectionError:
-            time.sleep(0.2)
-    yield server_process
-    server_process.terminate()
+    # TODO: Call the API server with the settings and task, using the Python API client for agent protocol.
 
 
 if __name__ == "__main__":
diff --git a/autogpt/core/runner/cli_web_app/client/client.py b/autogpt/core/runner/cli_web_app/client/client.py
deleted file mode 100644
index 346203f7..00000000
--- a/autogpt/core/runner/cli_web_app/client/client.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import json
-
-import requests
-
-
-def run():
-    body = json.dumps(
-        {"ai_name": "HelloBot", "ai_role": "test", "ai_goals": ["goal1", "goal2"]}
-    )
-
-    header = {"Content-Type": "application/json", "openai_api_key": "asdf"}
-    print("Sending: ", header, body)
-    response = requests.post(
-        "http://localhost:8080/api/v1/agents", data=body, headers=header
-    )
-    print(response.content.decode("utf-8"))
diff --git a/autogpt/core/runner/cli_web_app/server/api.py b/autogpt/core/runner/cli_web_app/server/api.py
index 01c50b06..1ba0974b 100644
--- a/autogpt/core/runner/cli_web_app/server/api.py
+++ b/autogpt/core/runner/cli_web_app/server/api.py
@@ -1,48 +1,114 @@
-import uuid
+from pathlib import Path
 
-from fastapi import APIRouter, FastAPI, Request
+from agent_protocol import StepHandler, StepResult
+from colorama import Fore
 
-from autogpt.core.runner.cli_web_app.server.schema import InteractRequestBody
+from autogpt.agents import Agent
+from autogpt.app.main import UserFeedback
+from autogpt.commands import COMMAND_CATEGORIES
+from autogpt.config import AIConfig, Config, ConfigBuilder
+from autogpt.logs import logger
+from autogpt.memory.vector import get_memory
+from autogpt.models.command_registry import CommandRegistry
+from autogpt.prompts.prompt import DEFAULT_TRIGGERING_PROMPT
+from autogpt.workspace import Workspace
 
-router = APIRouter()
+PROJECT_DIR = Path().resolve()
 
 
-@router.post("/agents")
-async def create_agent(request: Request):
-    """Create a new agent."""
-    agent_id = uuid.uuid4().hex
-    return {"agent_id": agent_id}
+async def task_handler(task_input) -> StepHandler:
+    task = task_input.__root__ if task_input else {}
+    agent = bootstrap_agent(task.get("user_input"), False)
+
+    next_command_name: str | None = None
+    next_command_args: dict[str, str] | None = None
+
+    async def step_handler(step_input) -> StepResult:
+        step = step_input.__root__ if step_input else {}
+
+        nonlocal next_command_name, next_command_args
+
+        result = await interaction_step(
+            agent,
+            step.get("user_input"),
+            step.get("user_feedback"),
+            next_command_name,
+            next_command_args,
+        )
+
+        next_command_name = result["next_step_command_name"] if result else None
+        next_command_args = result["next_step_command_args"] if result else None
+
+        if not result:
+            return StepResult(output=None, is_last=True)
+        return StepResult(output=result)
+
+    return step_handler
 
 
-@router.post("/agents/{agent_id}")
-async def interact(request: Request, agent_id: str, body: InteractRequestBody):
-    """Interact with an agent."""
+async def interaction_step(
+    agent: Agent,
+    user_input,
+    user_feedback: UserFeedback | None,
+    command_name: str | None,
+    command_args: dict[str, str] | None,
+):
+    """Run one step of the interaction loop."""
+    if user_feedback == UserFeedback.EXIT:
+        return
+    if user_feedback == UserFeedback.TEXT:
+        command_name = "human_feedback"
 
-    # check headers
+    result: str | None = None
 
-    # check if agent_id exists
+    if command_name is not None:
+        result = agent.execute(command_name, command_args, user_input)
+        if result is None:
+            logger.typewriter_log("SYSTEM: ", Fore.YELLOW, "Unable to execute command")
+            return
 
-    # get agent object from somewhere, e.g. a database/disk/global dict
-
-    # continue agent interaction with user input
+    next_command_name, next_command_args, assistant_reply_dict = agent.think()
 
     return {
-        "thoughts": {
-            "thoughts": {
-                "text": "text",
-                "reasoning": "reasoning",
-                "plan": "plan",
-                "criticism": "criticism",
-                "speak": "speak",
-            },
-            "commands": {
-                "name": "name",
-                "args": {"arg_1": "value_1", "arg_2": "value_2"},
-            },
-        },
-        "messages": ["message1", agent_id],
+        "config": agent.config,
+        "ai_config": agent.ai_config,
+        "result": result,
+        "assistant_reply_dict": assistant_reply_dict,
+        "next_step_command_name": next_command_name,
+        "next_step_command_args": next_command_args,
     }
 
 
-app = FastAPI()
-app.include_router(router, prefix="/api/v1")
+def bootstrap_agent(task, continuous_mode) -> Agent:
+    config = ConfigBuilder.build_config_from_env(workdir=PROJECT_DIR)
+    config.debug_mode = True
+    config.continuous_mode = continuous_mode
+    config.temperature = 0
+    config.plain_output = True
+    command_registry = get_command_registry(config)
+    config.memory_backend = "no_memory"
+    config.workspace_path = Workspace.init_workspace_directory(config)
+    config.file_logger_path = Workspace.build_file_logger_path(config.workspace_path)
+    ai_config = AIConfig(
+        ai_name="Auto-GPT",
+        ai_role="a multi-purpose AI assistant.",
+        ai_goals=[task],
+    )
+    ai_config.command_registry = command_registry
+    return Agent(
+        memory=get_memory(config),
+        command_registry=command_registry,
+        ai_config=ai_config,
+        config=config,
+        triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
+    )
+
+
+def get_command_registry(config: Config):
+    command_registry = CommandRegistry()
+    enabled_command_categories = [
+        x for x in COMMAND_CATEGORIES if x not in config.disabled_command_categories
+    ]
+    for command_category in enabled_command_categories:
+        command_registry.import_commands(command_category)
+    return command_registry
 
diff --git a/autogpt/core/runner/cli_web_app/server/schema.py b/autogpt/core/runner/cli_web_app/server/schema.py
deleted file mode 100644
index 272fbc78..00000000
--- a/autogpt/core/runner/cli_web_app/server/schema.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from uuid import UUID
-
-from pydantic import BaseModel, validator
-
-
-class AgentInfo(BaseModel):
-    id: UUID = None
-    objective: str = ""
-    name: str = ""
-    role: str = ""
-    goals: list[str] = []
-
-
-class AgentConfiguration(BaseModel):
-    """Configuration for creation of a new agent."""
-
-    # We'll want to get this schema from the configuration, so it needs to be dynamic.
-    user_configuration: dict
-    agent_goals: AgentInfo
-
-    @validator("agent_goals")
-    def only_objective_or_name_role_goals(cls, agent_goals):
-        goals_specification = [agent_goals.name, agent_goals.role, agent_goals.goals]
-        if agent_goals.objective and any(goals_specification):
-            raise ValueError("Cannot specify both objective and name, role, or goals")
-        if not agent_goals.objective and not all(goals_specification):
-            raise ValueError("Must specify either objective or name, role, and goals")
-
-
-class InteractRequestBody(BaseModel):
-    user_input: str = ""
-
-
-class InteractResponseBody(BaseModel):
-    thoughts: dict[str, str]  # TBD
-    messages: list[str]  # for example
diff --git a/autogpt/core/runner/cli_web_app/server/services/__init__.py b/autogpt/core/runner/cli_web_app/server/services/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/autogpt/core/runner/cli_web_app/server/services/users.py b/autogpt/core/runner/cli_web_app/server/services/users.py
deleted file mode 100644
index 5192dcdb..00000000
--- a/autogpt/core/runner/cli_web_app/server/services/users.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import uuid
-
-from fastapi import Request
-
-
-class UserService:
-    def __init__(self):
-        self.users = {}
-
-    def get_user_id(self, request: Request) -> uuid.UUID:
-        # TODO: something real. I don't know how this works.
-        hostname = request.client.host
-        port = request.client.port
-        user = f"{hostname}:{port}"
-        if user not in self.users:
-            self.users[user] = uuid.uuid4()
-        return self.users[user]
-
-
-USER_SERVICE = UserService()
diff --git a/autogpt/workspace/workspace.py b/autogpt/workspace/workspace.py
index 6e77c21a..2176d414 100644
--- a/autogpt/workspace/workspace.py
+++ b/autogpt/workspace/workspace.py
@@ -144,21 +144,24 @@ class Workspace:
         return full_path
 
     @staticmethod
-    def set_file_logger_path(config: Config, workspace_directory: Path):
+    def build_file_logger_path(workspace_directory: Path) -> Path:
         file_logger_path = workspace_directory / "file_logger.txt"
         if not file_logger_path.exists():
             with file_logger_path.open(mode="w", encoding="utf-8") as f:
                 f.write("File Operation Logger ")
-        config.file_logger_path = str(file_logger_path)
+        return file_logger_path
 
     @staticmethod
-    def set_workspace_directory(
-        config: Config, workspace_directory: Optional[str | Path] = None
-    ) -> None:
-        if workspace_directory is None:
-            workspace_directory = config.workdir / "auto_gpt_workspace"
-        elif type(workspace_directory) == str:
-            workspace_directory = Path(workspace_directory)
+    def init_workspace_directory(
+        config: Config, override_workspace_path: Optional[str | Path] = None
+    ) -> Path:
+        if override_workspace_path is None:
+            workspace_path = config.workdir / "auto_gpt_workspace"
+        elif type(override_workspace_path) == str:
+            workspace_path = Path(override_workspace_path)
+        else:
+            workspace_path = override_workspace_path
+
         # TODO: pass in the ai_settings file and the env file and have them cloned into
         # the workspace directory so we can bind them to the agent.
-        config.workspace_path = Workspace.make_workspace(workspace_directory)
+        return Workspace.make_workspace(workspace_path)
diff --git a/benchmarks.py b/benchmarks.py
index 04153f4b..9cf93aca 100644
--- a/benchmarks.py
+++ b/benchmarks.py
@@ -24,8 +24,8 @@ def bootstrap_agent(task):
     config.plain_output = True
     command_registry = get_command_registry(config)
     config.memory_backend = "no_memory"
-    Workspace.set_workspace_directory(config)
-    Workspace.set_file_logger_path(config, config.workspace_path)
+    config.workspace_path = Workspace.init_workspace_directory(config)
+    config.file_logger_path = Workspace.build_file_logger_path(config.workspace_path)
     ai_config = AIConfig(
         ai_name="Auto-GPT",
         ai_role="a multi-purpose AI assistant.",
diff --git a/docs/challenges/building_challenges.md b/docs/challenges/building_challenges.md
index a4d0fa08..9caf5cdd 100644
--- a/docs/challenges/building_challenges.md
+++ b/docs/challenges/building_challenges.md
@@ -59,7 +59,6 @@ def kubernetes_agent(
        config=ai_config,
        next_action_count=0,
        triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
-       workspace_directory=workspace.root,
     )
 
     return agent
diff --git a/requirements.txt b/requirements.txt
index 4af8bccd..e401e266 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
 beautifulsoup4>=4.12.2
 colorama==0.4.6
 distro==1.8.0
-openai==0.27.2
+openai==0.27.8
 playsound==1.2.2
-python-dotenv==1.0.0
+python-dotenv==0.21
 pyyaml==6.0
 PyPDF2
 python-docx
@@ -31,6 +31,8 @@ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_
 prompt_toolkit>=3.0.38
 pydantic
 inflection
+agbenchmark
+agent-protocol>=0.1.1
 
 # web server
 fastapi
diff --git a/tests/Auto-GPT-test-cassettes b/tests/Auto-GPT-test-cassettes
index 47e26290..6b4f8552 160000
--- a/tests/Auto-GPT-test-cassettes
+++ b/tests/Auto-GPT-test-cassettes
@@ -1 +1 @@
-Subproject commit 47e262905edc1380bc0539fd298fd94d99667e89
+Subproject commit 6b4f855269dfc7ec220cc7774d675940dcaa78ef
diff --git a/tests/challenges/utils.py b/tests/challenges/utils.py
index 9d1b76e7..dd661b6e 100644
--- a/tests/challenges/utils.py
+++ b/tests/challenges/utils.py
@@ -6,9 +6,9 @@ from typing import Any, Generator
 
 import pytest
 
+from agbenchmark.benchmarks import run_specific_agent
 from autogpt.logs import LogCycleHandler
 from autogpt.workspace import Workspace
-from benchmarks import run_task
 from tests.challenges.schema import Task
 
 
@@ -75,4 +75,4 @@ def run_challenge(
     setup_mock_log_cycle_agent_name(monkeypatch, challenge_name, level_to_run)
     task = Task(user_input=user_input)
     with contextlib.suppress(SystemExit):
-        run_task(task)
+        run_specific_agent(task.user_input)
diff --git a/tests/conftest.py b/tests/conftest.py
index c3076d54..cfcebcb7 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -52,7 +52,7 @@ def config(
     if not os.environ.get("OPENAI_API_KEY"):
         os.environ["OPENAI_API_KEY"] = "sk-dummy"
 
-    Workspace.set_workspace_directory(config, workspace.root)
+    config.workspace_path = workspace.root
 
     # HACK: this is necessary to ensure PLAIN_OUTPUT takes effect
     logger.config = config
diff --git a/tests/unit/test_file_operations.py b/tests/unit/test_file_operations.py
index d7d870a5..b3f1fb8f 100644
--- a/tests/unit/test_file_operations.py
+++ b/tests/unit/test_file_operations.py
@@ -282,24 +282,6 @@ def test_append_to_file_uses_checksum_from_appended_file(
     )
 
 
-def test_delete_file(test_file_with_content_path: Path, agent: Agent):
-    result = file_ops.delete_file(str(test_file_with_content_path), agent=agent)
-    assert result == "File deleted successfully."
-    assert os.path.exists(test_file_with_content_path) is False
-
-
-def test_delete_missing_file(agent: Agent):
-    filename = "path/to/file/which/does/not/exist"
-    # confuse the log
-    file_ops.log_operation("write", filename, agent=agent, checksum="fake")
-    try:
-        os.remove(agent.workspace.get_path(filename))
-    except FileNotFoundError as err:
-        assert str(err) in file_ops.delete_file(filename, agent=agent)
-        return
-    assert False, f"Failed to test delete_file; {filename} not expected to exist"
-
-
 def test_list_files(workspace: Workspace, test_directory: Path, agent: Agent):
     # Case 1: Create files A and B, search for A, and ensure we don't return A and B
     file_a = workspace.get_path("file_a.txt")