diff --git a/agbenchmark/__init__.py b/agbenchmark/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmarks.py b/agbenchmark/benchmarks.py
similarity index 73%
rename from benchmarks.py
rename to agbenchmark/benchmarks.py
index 589b3f75..e8675f92 100644
--- a/benchmarks.py
+++ b/agbenchmark/benchmarks.py
@@ -1,4 +1,6 @@
+import sys
 from pathlib import Path
+from typing import Tuple
 
 from autogpt.agents import Agent
 from autogpt.app.main import run_interaction_loop
@@ -12,14 +14,15 @@ from autogpt.workspace import Workspace
 PROJECT_DIR = Path().resolve()
 
 
-def run_task(task) -> None:
-    agent = bootstrap_agent(task)
+def run_specific_agent(task, continuous_mode=False) -> Tuple[str, int]:
+    agent = bootstrap_agent(task, continuous_mode)
     run_interaction_loop(agent)
 
 
-def bootstrap_agent(task):
+def bootstrap_agent(task, continuous_mode) -> Agent:
     config = ConfigBuilder.build_config_from_env(workdir=PROJECT_DIR)
-    config.continuous_mode = False
+    config.debug_mode = True
+    config.continuous_mode = continuous_mode
     config.temperature = 0
     config.plain_output = True
     command_registry = get_command_registry(config)
@@ -29,7 +32,7 @@ def bootstrap_agent(task):
     ai_config = AIConfig(
         ai_name="Auto-GPT",
         ai_role="a multi-purpose AI assistant.",
-        ai_goals=[task.user_input],
+        ai_goals=[task],
     )
     ai_config.command_registry = command_registry
     return Agent(
@@ -50,3 +53,12 @@ def get_command_registry(config: Config):
     for command_category in enabled_command_categories:
         command_registry.import_commands(command_category)
     return command_registry
+
+
+if __name__ == "__main__":
+    # The first argument is the script name itself, second is the task
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <task>")
+        sys.exit(1)
+    task = sys.argv[1]
+    run_specific_agent(task, continuous_mode=True)
diff --git a/agbenchmark/config.json b/agbenchmark/config.json
new file mode 100644
index 00000000..dd957f9e
--- /dev/null
+++ b/agbenchmark/config.json
@@ -0,0 +1,4 @@
+{
+  "workspace": "autogpt/workspace/auto_gpt_workspace",
+  "entry_path": "agbenchmark.benchmarks"
+}
diff --git a/agbenchmark/regression_tests.json b/agbenchmark/regression_tests.json
new file mode 100644
index 00000000..8d59b1a4
--- /dev/null
+++ b/agbenchmark/regression_tests.json
@@ -0,0 +1,24 @@
+{
+  "TestBasicCodeGeneration": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestWriteFile"
+    ],
+    "data_path": "agbenchmark/challenges/code/d3"
+  },
+  "TestBasicMemory": {
+    "difficulty": "basic",
+    "data_path": "agbenchmark/challenges/memory/m1"
+  },
+  "TestReadFile": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestWriteFile"
+    ],
+    "data_path": "agbenchmark/challenges/interface/read_file"
+  },
+  "TestWriteFile": {
+    "dependencies": [],
+    "data_path": "agbenchmark/challenges/interface/write_file"
+  }
+}
diff --git a/requirements.txt b/requirements.txt
index 4af8bccd..c150c262 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
 beautifulsoup4>=4.12.2
 colorama==0.4.6
 distro==1.8.0
-openai==0.27.2
+openai==0.27.8
 playsound==1.2.2
-python-dotenv==1.0.0
+python-dotenv==0.21
 pyyaml==6.0
 PyPDF2
 python-docx
@@ -31,6 +31,7 @@ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_
 prompt_toolkit>=3.0.38
 pydantic
 inflection
+agbenchmark
 
 # web server
 fastapi
diff --git a/tests/challenges/utils.py b/tests/challenges/utils.py
index 9d1b76e7..dd661b6e 100644
--- a/tests/challenges/utils.py
+++ b/tests/challenges/utils.py
@@ -6,9 +6,9 @@ from typing import Any, Generator
 
 import pytest
 
+from agbenchmark.benchmarks import run_specific_agent
 from autogpt.logs import LogCycleHandler
 from autogpt.workspace import Workspace
-from benchmarks import run_task
 from tests.challenges.schema import Task
 
 
@@ -75,4 +75,4 @@ def run_challenge(
     setup_mock_log_cycle_agent_name(monkeypatch, challenge_name, level_to_run)
     task = Task(user_input=user_input)
     with contextlib.suppress(SystemExit):
-        run_task(task)
+        run_specific_agent(task.user_input)
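For reference, a minimal usage sketch of the relocated entry point introduced above, mirroring what the new __main__ block does; it assumes an AutoGPT environment with API keys configured, and the task string is only an illustrative placeholder:

# Usage sketch (assumes a configured AutoGPT .env); the task text below is hypothetical.
from agbenchmark.benchmarks import run_specific_agent

# Equivalent to running: python agbenchmark/benchmarks.py "Write 'Hello World' to hello.txt"
run_specific_agent("Write 'Hello World' to hello.txt", continuous_mode=True)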