diff --git a/autogpts/forge/run b/autogpts/forge/run
index 6de613b8..8fa77196 100755
--- a/autogpts/forge/run
+++ b/autogpts/forge/run
@@ -8,4 +8,5 @@ if [ ! -f .env ]; then
   echo "Please add your api keys to the .env file."
 fi
 poetry run python -m forge &
-poetry run agbenchmark serve &
+
+agbenchmark serve & # deliberately not using `poetry run`, so that it runs in editable mode
diff --git a/benchmark/agbenchmark/__main__.py b/benchmark/agbenchmark/__main__.py
index 494fae6c..bff606f9 100644
--- a/benchmark/agbenchmark/__main__.py
+++ b/benchmark/agbenchmark/__main__.py
@@ -114,8 +114,8 @@ def run_benchmark(
     no_dep: bool = False,
     nc: bool = False,
     keep_answers: bool = False,
-    category: Optional[list[str]] = None,
-    skip_category: Optional[list[str]] = None,
+    category: Optional[tuple[str]] = None,
+    skip_category: Optional[tuple[str]] = None,
     test: Optional[str] = None,
     cutoff: Optional[int] = None,
     server: bool = False,
@@ -157,7 +157,6 @@ def run_benchmark(
 
     if test:
         print("Running specific test:", test)
-        pytest_args.extend(["-k", test, "--test"])
     else:
         # Categories that are used in the challenges
         categories = get_unique_categories()
diff --git a/benchmark/agbenchmark/app.py b/benchmark/agbenchmark/app.py
index 8bc52613..0485528b 100644
--- a/benchmark/agbenchmark/app.py
+++ b/benchmark/agbenchmark/app.py
@@ -54,7 +54,18 @@ app.add_middleware(
 def run_single_test(body: CreateReportRequest) -> Any:
     from agbenchmark.__main__ import run_benchmark
 
-    run_benchmark(category=[body.category], mock=body.mock)
+    # HACK: rebuild sys.argv by hand, because other parts of the code read sys.argv directly
+    sys.argv = [sys.argv[0]]
+    sys.argv.append("start")
+    if body.category:
+        sys.argv.append(f"--category={body.category}")
+    for body_test in body.tests:
+        sys.argv.append(f"--test={body_test}")
+    categories = None
+    if body.category:
+        categories = tuple([body.category])
+
+    run_benchmark(category=categories, mock=body.mock, test=tuple(body.tests))
     import json
     from pathlib import Path
 
@@ -95,6 +106,8 @@ from fastapi import FastAPI, Request, Response
 
 @app.get("/updates")
 def get_updates(request: Request) -> Any:
+    from agbenchmark.__main__ import UPDATES_JSON_PATH
+
     try:
         # Read data from the "update.json" file (provide the correct file path)
         with open(UPDATES_JSON_PATH, "r") as file:
diff --git a/benchmark/agbenchmark/generate_test.py b/benchmark/agbenchmark/generate_test.py
index 3018726c..bf2c3be2 100644
--- a/benchmark/agbenchmark/generate_test.py
+++ b/benchmark/agbenchmark/generate_test.py
@@ -192,8 +192,12 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
                 continue
 
         # --test flag, only run the test if it's the exact one specified
-        test_flag = "--test" in commands
-        if test_flag and data["name"] not in commands:
+        tests = []
+        for command in commands:
+            if command.startswith("--test="):
+                tests.append(command.split("=")[1])
+
+        if tests and data["name"] not in tests:
             continue
 
         # --maintain and --improve flag