diff --git a/benchmark/benchmark/start_benchmark.py b/benchmark/benchmark/start_benchmark.py
deleted file mode 100644
index b47488f5..00000000
--- a/benchmark/benchmark/start_benchmark.py
+++ /dev/null
@@ -1,432 +0,0 @@
-import glob
-import json
-import os
-import sys
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import Any, Optional
-
-import click
-import pytest
-from helicone.lock import HeliconeLockManager
-
-sys.path.append("/Users/swifty/dev/Auto-GPT/benchmark")
-
-from benchmark.reports.ReportManager import ReportManager
-from benchmark.utils.utils import (  # get_git_commit_sha,
-    AGENT_NAME,
-    calculate_dynamic_paths,
-)
-
-CURRENT_DIRECTORY = Path(__file__).resolve().parent
-BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
-if os.environ.get("HELICONE_API_KEY"):
-    HeliconeLockManager.write_custom_property(
-        "benchmark_start_time", BENCHMARK_START_TIME
-    )
-
-(
-    HOME_DIRECTORY,
-    CONFIG_PATH,
-    REGRESSION_TESTS_PATH,
-    REPORTS_PATH,
-    SUCCESS_RATE_PATH,
-    CHALLENGES_PATH,
-) = calculate_dynamic_paths()
-BENCHMARK_GIT_COMMIT_SHA = "---"  # get_git_commit_sha(HOME_DIRECTORY / ".." / "..")
-AGENT_GIT_COMMIT_SHA = "---"  # get_git_commit_sha(HOME_DIRECTORY)
-# open a file in the challenges/optional_categories
-with open(
-    Path(__file__).resolve().parent / "challenges" / "optional_categories.json"
-) as f:
-    OPTIONAL_CATEGORIES = json.load(f)["optional_categories"]
-
-
-def get_unique_categories() -> set[str]:
-    """Find all data.json files in the directory relative to this file and its subdirectories,
-    read the "category" field from each file, and return a set of unique categories."""
-    categories = set()
-
-    # Get the directory of this file
-    this_dir = os.path.dirname(os.path.abspath(__file__))
-
-    glob_path = os.path.join(this_dir, "./challenges/**/data.json")
-    # Use it as the base for the glob pattern
-    for data_file in glob.glob(glob_path, recursive=True):
-        with open(data_file, "r") as f:
-            try:
-                data = json.load(f)
-                categories.update(data.get("category", []))
-            except json.JSONDecodeError:
-                print(f"Error: {data_file} is not a valid JSON file.")
-                continue
-            except IOError:
-                print(f"IOError: file could not be read: {data_file}")
-                continue
-
-    return categories
-
-
-def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
-    # tests that consistently pass are considered regression tests
-    REGRESSION_MANAGER = ReportManager(REGRESSION_TESTS_PATH)
-
-    # print(f"Using {REPORTS_PATH} for reports")
-    # user facing reporting information
-    INFO_MANAGER = ReportManager(str(Path(REPORTS_PATH) / "report.json"))
-
-    # internal db step in replacement track pass/fail rate
-    INTERNAL_INFO_MANAGER = ReportManager(SUCCESS_RATE_PATH)
-
-    return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
-
-
-(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()
-
-
-def run_benchmark(
-    maintain: bool = False,
-    improve: bool = False,
-    explore: bool = False,
-    mock: bool = False,
-    no_dep: bool = False,
-    nc: bool = False,
-    category: Optional[list[str]] = None,
-    skip_category: Optional[list[str]] = None,
-    test: Optional[str] = None,
-    suite: Optional[str] = None,
-    cutoff: Optional[int] = None,
-    api_mode: bool = False,
-    host: Optional[str] = None,
-) -> int:
-    """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
-    # Check if configuration file exists and is not empty
-
-    if maintain and improve and explore:
-        print(
-            "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
-        )
-        return 1
-
-    if test and (category or skip_category or maintain or improve or suite or explore):
-        print(
-            "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
-        )
-        return 1
-
-    # TODO: test and ensure that this functionality works before removing
-    # change elif suite below if removing
-    if suite and (category or skip_category or maintain or improve or explore):
-        print(
-            "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite."
-        )
-        return 1
-
-    if os.path.join("Auto-GPT-Benchmarks") in str(HOME_DIRECTORY) and not AGENT_NAME:
-        print(
-            "If you are running from the Auto-GPT-Benchmarks repo, you must have AGENT_NAME defined."
-        )
-        return 1
-
-    if os.path.exists(CONFIG_PATH) and os.stat(CONFIG_PATH).st_size:
-        # If the configuration file exists and is not empty, load it
-        with open(CONFIG_PATH, "r") as f:
-            config = json.load(f)
-    else:
-        config = {}
-    host = host or config.get("host")
-    api_mode = api_mode or config.get("api_mode")
-    if host:
-        config["host"] = host
-    if api_mode:
-        config["api_mode"] = api_mode
-    print("benchmark run path", CONFIG_PATH, HOME_DIRECTORY)
-    if not config.get("workspace"):
-        config["workspace"] = click.prompt(
-            "Please enter a new workspace path",
-            default=os.path.join("workspace"),
-            show_default=True,
-        )
-
-    if api_mode and not host:
-        config["host"] = click.prompt(
-            "Please enter the Agent API host address",
-            default="http://localhost:8000",
-            show_default=True,
-        )
-
-    with open(CONFIG_PATH, "w") as f:
-        json.dump(config, f)
-
-    print("Current configuration:")
-    for key, value in config.items():
-        print(f"{key}: {value}")
-
-    pytest_args = ["-vs"]
-    if test:
-        print("Running specific test:", test)
-        pytest_args.extend(["-k", test, "--test"])
-    elif suite:
-        print("Running specific suite:", suite)
-        pytest_args.extend(["--suite"])
-    else:
-        # Categories that are used in the challenges
-        categories = get_unique_categories()
-        if category:
-            invalid_categories = set(category) - categories
-            assert (
-                not invalid_categories
-            ), f"Invalid categories: {invalid_categories}. Valid categories are: {categories}"
-
-        if category:
-            categories_to_run = set(category)
-            if skip_category:
-                categories_to_run = categories_to_run.difference(set(skip_category))
-                assert categories_to_run, "Error: You can't skip all categories"
-            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
-            print("Running tests of category:", categories_to_run)
-        elif skip_category:
-            categories_to_run = categories - set(skip_category)
-            assert categories_to_run, "Error: You can't skip all categories"
-            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
-            print("Running tests of category:", categories_to_run)
-        else:
-            print("Running all categories")
-
-        if maintain:
-            print("Running only regression tests")
-            pytest_args.append("--maintain")
-        elif improve:
-            print("Running only non-regression tests")
-            pytest_args.append("--improve")
-        elif explore:
-            print("Only attempt challenges that have never been beaten")
-            pytest_args.append("--explore")
-    if host:
-        pytest_args.append(f"--host={host}")
-    if api_mode:
-        pytest_args.append("--api_mode")
-    if mock:
-        pytest_args.append("--mock")
-
-    if no_dep:
-        pytest_args.append("--no_dep")
-
-    if nc and cutoff:
-        print(
-            "Error: You can't use both --nc and --cutoff at the same time. Please choose one."
-        )
-        return 1
-
-    if nc:
-        pytest_args.append("--nc")
-    if cutoff:
-        pytest_args.append("--cutoff")
-        print(f"Setting cuttoff override to {cutoff} seconds.")
-
-    pytest_args.extend((str(CURRENT_DIRECTORY), "--cache-clear"))
-    pytest_args.append("--disable-warnings")
-
-    return pytest.main(pytest_args)
-
-
-@click.group()
-def cli() -> None:
-    pass
-
-
-@cli.command()
-@click.option("--backend", is_flag=True, help="If it's being run from the cli")
-@click.option("-c", "--category", multiple=True, help="Specific category to run")
-@click.option(
-    "-s",
-    "--skip-category",
-    multiple=True,
-    help="Skips preventing the tests from this category from running",
-)
-@click.option("--test", help="Specific test to run")
-@click.option("--maintain", is_flag=True, help="Runs only regression tests")
-@click.option("--improve", is_flag=True, help="Run only non-regression tests")
-@click.option(
-    "--explore",
-    is_flag=True,
-    help="Only attempt challenges that have never been beaten",
-)
-@click.option("--mock", is_flag=True, help="Run with mock")
-@click.option("--suite", help="Run a suite of related tests")
-@click.option(
-    "--no_dep",
-    is_flag=True,
-    help="Run without dependencies (can be useful for a suite run)",
-)
-@click.option("--nc", is_flag=True, help="Run without cutoff")
-@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
-@click.option("--api_mode", help="API mode")
-@click.option("--host", help="Define API host")
-def start(
-    maintain: bool,
-    improve: bool,
-    explore: bool,
-    mock: bool,
-    no_dep: bool,
-    nc: bool,
-    category: Optional[list[str]] = None,
-    skip_category: Optional[list[str]] = None,
-    test: Optional[str] = None,
-    suite: Optional[str] = None,
-    cutoff: Optional[int] = None,
-    backend: Optional[bool] = False,
-    api_mode: bool = False,
-    host: Optional[str] = None,
-) -> Any:
-    # Redirect stdout if backend is True
-    original_stdout = sys.stdout  # Save the original standard output
-    exit_code = None
-
-    if backend:
-        with open("backend/backend_stdout.txt", "w") as f:
-            sys.stdout = f
-            exit_code = run_benchmark(
-                maintain=maintain,
-                improve=improve,
-                explore=explore,
-                mock=mock,
-                no_dep=no_dep,
-                nc=nc,
-                category=category,
-                skip_category=skip_category,
-                test=test,
-                suite=suite,
-                cutoff=cutoff,
-                api_mode=api_mode,
-                host=host,
-            )
-
-        sys.stdout = original_stdout
-
-        with open(Path(REPORTS_PATH) / "report.json", "r") as file:
-            latest_report = json.load(file)
-
-        print(latest_report)
-
-    else:
-        exit_code = run_benchmark(
-            maintain=maintain,
-            improve=improve,
-            explore=explore,
-            mock=mock,
-            no_dep=no_dep,
-            nc=nc,
-            category=category,
-            skip_category=skip_category,
-            test=test,
-            suite=suite,
-            cutoff=cutoff,
-        )
-
-    sys.exit(exit_code)
-
-
-def get_regression_data() -> Any:
-    with open(REGRESSION_TESTS_PATH, "r") as file:
-        data = json.load(file)
-
-    return data
-
-
-@cli.command()
-def version():
-    """Print the version of the benchmark tool."""
-    import toml
-
-    version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"][
-        "version"
-    ]
-    print(f"Benchmark Tool Version {version}")
-
-
-# def run_from_backend(
-#     maintain: bool = False,
-#     improve: bool = False,
-#     explore: bool = False,
-#     mock: bool = False,
-#     no_dep: bool = False,
-#     nc: bool = False,
-#     category: Optional[list[str]] = None,
-#     skip_category: Optional[list[str]] = None,
-#     test: Optional[str] = None,
-#     suite: Optional[str] = None,
-#     cutoff: Optional[int] = None,
-# ) -> Any:
-#     global HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH, REPORTS_PATH, SUCCESS_RATE_PATH, CHALLENGES_PATH
-#     global REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
-
-#     if INFO_MANAGER.tests != {}:
-#         (
-#             HOME_DIRECTORY,
-#             CONFIG_PATH,
-#             REGRESSION_TESTS_PATH,
-#             REPORTS_PATH,
-#             SUCCESS_RATE_PATH,
-#             CHALLENGES_PATH,
-#         ) = calculate_dynamic_paths()

-#         (
-#             REGRESSION_MANAGER,
-#             INFO_MANAGER,
-#             INTERNAL_INFO_MANAGER,
-#         ) = get_report_managers()
-
-#     sys.argv = ["run_benchmark"]
-
-#     if maintain:
-#         sys.argv.append("--maintain")
-#     if improve:
-#         sys.argv.append("--improve")
-#     if explore:
-#         sys.argv.append("--explore")
-#     if mock:
-#         sys.argv.append("--mock")
-#     if no_dep:
-#         sys.argv.append("--no_dep")
-#     if nc:
-#         sys.argv.append("--nc")
-
-#     if category:
-#         for cat in category:
-#             sys.argv.extend(["-c", cat])
-
-#     if skip_category:
-#         for skip_cat in skip_category:
-#             sys.argv.extend(["-s", skip_cat])
-
-#     if test:
-#         sys.argv.extend(["--test", test])
-
-#     if suite:
-#         sys.argv.extend(["--suite", suite])
-
-#     if cutoff is not None:
-#         sys.argv.extend(["--cutoff", str(cutoff)])
-
-#     exit_code = run_benchmark(
-#         maintain=maintain,
-#         improve=improve,
-#         explore=explore,
-#         mock=mock,
-#         no_dep=no_dep,
-#         nc=nc,
-#         category=category,
-#         skip_category=skip_category,
-#         test=test,
-#         suite=suite,
-#         cutoff=cutoff,
-#     )
-
-#     if exit_code != 0:
-#         return f"pytest failed with exit code: {exit_code}"
-
-#     with open(Path(REPORTS_PATH) / "report.json", "r") as file:
-#         latest_report = json.load(file)
-
-#     return latest_report
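
Reviewer note: the module deleted above was the Click-based CLI entry point that translated
benchmark flags into a pytest.main() invocation. As a reference while reviewing the replacement,
here is a minimal sketch of that flag-to-pytest-args pattern. It is illustrative only, not code
from this repo: the --mock and --cutoff pytest flags shown are assumptions that only work if a
conftest.py registers them via pytest_addoption.

# Minimal illustrative sketch (not the repo's replacement entry point) of the
# pattern start_benchmark.py implemented: build a pytest argument list from
# Click options, then hand control to pytest.
from pathlib import Path
from typing import Optional

import click
import pytest


@click.command()
@click.option("-c", "--category", multiple=True, help="Category marks to run")
@click.option("--mock", is_flag=True, help="Run challenges in mock mode")
@click.option("--cutoff", type=int, default=None, help="Per-test cutoff override (seconds)")
def start(category: tuple[str, ...], mock: bool, cutoff: Optional[int]) -> None:
    """Translate CLI flags into a pytest invocation."""
    pytest_args = ["-vs", "--cache-clear", "--disable-warnings"]
    if category:
        # pytest's -m expression selects tests carrying any of the given marks
        pytest_args.extend(["-m", " or ".join(category)])
    if mock:
        # hypothetical custom flag; pytest rejects it unless a conftest.py
        # registers it through pytest_addoption
        pytest_args.append("--mock")
    if cutoff is not None:
        pytest_args.append(f"--cutoff={cutoff}")  # likewise a custom flag
    # point pytest at the directory that holds the challenge tests
    pytest_args.append(str(Path(__file__).resolve().parent))
    raise SystemExit(pytest.main(pytest_args))


if __name__ == "__main__":
    start()

With a [tool.poetry.scripts] entry wired to the Click group (not shown in this diff), the deleted
module was invoked along the lines of "benchmark start --category interface --mock"; the custom
flags it appended (--test, --category, --maintain, --mock, ...) were presumably consumed by the
benchmark's own conftest.py in the same way.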