Merge branch 'master' into feat/monitor

2026-02-23 07:04:24 +01:00 · 2023-09-11 18:21:52 -07:00
parent 39efed59af 91572681a5
commit f95a285b59
621 changed files with 12382 additions and 345 deletions
--- a/.github/workflows/autogpt-ci.yml
+++ b/.github/workflows/autogpt-ci.yml
@@ -4,13 +4,21 @@ on:
  push:
    branches: [ master, ci-test* ]
    paths:
-      - 'autogpts/autogpt'
-      - '!autogpts/autogpt/tests/Auto-GPT-test-cassettes'
+      - 'autogpts/autogpt/**'
+      - '!autogpts/autogpt/tests/vcr_cassettes'
      - '!autogpts/autogpt/tests/challenges/current_score.json'
  pull_request:
    branches: [ stable, master, release-* ]
+    paths:
+      - 'autogpts/autogpt/**'
+      - '!autogpts/autogpt/tests/vcr_cassettes'
+      - '!autogpts/autogpt/tests/challenges/current_score.json'
  pull_request_target:
    branches: [ master, release-*, ci-test* ]
+    paths:
+      - 'autogpts/autogpt/**'
+      - '!autogpts/autogpt/tests/vcr_cassettes'
+      - '!autogpts/autogpt/tests/challenges/current_score.json'

 concurrency:
  group: ${{ format('autogpt-ci-{0}', github.head_ref && format('{0}-{1}', github.event_name, github.event.pull_request.number) || github.sha) }}
@@ -114,7 +122,7 @@ jobs:
        run: |
          cassette_branch="${{ github.event.pull_request.user.login }}-${{ github.event.pull_request.head.ref }}"
          cassette_base_branch="${{ github.event.pull_request.base.ref }}"
-          cd tests/Auto-GPT-test-cassettes
+          cd tests/vcr_cassettes

          if ! git ls-remote --exit-code --heads origin $cassette_base_branch ; then
            cassette_base_branch="master"
@@ -184,7 +192,7 @@ jobs:
          git config "$config_key" \
            "Authorization: Basic $base64_pat"

-          cd tests/Auto-GPT-test-cassettes
+          cd tests/vcr_cassettes
          git config "$config_key" \
            "Authorization: Basic $base64_pat"

@@ -215,7 +223,7 @@ jobs:
            cassette_branch="${{ github.ref_name }}"
          fi

-          cd tests/Auto-GPT-test-cassettes
+          cd tests/vcr_cassettes
          # Commit & push changes to cassettes if any
          if ! git diff --quiet; then
            git add .
@@ -223,7 +231,7 @@ jobs:
            git push origin HEAD:$cassette_branch
            if [ ! $is_pull_request ]; then
              cd ../..
-              git add tests/Auto-GPT-test-cassettes
+              git add tests/vcr_cassettes
              git commit -m "Update cassette submodule"
              git push origin HEAD:$cassette_branch
            fi
--- a/.github/workflows/autogpt-docker-ci.yml
+++ b/.github/workflows/autogpt-docker-ci.yml
@@ -4,11 +4,15 @@ on:
  push:
    branches: [ master ]
    paths:
-      - 'autogpts/autogpt'
-      - '!autogpts/autogpt/tests/Auto-GPT-test-cassettes'
+      - 'autogpts/autogpt/**'
+      - '!autogpts/autogpt/tests/vcr_cassettes'
      - '!autogpts/autogpt/tests/challenges/current_score.json'
  pull_request:
    branches: [ master, release-*, stable ]
+    paths:
+      - 'autogpts/autogpt/**'
+      - '!autogpts/autogpt/tests/vcr_cassettes'
+      - '!autogpts/autogpt/tests/challenges/current_score.json'

 concurrency:
  group: ${{ format('autogpt-docker-ci-{0}', github.head_ref && format('pr-{0}', github.event.pull_request.number) || github.sha) }}
--- a/.github/workflows/benchmark-ci.yml
+++ b/.github/workflows/benchmark-ci.yml
@@ -7,7 +7,7 @@ on:
      agents:
        description: 'Agents to run (comma-separated)'
        required: false
-        default: 'gpt-engineer,smol-developer,Auto-GPT,mini-agi,beebot,BabyAGI,PolyGPT,Turbo' # Default agents if none are specified
+        default: 'gpt-engineer,smol-developer,Auto-GPT,mini-agi,beebot,babyagi,PolyGPT,Auto-GPT-Turbo' # Default agents if none are specified
  schedule:
    - cron: '0 8 * * *'
  push:
@@ -17,6 +17,9 @@ on:
      - '!benchmark/reports/**'
  pull_request:
    branches: [stable, master, release-*]
+    paths:
+      - 'benchmark/**'
+      - '!benchmark/reports/**'

 jobs:
  lint:
@@ -83,7 +86,7 @@ jobs:
        run: |
          if [ "${{ github.event_name }}" == "schedule" ]; then
            echo "::set-output name=env-name::production"
-            echo "::set-output name=matrix::[ 'gpt-engineer', 'smol-developer', 'Auto-GPT', 'mini-agi', 'beebot', 'BabyAGI', 'PolyGPT', 'Turbo' ]"
+            echo "::set-output name=matrix::[ 'gpt-engineer', 'smol-developer', 'Auto-GPT', 'mini-agi', 'beebot', 'babyagi', 'PolyGPT', 'Auto-GPT-Turbo' ]"
          elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
            IFS=',' read -ra matrix_array <<< "${{ github.event.inputs.agents }}"
            matrix_string="[ \"$(echo "${matrix_array[@]}" | sed 's/ /", "/g')\" ]"
@@ -146,9 +149,12 @@ jobs:
          mkdir agent
          link=$(jq -r '.["'"$AGENT_NAME"'"].url' agents_to_benchmark.json)
          branch=$(jq -r '.["'"$AGENT_NAME"'"].branch' agents_to_benchmark.json)
+          commit=$(jq -r '.["'"$AGENT_NAME"'"].commit' agents_to_benchmark.json)
          cd agent
          git clone "$link" -b "$branch"
          cd $AGENT_NAME
+          git checkout "$commit" || echo "Commit not found, using latest commit on branch"
+
          prefix=""
          if [ "$AGENT_NAME" == "gpt-engineer" ]; then
            make install
@@ -167,7 +173,7 @@ jobs:
            python -m venv venv
            source venv/bin/activate
            pip install -r requirements.txt
-          elif [ "$AGENT_NAME" == "BabyAGI" ]; then
+          elif [ "$AGENT_NAME" == "babyagi" ]; then
            python -m venv venv
            source venv/bin/activate
            pip install -r requirements.txt
@@ -188,7 +194,7 @@ jobs:
            nvm install && nvm use
            yarn install
            export NODE_TLS_REJECT_UNAUTHORIZED=0
-          elif [ "$AGENT_NAME" == "Turbo" ]; then
+          elif [ "$AGENT_NAME" == "Auto-GPT-Turbo" ]; then
            python -m venv venv
            source venv/bin/activate
            pip install -r requirements.txt
@@ -259,7 +265,6 @@ jobs:
          REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt
          HELICONE_CACHE_ENABLED: false
          HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }}
-          REPORT_LOCATION: ${{ format('../../reports/{0}', matrix.agent-name) }}
          WOLFRAM_ALPHA_APPID: ${{ secrets.WOLFRAM_ALPHA_APPID }}
          SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }}
          BING_SUBSCRIPTION_KEY: ${{ secrets.BING_SUBSCRIPTION_KEY }}
@@ -273,11 +278,12 @@ jobs:

      - name: Authenticate and Push to Branch
        working-directory: ./benchmark/
-        if: (success() || failure()) && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
+        if: (success() || failure()) && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.ref_name, 'ci-test-'))
        run: |
          git config --global user.email "github-bot@agpt.co"
          git config --global user.name "Auto-GPT-Bot"
-
+          cp -rn agent/$AGENT_NAME/agbenchmark/reports/* reports/$AGENT_NAME/
+          rm -rf agent
          git add reports/* || echo "nothing to commit"
          commit_message="${{ matrix.agent-name }}-$(date +'%Y%m%d%H%M%S')"
          git commit -m "${commit_message}"
@@ -303,3 +309,4 @@ jobs:
        env:
          GDRIVE_BASE64: ${{ secrets.GDRIVE_BASE64 }}
          GITHUB_REF_NAME: ${{ github.ref_name }}
+          AGENT_NAME: ${{ matrix.agent-name }}
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -49,7 +49,7 @@ jobs:

      - name: Run pytest with coverage
        run: |
-          rm -rf tests/Auto-GPT-test-cassettes
+          rm -rf tests/vcr_cassettes
          pytest -n auto --record-mode=all ${{ matrix.config.task }}
        env:
          CI: true
@@ -70,4 +70,4 @@ jobs:
        uses: actions/upload-artifact@v3
        with:
          name: cassettes-${{ matrix.config.task-name }}
-          path: tests/Auto-GPT-test-cassettes/
+          path: tests/vcr_cassettes/
--- a/.github/workflows/forge-ci.yml
+++ b/.github/workflows/forge-ci.yml
@@ -0,0 +1,103 @@
+name: Forge CI
+
+on:
+  workflow_dispatch:
+    branches: [master]
+  schedule:
+    - cron: '0 8 * * *'
+  push:
+    branches: [master, ci-test*]
+    paths:
+      - 'forge/**'
+      - '.github/workflows/forge-ci.yml'
+  pull_request:
+    branches: [stable, master, release-*]
+    paths:
+      - 'forge/**'
+      - '.github/workflows/forge-ci.yml'
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    env:
+      min-python-version: '3.10'
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.ref }}
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          submodules: true
+
+      - name: Set up Python ${{ env.min-python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ env.min-python-version }}
+
+      - name: Install Poetry
+        working-directory: ./forge/
+        run: |
+          curl -sSL https://install.python-poetry.org | python -
+
+      - name: Install dependencies
+        working-directory: ./forge/
+        run: |
+          poetry install
+
+      - name: Lint with flake8
+        working-directory: ./forge/
+        run: poetry run flake8
+
+      - name: Check black formatting
+        working-directory: ./forge/
+        run: poetry run  black . --exclude test.py --check
+        if: success() || failure()
+
+      - name: Check isort formatting
+        working-directory: ./forge/
+        run: poetry run  isort . --check
+        if: success() || failure()
+
+  agent-protocol-test:  # Starts the forge agent locally and runs the agent-protocol test suite against it
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.ref }}
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          submodules: true
+
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.10'  # literal: the lint job's `env.min-python-version` is job-scoped and not visible here
+
+      - name: Install Poetry
+        working-directory: ./forge/
+        run: |
+          curl -sSL https://install.python-poetry.org | python -
+
+      - name: Install dependencies
+        working-directory: ./forge/
+        run: |
+          poetry install
+
+      - name: Run regression tests
+        working-directory: ./forge/
+        run: |
+          cp .env.example .env
+          poetry run python -m autogpt &
+          URL=http://127.0.0.1:8000 bash -c "$(curl -fsSL https://raw.githubusercontent.com/AI-Engineers-Foundation/agent-protocol/main/testing_suite/test.sh)"
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          AGENT_NAME: ${{ matrix.agent-name }}  # NOTE(review): this job defines no matrix, so matrix.* is empty here and below
+          HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }}
+          REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt
+          HELICONE_CACHE_ENABLED: false
+          HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }}
+          REPORT_LOCATION: ${{ format('../../reports/{0}', matrix.agent-name) }}  # NOTE(review): resolves to '../../reports/' without a matrix
--- a/.github/workflows/pr-label.yml
+++ b/.github/workflows/pr-label.yml
@@ -5,8 +5,9 @@ on:
  push:
    branches: [ master, release-* ]
    paths-ignore:
-      - 'autogpts/autogpt/tests/Auto-GPT-test-cassettes'
+      - 'autogpts/autogpt/tests/vcr_cassettes'
      - 'autogpts/autogpt/tests/challenges/current_score.json'
+      - 'benchmark/reports/**'
  # So that the `dirtyLabel` is removed if conflicts are resolve
  # We recommend `pull_request_target` so that github secrets are available.
  # In `pull_request` we wouldn't be able to change labels of fork PRs
--- a/.gitignore
+++ b/.gitignore
@@ -31,8 +31,6 @@ __pycache__/
 build/
 develop-eggs/
 dist/
-/plugins/
-plugins_config.yaml
 downloads/
 eggs/
 .eggs/
@@ -166,4 +164,4 @@ agbenchmark/reports/

 # Nodejs
 package-lock.json
-package.json
+package.json
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "autogpts/autogpt/tests/vcr_cassettes"]
+	path = autogpts/autogpt/tests/vcr_cassettes
+	url = https://github.com/Significant-Gravitas/Auto-GPT-test-cassettes
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
+install:
+	@echo "Installing dependencies..."
+	@command -v poetry >/dev/null 2>&1 || { echo >&2 "Poetry not found, installing..."; curl -sSL https://install.python-poetry.org | python3 - ; }
+	poetry install
+
+list_agents:
+	@echo "Listing all agents in autogpts..."
+	@for agent in $$(ls autogpts); do \
+		echo \\t$$agent; \
+	done
+	@echo \\t"forge"
+
+
+benchmark_%:
+	@echo "Running benchmark for $*"
+	poetry run sh -c 'export PYTHONPATH=$$PYTHONPATH:./benchmark:./autogpts/$*; echo $$PYTHONPATH; python -m benchmark start --agent-config autogpts/$*/benchmark_config.json'
+	
+
+run:
+	python main.py
+
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Auto-GPT: An Autonomous GPT-4 Experiment
 [![Official Website](https://img.shields.io/badge/Official%20Website-agpt.co-blue?style=flat&logo=world&logoColor=white)](https://agpt.co)
-[![Unit Tests](https://img.shields.io/github/actions/workflow/status/Significant-Gravitas/Auto-GPT/ci.yml?label=unit%20tests)](https://github.com/Significant-Gravitas/Auto-GPT/actions/workflows/ci.yml)
+[![Unit Tests](https://img.shields.io/github/actions/workflow/status/Significant-Gravitas/Auto-GPT/autogpt-ci.yml?label=Auto-GPT%20unit%20tests)](https://github.com/Significant-Gravitas/Auto-GPT/actions/workflows/autogpt-ci.yml)
 [![Discord Follow](https://dcbadge.vercel.app/api/server/autogpt?style=flat)](https://discord.gg/autogpt)
 [![GitHub Repo stars](https://img.shields.io/github/stars/Significant-Gravitas/auto-gpt?style=social)](https://github.com/Significant-Gravitas/Auto-GPT/stargazers)
 [![Twitter Follow](https://img.shields.io/twitter/follow/siggravitas?style=social)](https://twitter.com/SigGravitas)
--- a/autogpts/autogpt/.gitattributes
+++ b/autogpts/autogpt/.gitattributes
@@ -1,5 +1,5 @@
 # Exclude VCR cassettes from stats
-tests/Auto-GPT-test-cassettes/**/**.y*ml linguist-generated
+tests/vcr_cassettes/**/**.y*ml linguist-generated

 # Mark documentation as such
 docs/**.md linguist-documentation
--- a/autogpts/autogpt/.gitignore
+++ b/autogpts/autogpt/.gitignore
@@ -31,7 +31,7 @@ __pycache__/
 build/
 develop-eggs/
 dist/
-/plugins/
+/plugins/*
 plugins_config.yaml
 downloads/
 eggs/
@@ -166,4 +166,7 @@ agbenchmark/reports/

 # Nodejs
 package-lock.json
-package.json
+package.json
+
+# Keep
+!.keep
--- a/autogpts/autogpt/.gitmodules
+++ b/autogpts/autogpt/.gitmodules
@@ -1,4 +0,0 @@
-[submodule "tests/Auto-GPT-test-cassettes"]
-    path = tests/Auto-GPT-test-cassettes
-    url = https://github.com/Significant-Gravitas/Auto-GPT-test-cassettes
-    branch = master
--- a/autogpts/autogpt/README.md
+++ b/autogpts/autogpt/README.md
@@ -1,6 +1,6 @@
 # Auto-GPT: An Autonomous GPT-4 Experiment
 [![Official Website](https://img.shields.io/badge/Official%20Website-agpt.co-blue?style=flat&logo=world&logoColor=white)](https://agpt.co)
-[![Unit Tests](https://img.shields.io/github/actions/workflow/status/Significant-Gravitas/Auto-GPT/ci.yml?label=unit%20tests)](https://github.com/Significant-Gravitas/Auto-GPT/actions/workflows/ci.yml)
+[![Unit Tests](https://img.shields.io/github/actions/workflow/status/Significant-Gravitas/Auto-GPT/autogpt-ci.yml?label=unit%20tests)](https://github.com/Significant-Gravitas/Auto-GPT/actions/workflows/autogpt-ci.yml)
 [![Discord Follow](https://dcbadge.vercel.app/api/server/autogpt?style=flat)](https://discord.gg/autogpt)
 [![GitHub Repo stars](https://img.shields.io/github/stars/Significant-Gravitas/auto-gpt?style=social)](https://github.com/Significant-Gravitas/Auto-GPT/stargazers)
 [![Twitter Follow](https://img.shields.io/twitter/follow/siggravitas?style=social)](https://twitter.com/SigGravitas)
--- a/autogpts/autogpt/autogpt/app/main.py
+++ b/autogpts/autogpt/autogpt/app/main.py
@@ -26,12 +26,11 @@ from autogpt.commands import COMMAND_CATEGORIES
 from autogpt.config import AIConfig, Config, ConfigBuilder, check_openai_api_key
 from autogpt.llm.api_manager import ApiManager
 from autogpt.logs.config import configure_chat_plugins, configure_logging
-from autogpt.logs.helpers import print_attribute
+from autogpt.logs.helpers import print_attribute, speak
 from autogpt.memory.vector import get_memory
 from autogpt.models.command_registry import CommandRegistry
 from autogpt.plugins import scan_plugins
 from autogpt.prompts.prompt import DEFAULT_TRIGGERING_PROMPT
-from autogpt.speech import say_text
 from autogpt.workspace import Workspace
 from scripts.install_plugin_deps import install_plugin_dependencies

@@ -366,7 +365,7 @@ def update_user(
    print_assistant_thoughts(ai_config.ai_name, assistant_reply_dict, config)

    if config.speak_mode:
-        say_text(f"I want to execute {command_name}", config)
+        speak(f"I want to execute {command_name}")

    # First log new-line so user can differentiate sections better in console
    print()
@@ -531,8 +530,6 @@ def print_assistant_thoughts(
    assistant_reply_json_valid: dict,
    config: Config,
 ) -> None:
-    from autogpt.speech import say_text
-
    logger = logging.getLogger(__name__)

    assistant_thoughts_reasoning = None
@@ -577,7 +574,7 @@ def print_assistant_thoughts(
    # Speak the assistant's thoughts
    if assistant_thoughts_speak:
        if config.speak_mode:
-            say_text(assistant_thoughts_speak, config)
+            speak(assistant_thoughts_speak)
        else:
            print_attribute("SPEAK", assistant_thoughts_speak, title_color=Fore.YELLOW)

--- a/autogpts/autogpt/autogpt/app/setup.py
+++ b/autogpts/autogpt/autogpt/app/setup.py
@@ -216,7 +216,7 @@ def generate_aiconfig_automatic(user_prompt: str, config: Config) -> AIConfig:
    # Call LLM with the string as user input
    output = create_chat_completion(
        ChatSequence.for_model(
-            config.fast_llm,
+            config.smart_llm,
            [
                Message("system", system_prompt),
                Message("user", prompt_ai_config_automatic),
--- a/autogpts/autogpt/autogpt/app/utils.py
+++ b/autogpts/autogpt/autogpt/app/utils.py
@@ -61,7 +61,7 @@ def clean_input(config: Config, prompt: str = ""):
 def get_bulletin_from_web():
    try:
        response = requests.get(
-            "https://raw.githubusercontent.com/Significant-Gravitas/Auto-GPT/master/BULLETIN.md"
+            "https://raw.githubusercontent.com/Significant-Gravitas/Auto-GPT/master/autogpts/autogpt/BULLETIN.md"
        )
        if response.status_code == 200:
            return response.text
--- a/autogpts/autogpt/autogpt/command_decorator.py
+++ b/autogpts/autogpt/autogpt/command_decorator.py
@@ -30,7 +30,7 @@ def command(
 ) -> Callable[..., CommandOutput]:
    """The command decorator is used to create Command objects from ordinary functions."""

-    def decorator(func: Callable[..., CommandOutput]) -> Command:
+    def decorator(func: Callable[..., CommandOutput]):
        typed_parameters = [
            CommandParameter(
                name=param_name,
@@ -55,8 +55,7 @@ def command(
        def wrapper(*args, **kwargs) -> Any:
            return func(*args, **kwargs)

-        wrapper.command = cmd
-
+        setattr(wrapper, "command", cmd)
        setattr(wrapper, AUTO_GPT_COMMAND_IDENTIFIER, True)

        return wrapper
--- a/autogpts/autogpt/autogpt/commands/execute_code.py
+++ b/autogpts/autogpt/autogpt/commands/execute_code.py
@@ -79,14 +79,22 @@ def execute_python_code(code: str, agent: Agent) -> str:
            "description": "The name of te file to execute",
            "required": True,
        },
+        "args": {
+            "type": "list[str]",
+            "description": "The (command line) arguments to pass to the script",
+            "required": False,
+        },
    },
 )
@sanitize_path_arg("filename")
-def execute_python_file(filename: Path, agent: Agent) -> str:
+def execute_python_file(
+    filename: Path, agent: Agent, args: list[str] | str = []
+) -> str:
    """Execute a Python file in a Docker container and return the output

    Args:
        filename (Path): The name of the file to execute
+        args (list, optional): The arguments with which to run the python script

    Returns:
        str: The output of the file
@@ -95,6 +103,9 @@ def execute_python_file(filename: Path, agent: Agent) -> str:
        f"Executing python file '{filename}' in working directory '{agent.config.workspace_path}'"
    )

+    if isinstance(args, str):
+        args = args.split()  # Convert space-separated string to a list
+
    if not str(filename).endswith(".py"):
        raise InvalidArgumentError("Invalid file type. Only .py files are allowed.")

@@ -110,7 +121,7 @@ def execute_python_file(filename: Path, agent: Agent) -> str:
            f"Auto-GPT is running in a Docker container; executing {file_path} directly..."
        )
        result = subprocess.run(
-            ["python", "-B", str(file_path)],
+            ["python", "-B", str(file_path)] + args,
            capture_output=True,
            encoding="utf8",
            cwd=str(agent.workspace.root),
@@ -152,7 +163,7 @@ def execute_python_file(filename: Path, agent: Agent) -> str:
                "python",
                "-B",
                file_path.relative_to(agent.workspace.root).as_posix(),
-            ],
+            ] + args,
            volumes={
                str(agent.workspace.root): {
                    "bind": "/workspace",
--- a/autogpts/autogpt/autogpt/logs/config.py
+++ b/autogpts/autogpt/autogpt/logs/config.py
@@ -27,6 +27,7 @@ DEBUG_LOG_FORMAT = (
    "  %(title)s%(message)s"
 )

+SPEECH_OUTPUT_LOGGER = "VOICE"
 USER_FRIENDLY_OUTPUT_LOGGER = "USER_FRIENDLY_OUTPUT"

 _chat_plugins: list[AutoGPTPluginTemplate] = []
@@ -96,6 +97,11 @@ def configure_logging(config: Config, log_dir: Path = LOG_DIR) -> None:
    user_friendly_output_logger.addHandler(stderr)
    user_friendly_output_logger.propagate = False

+    speech_output_logger = logging.getLogger(SPEECH_OUTPUT_LOGGER)
+    speech_output_logger.setLevel(logging.INFO)
+    speech_output_logger.addHandler(TTSHandler(config))
+    speech_output_logger.propagate = False
+
    # JSON logger with better formatting
    json_logger = logging.getLogger("JSON_LOGGER")
    json_logger.setLevel(logging.DEBUG)
--- a/autogpts/autogpt/autogpt/logs/handlers.py
+++ b/autogpts/autogpt/autogpt/logs/handlers.py
@@ -8,7 +8,7 @@ import time
 from typing import TYPE_CHECKING

 from autogpt.logs.utils import remove_color_codes
-from autogpt.speech.say import say_text
+from autogpt.speech import TextToSpeechProvider

 if TYPE_CHECKING:
    from autogpt.config import Config
@@ -53,6 +53,7 @@ class TTSHandler(logging.Handler):
    def __init__(self, config: Config):
        super().__init__()
        self.config = config
+        self.tts_provider = TextToSpeechProvider(config)

    def format(self, record: logging.LogRecord) -> str:
        if getattr(record, "title", ""):
@@ -67,7 +68,7 @@ class TTSHandler(logging.Handler):
            return

        message = self.format(record)
-        say_text(message, self.config)
+        self.tts_provider.say(message)


 class JsonFileHandler(logging.FileHandler):
--- a/autogpts/autogpt/autogpt/logs/helpers.py
+++ b/autogpts/autogpt/autogpt/logs/helpers.py
@@ -3,7 +3,7 @@ from typing import Any, Optional

 from colorama import Fore

-from .config import USER_FRIENDLY_OUTPUT_LOGGER, _chat_plugins
+from .config import SPEECH_OUTPUT_LOGGER, USER_FRIENDLY_OUTPUT_LOGGER, _chat_plugins


 def user_friendly_output(
@@ -65,3 +65,7 @@ def request_user_double_check(additionalText: Optional[str] = None) -> None:
        title="DOUBLE CHECK CONFIGURATION",
        preserve_message_color=True,
    )
+
+
+def speak(message: str, level: int = logging.INFO) -> None:
+    logging.getLogger(SPEECH_OUTPUT_LOGGER).log(level, message)
--- a/autogpts/autogpt/autogpt/singleton.py
+++ b/autogpts/autogpt/autogpt/singleton.py
@@ -14,9 +14,3 @@ class Singleton(abc.ABCMeta, type):
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._instances[cls]
-
-
-class AbstractSingleton(abc.ABC, metaclass=Singleton):
-    """
-    Abstract singleton class for ensuring only one instance of a class.
-    """
--- a/autogpts/autogpt/autogpt/speech/init.py
+++ b/autogpts/autogpt/autogpt/speech/init.py
@@ -1,4 +1,4 @@
 """This module contains the speech recognition and speech synthesis functions."""
-from autogpt.speech.say import say_text
+from autogpt.speech.say import TextToSpeechProvider

-__all__ = ["say_text"]
+__all__ = ["TextToSpeechProvider"]
--- a/autogpts/autogpt/autogpt/speech/base.py
+++ b/autogpts/autogpt/autogpt/speech/base.py
@@ -9,10 +9,8 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
    from autogpt.config import Config

-from autogpt.singleton import AbstractSingleton

-
-class VoiceBase(AbstractSingleton):
+class VoiceBase:
    """
    Base class for all voice classes.
    """
--- a/autogpts/autogpt/autogpt/speech/say.py
+++ b/autogpts/autogpt/autogpt/speech/say.py
@@ -19,32 +19,37 @@ _QUEUE_SEMAPHORE = Semaphore(
 )  # The amount of sounds to queue before blocking the main thread


-def say_text(text: str, config: Config, voice_index: int = 0) -> None:
-    """Speak the given text using the given voice index"""
-    default_voice_engine, voice_engine = _get_voice_engine(config)
+class TextToSpeechProvider:
+    def __init__(self, config: Config):
+        self._config = config
+        self._default_voice_engine, self._voice_engine = self._get_voice_engine(config)

-    def speak() -> None:
-        success = voice_engine.say(text, voice_index)
-        if not success:
-            default_voice_engine.say(text)
+    def say(self, text, voice_index: int = 0) -> None:
+        def _speak() -> None:
+            success = self._voice_engine.say(text, voice_index)
+            if not success:
+                self._default_voice_engine.say(text, voice_index)
+            _QUEUE_SEMAPHORE.release()

-        _QUEUE_SEMAPHORE.release()
+        if self._config.speak_mode:
+            _QUEUE_SEMAPHORE.acquire(True)
+            thread = threading.Thread(target=_speak)
+            thread.start()

-    _QUEUE_SEMAPHORE.acquire(True)
-    thread = threading.Thread(target=speak)
-    thread.start()
+    def __repr__(self):
+        return f"{self.__class__.__name__}(enabled={self._config.speak_mode}, provider={self._voice_engine.__class__.__name__})"

+    @staticmethod
+    def _get_voice_engine(config: Config) -> tuple[VoiceBase, VoiceBase]:
+        """Get the voice engine to use for the given configuration"""
+        tts_provider = config.text_to_speech_provider
+        if tts_provider == "elevenlabs":
+            voice_engine = ElevenLabsSpeech(config)
+        elif tts_provider == "macos":
+            voice_engine = MacOSTTS(config)
+        elif tts_provider == "streamelements":
+            voice_engine = StreamElementsSpeech(config)
+        else:
+            voice_engine = GTTSVoice(config)

-def _get_voice_engine(config: Config) -> tuple[VoiceBase, VoiceBase]:
-    """Get the voice engine to use for the given configuration"""
-    tts_provider = config.text_to_speech_provider
-    if tts_provider == "elevenlabs":
-        voice_engine = ElevenLabsSpeech(config)
-    elif tts_provider == "macos":
-        voice_engine = MacOSTTS(config)
-    elif tts_provider == "streamelements":
-        voice_engine = StreamElementsSpeech(config)
-    else:
-        voice_engine = GTTSVoice(config)
-
-    return GTTSVoice(config), voice_engine
+        return GTTSVoice(config), voice_engine
--- a/benchmark/agbenchmark/challenges/init.py
+++ b/benchmark/agbenchmark/challenges/init.py
--- a/autogpts/autogpt/tests/integration/test_execute_code.py
+++ b/autogpts/autogpt/tests/integration/test_execute_code.py
@@ -2,6 +2,7 @@ import os
 import random
 import string
 import tempfile
+from pathlib import Path

 import pytest

@@ -21,12 +22,22 @@ def random_code(random_string) -> str:


@pytest.fixture
-def python_test_file(config: Config, random_code: str) -> str:
+def python_test_file(config: Config, random_code: str):
    temp_file = tempfile.NamedTemporaryFile(dir=config.workspace_path, suffix=".py")
    temp_file.write(str.encode(random_code))
    temp_file.flush()

-    yield temp_file.name
+    yield Path(temp_file.name)
+    temp_file.close()
+
+
+@pytest.fixture
+def python_test_args_file(config: Config):
+    temp_file = tempfile.NamedTemporaryFile(dir=config.workspace_path, suffix=".py")
+    temp_file.write(str.encode("import sys\nprint(sys.argv[1], sys.argv[2])"))
+    temp_file.flush()
+
+    yield Path(temp_file.name)
    temp_file.close()


@@ -35,34 +46,23 @@ def random_string():
    return "".join(random.choice(string.ascii_lowercase) for _ in range(10))


-def test_execute_python_file(python_test_file: str, random_string: str, agent: Agent):
+def test_execute_python_file(python_test_file: Path, random_string: str, agent: Agent):
    result: str = sut.execute_python_file(python_test_file, agent=agent)
    assert result.replace("\r", "") == f"Hello {random_string}!\n"


-def test_execute_python_code(random_code: str, random_string: str, agent: Agent):
-    ai_name = agent.ai_config.ai_name
-
-    result: str = sut.execute_python_code(random_code, "test_code", agent=agent)
-    assert result.replace("\r", "") == f"Hello {random_string}!\n"
-
-    # Check that the code is stored
-    destination = os.path.join(
-        agent.config.workspace_path, ai_name, "executed_code", "test_code.py"
-    )
-    with open(destination) as f:
-        assert f.read() == random_code
-
-
-def test_execute_python_code_disallows_name_arg_path_traversal(
-    random_code: str, agent: Agent
+def test_execute_python_file_args(
+    python_test_args_file: Path, random_string: str, agent: Agent
 ):
-    with pytest.raises(AccessDeniedError, match="path traversal"):
-        sut.execute_python_code(random_code, name="../../test_code", agent=agent)
+    args = [random_string] * 2
+    expected_output = " ".join(args) + "\n"
+    result = sut.execute_python_file(python_test_args_file, agent=agent, args=args)
+    assert result == expected_output

-    # Check that the code is not stored in parent directory
-    dst_with_traversal = agent.workspace.get_path("test_code.py")
-    assert not dst_with_traversal.is_file(), "Path traversal by filename not prevented"
+
+def test_execute_python_code(random_code: str, random_string: str, agent: Agent):
+    result: str = sut.execute_python_code(random_code, agent=agent)
+    assert result.replace("\r", "") == f"Hello {random_string}!\n"


 def test_execute_python_code_overwrites_file(random_code: str, agent: Agent):
@@ -75,7 +75,7 @@ def test_execute_python_code_overwrites_file(random_code: str, agent: Agent):
    with open(destination, "w+") as f:
        f.write("This will be overwritten")

-    sut.execute_python_code(random_code, "test_code.py", agent=agent)
+    sut.execute_python_code(random_code, agent=agent)

    # Check that the file is updated with the new code
    with open(destination) as f:
--- a/autogpts/autogpt/tests/unit/test_config.py
+++ b/autogpts/autogpt/tests/unit/test_config.py
@@ -21,7 +21,7 @@ def test_initial_values(config: Config) -> None:
    assert config.debug_mode == False
    assert config.continuous_mode == False
    assert config.speak_mode == False
-    assert config.fast_llm == "gpt-3.5-turbo"
+    assert config.fast_llm == "gpt-3.5-turbo-16k"
    assert config.smart_llm == "gpt-4-0314"


--- a/autogpts/autogpt/tests/unit/test_file_operations.py
+++ b/autogpts/autogpt/tests/unit/test_file_operations.py
@@ -245,10 +245,9 @@ def test_write_file_succeeds_if_content_different(
    test_file_with_content_path: Path, agent: Agent
 ):
    new_content = "This is different content.\n"
-    result = file_ops.write_to_file(
+    file_ops.write_to_file(
        str(test_file_with_content_path), new_content, agent=agent
    )
-    assert result == "File written to successfully."


 def test_append_to_file(test_nested_file: Path, agent: Agent):
@@ -301,7 +300,7 @@ def test_list_files(workspace: Workspace, test_directory: Path, agent: Agent):
    with open(os.path.join(test_directory, file_a.name), "w") as f:
        f.write("This is file A in the subdirectory.")

-    files = file_ops.list_files(str(workspace.root), agent=agent)
+    files = file_ops.list_folder(str(workspace.root), agent=agent)
    assert file_a.name in files
    assert file_b.name in files
    assert os.path.join(Path(test_directory).name, file_a.name) in files
@@ -314,5 +313,5 @@ def test_list_files(workspace: Workspace, test_directory: Path, agent: Agent):

    # Case 2: Search for a file that does not exist and make sure we don't throw
    non_existent_file = "non_existent_file.txt"
-    files = file_ops.list_files("", agent=agent)
+    files = file_ops.list_folder("", agent=agent)
    assert non_existent_file not in files
--- a/autogpts/autogpt/tests/unit/test_utils.py
+++ b/autogpts/autogpt/tests/unit/test_utils.py
@@ -86,7 +86,7 @@ def test_get_bulletin_from_web_success(mock_get):

    assert expected_content in bulletin
    mock_get.assert_called_with(
-        "https://raw.githubusercontent.com/Significant-Gravitas/Auto-GPT/master/BULLETIN.md"
+        "https://raw.githubusercontent.com/Significant-Gravitas/Auto-GPT/master/autogpts/autogpt/BULLETIN.md"
    )


--- a/autogpts/autogpt/tests/vcr/init.py
+++ b/autogpts/autogpt/tests/vcr/init.py
@@ -47,7 +47,7 @@ def get_base_vcr_config(request):
@pytest.fixture()
 def vcr_cassette_dir(request):
    test_name = os.path.splitext(request.node.name)[0]
-    return os.path.join("tests/Auto-GPT-test-cassettes", test_name)
+    return os.path.join("tests/vcr_cassettes", test_name)


 def patch_api_base(requestor: openai.api_requestor.APIRequestor):
--- a/autogpts/autogpt/tests/vcr_cassettes
+++ b/autogpts/autogpt/tests/vcr_cassettes
--- a/benchmark/agents_to_benchmark.json
+++ b/benchmark/agents_to_benchmark.json
@@ -1,38 +1,47 @@
 {
    "Auto-GPT": {
        "url": "https://github.com/Significant-Gravitas/Auto-GPT",
-        "branch": "master"
+        "branch": "master",
+        "commit": "3a2d08fb415071cc94dd6fcee24cfbdd1fb487dd"
    },
    "gpt-engineer": {
        "url": "https://github.com/merwanehamadi/gpt-engineer.git",
-        "branch": "benchmark-integration"
+        "branch": "benchmark-integration",
+        "commit": "9bb81041ace9f09e8ea0e34e29f2e46bb9d46a36"
    },
    "mini-agi": {
        "url": "https://github.com/SilenNaihin/mini-agi.git",
-        "branch": "benchmark-integration"
+        "branch": "benchmark-integration",
+        "commit": "2fc70aa0032eec986dfb1020854a1b3b8aaf6780"
    },
    "smol-developer": {
        "url": "https://github.com/e2b-dev/smol-developer.git",
-        "branch": "benchmarks"
+        "branch": "benchmarks",
+        "commit": "a23d01369cea976e80b7889fdbf1096619471301"
    },
    "SuperAGI": {
        "url": "https://github.com/SilenNaihin/SuperAGI.git",
-        "branch": "benchmark-integration"
+        "branch": "benchmark-integration",
+        "commit": "48b2101374264b97dbdfc2c0bb0ae45e769e157d"
    },
-    "BabyAGI": {
+    "babyagi": {
        "url": "https://github.com/SilenNaihin/babyagi.git",
-        "branch": "benchmark-integration"
+        "branch": "benchmark-integration",
+        "commit": "16f1b9519fea5543695203be0262a1b41c77cbba"
    },
    "beebot": {
        "url": "https://github.com/AutoPackAI/beebot.git",
-        "branch": "main"
+        "branch": "main",
+        "commit": "59d4e93c133612a0319d135bb0eb08bbcead9fa2"
    },
    "PolyGPT": {
        "url": "https://github.com/polywrap/PolyGPT.git",
-        "branch": "nerfzael-use-local-wrap-library"
+        "branch": "nerfzael-use-local-wrap-library",
+        "commit": "d621adf5f54cc0f9a6d191139fb67ac3d1436d7b"
    },
-    "Turbo": {
+    "Auto-GPT-Turbo": {
        "url": "https://github.com/lc0rp/Auto-GPT-Turbo.git",
-        "branch": "main"
+        "branch": "main",
+        "commit": "8469e09ae204f2d5f41d489b217551544597ee14"
    }
 }
--- a/benchmark/backend/main.py
+++ b/benchmark/backend/main.py
@@ -9,11 +9,10 @@ from typing import Any
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


+from agbenchmark.utils.utils import find_absolute_benchmark_path
 from fastapi import FastAPI, Query
 from fastapi.middleware.cors import CORSMiddleware

-from agbenchmark.utils.utils import find_absolute_benchmark_path
-
 app = FastAPI()

 origins = ["http://localhost:3000"]
--- a/benchmark/agbenchmark/README.md
+++ b/benchmark/agbenchmark/README.md
--- a/benchmark/benchmark/init.py
+++ b/benchmark/benchmark/init.py
@@ -0,0 +1,5 @@
+# import pydevd_pycharm
+
+# pydevd_pycharm.settrace(
+#     "localhost", port=9739, stdoutToServer=True, stderrToServer=True
+# )
--- a/benchmark/benchmark/main.py
+++ b/benchmark/benchmark/main.py
@@ -0,0 +1,285 @@
+import glob
+import json
+import os
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Optional
+
+import click
+import pytest
+import toml
+from helicone.lock import HeliconeLockManager
+
+from benchmark.utils.data_types import AgentBenchmarkConfig
+
+# UTC timestamp taken once, when this module is imported, rendered with an
+# explicit "+00:00" offset suffix.
+BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
+
+# Record the start time as a Helicone custom property, but only when the
+# HELICONE_API_KEY environment variable is set to a non-empty value.
+if os.environ.get("HELICONE_API_KEY"):
+    HeliconeLockManager.write_custom_property(
+        "benchmark_start_time", BENCHMARK_START_TIME
+    )
+
+# Read the "optional_categories" entry from challenges/optional_categories.json,
+# resolved relative to this file rather than the current working directory.
+with open(
+    Path(__file__).resolve().parent / "challenges" / "optional_categories.json"
+) as f:
+    OPTIONAL_CATEGORIES = json.load(f)["optional_categories"]
+
+
+def get_unique_categories() -> set[str]:
+    """Collect the unique challenge categories declared next to this module.
+
+    Recursively finds every ``data.json`` below the ``challenges`` directory
+    that sits beside this file and merges the entries of each file's
+    ``"category"`` field (a missing field contributes nothing).
+
+    Files that cannot be opened, read, or parsed as JSON are reported on stdout
+    and skipped.
+
+    Returns:
+        The set of all categories found; empty when no file matches.
+    """
+    categories: set[str] = set()
+
+    # Anchor the pattern to this file's directory so the result does not depend
+    # on the caller's working directory.
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    glob_path = os.path.join(this_dir, "./challenges/**/data.json")
+
+    for data_file in glob.glob(glob_path, recursive=True):
+        # open() lives inside the try so an unreadable file is skipped just like
+        # a malformed one instead of aborting the whole scan.
+        try:
+            with open(data_file, "r") as f:
+                data = json.load(f)
+        except json.JSONDecodeError:
+            print(f"Error: {data_file} is not a valid JSON file.")
+            continue
+        except IOError:
+            print(f"IOError: file could not be read: {data_file}")
+            continue
+        categories.update(data.get("category", []))
+
+    return categories
+
+
+def run_benchmark(
+    agent_benchmark_config_path: str,
+    maintain: bool = False,
+    improve: bool = False,
+    explore: bool = False,
+    mock: bool = False,
+    no_dep: bool = False,
+    nc: bool = False,
+    category: Optional[list[str]] = None,
+    skip_category: Optional[list[str]] = None,
+    test: Optional[str] = None,
+    suite: Optional[str] = None,
+    cutoff: Optional[int] = None,
+    server: bool = False,
+) -> int:
+    """Validate the options, build the pytest argument list and run the benchmark.
+
+    Args:
+        agent_benchmark_config_path: Path of the agent's benchmark config JSON
+            file; it is opened here and also forwarded as ``--agent_config_path``.
+        maintain: Run only regression tests (forwards ``--maintain``).
+        improve: Run only non-regression tests (forwards ``--improve``).
+        explore: Only attempt challenges that have never been beaten
+            (forwards ``--explore``).
+        mock: Forward ``--mock``.
+        no_dep: Forward ``--no_dep``.
+        nc: Forward ``--nc``; cannot be combined with ``cutoff``.
+        category: Categories to select with ``-m``; each must be one of the
+            categories returned by ``get_unique_categories``.
+        skip_category: Categories to leave out of the selection.
+        test: Single test to select with ``-k``; cannot be combined with the
+            category, suite or run-mode options.
+        suite: Suite to run; cannot be combined with the category or run-mode
+            options.
+        cutoff: Cutoff override in seconds; cannot be combined with ``nc``.
+        server: Not used by this function.
+
+    Returns:
+        ``1`` when the config file is not valid JSON or the option combination
+        is rejected, otherwise whatever ``pytest.main`` returns.
+
+    Raises:
+        AssertionError: If ``api_mode`` is set without ``host`` in the config,
+            if an unknown category is requested, or if every category would be
+            skipped.
+    """
+    # Load the agent configuration. Only malformed JSON is handled here; a
+    # missing file still raises from open().
+    try:
+        with open(agent_benchmark_config_path, "r") as f:
+            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
+    except json.JSONDecodeError:
+        print("Error: benchmark_config.json is not a valid JSON file.")
+        return 1
+
+    # The three run modes are mutually exclusive, so reject any combination of
+    # two or more of them (not only all three at once).
+    if sum(bool(mode) for mode in (maintain, improve, explore)) > 1:
+        print(
+            "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
+        )
+        return 1
+
+    if test and (category or skip_category or maintain or improve or suite or explore):
+        print(
+            "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
+        )
+        return 1
+
+    # TODO: test and ensure that this functionality works before removing
+    # change elif suite below if removing
+    if suite and (category or skip_category or maintain or improve or explore):
+        print(
+            "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite."
+        )
+        return 1
+
+    assert not (
+        agent_benchmark_config.api_mode and not agent_benchmark_config.host
+    ), "Error: host needs to be added to the config if api_mode is set to True."
+
+    print("Current configuration:")
+    for key, value in vars(agent_benchmark_config).items():
+        print(f"{key}: {value}")
+
+    pytest_args = ["-vs"]
+    pytest_args.extend(["--agent_config_path", agent_benchmark_config_path])
+    if test:
+        print("Running specific test:", test)
+        pytest_args.extend(["-k", test, "--test"])
+    elif suite:
+        # NOTE(review): only the bare flag is forwarded, not the suite name;
+        # confirm the pytest plugin obtains the name some other way.
+        print("Running specific suite:", suite)
+        pytest_args.extend(["--suite"])
+    else:
+        # Categories that are used in the challenges
+        categories = get_unique_categories()
+
+        if category:
+            invalid_categories = set(category) - categories
+            assert (
+                not invalid_categories
+            ), f"Invalid categories: {invalid_categories}. Valid categories are: {categories}"
+
+            categories_to_run = set(category)
+            if skip_category:
+                categories_to_run = categories_to_run.difference(set(skip_category))
+                assert categories_to_run, "Error: You can't skip all categories"
+            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
+            print("Running tests of category:", categories_to_run)
+        elif skip_category:
+            categories_to_run = categories - set(skip_category)
+            assert categories_to_run, "Error: You can't skip all categories"
+            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
+            print("Running tests of category:", categories_to_run)
+        else:
+            print("Running all categories")
+
+        if maintain:
+            print("Running only regression tests")
+            pytest_args.append("--maintain")
+        elif improve:
+            print("Running only non-regression tests")
+            pytest_args.append("--improve")
+        elif explore:
+            print("Only attempt challenges that have never been beaten")
+            pytest_args.append("--explore")
+
+    if mock:
+        pytest_args.append("--mock")
+
+    if no_dep:
+        pytest_args.append("--no_dep")
+
+    if nc and cutoff:
+        print(
+            "Error: You can't use both --nc and --cutoff at the same time. Please choose one."
+        )
+        return 1
+
+    if nc:
+        pytest_args.append("--nc")
+    if cutoff:
+        # NOTE(review): only the bare flag is forwarded, not the cutoff value;
+        # confirm the pytest plugin reads the value from elsewhere.
+        pytest_args.append("--cutoff")
+        print(f"Setting cuttoff override to {cutoff} seconds.")
+    current_dir = Path(__file__).resolve().parent
+    print(f"Current directory: {current_dir}")
+    pytest_args.extend((str(current_dir), "--cache-clear"))
+    return pytest.main(pytest_args)
+
+
+# Root click command group; it does nothing itself and only serves as the
+# object that the @cli.command() subcommands below are registered on.
+@click.group()
+def cli() -> None:
+    pass
+
+
+@cli.command()
+@click.option("--backend", is_flag=True, help="If it's being run from the cli")
+@click.option("-c", "--category", multiple=True, help="Specific category to run")
+@click.option(
+    "-s",
+    "--skip-category",
+    multiple=True,
+    help="Skips preventing the tests from this category from running",
+)
+@click.option("--test", help="Specific test to run")
+@click.option("--maintain", is_flag=True, help="Runs only regression tests")
+@click.option("--improve", is_flag=True, help="Run only non-regression tests")
+@click.option(
+    "--explore",
+    is_flag=True,
+    help="Only attempt challenges that have never been beaten",
+)
+@click.option("--mock", is_flag=True, help="Run with mock")
+@click.option("--suite", help="Run a suite of related tests")
+@click.option(
+    "--no_dep",
+    is_flag=True,
+    help="Run without dependencies (can be useful for a suite run)",
+)
+@click.option("--nc", is_flag=True, help="Run without cutoff")
+@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
+@click.option(
+    "--agent-config",
+    type=click.Path(exists=True),
+    help="Path to the agent benchmark_config.json file,",
+    required=True,
+)
+def start(
+    maintain: bool,
+    improve: bool,
+    explore: bool,
+    mock: bool,
+    no_dep: bool,
+    nc: bool,
+    agent_config: click.Path,
+    category: Optional[list[str]] = None,
+    skip_category: Optional[list[str]] = None,
+    test: Optional[str] = None,
+    suite: Optional[str] = None,
+    cutoff: Optional[int] = None,
+    backend: Optional[bool] = False,
+) -> Any:
+    """Run the benchmark for the agent described by --agent-config."""
+    # All options are handed to run_benchmark unchanged. Without --backend the
+    # process exits with run_benchmark's return value; with --backend stdout is
+    # written to backend/backend_stdout.txt and the function simply returns.
+    original_stdout = sys.stdout  # Save the original standard output
+    exit_code = None
+
+    assert (
+        "benchmark_config.json" in agent_config
+    ), "benchmark_config.json must be provided"
+
+    if backend:
+        with open("backend/backend_stdout.txt", "w") as f:
+            sys.stdout = f
+            try:
+                exit_code = run_benchmark(
+                    agent_benchmark_config_path=agent_config,
+                    maintain=maintain,
+                    improve=improve,
+                    explore=explore,
+                    mock=mock,
+                    no_dep=no_dep,
+                    nc=nc,
+                    category=category,
+                    skip_category=skip_category,
+                    test=test,
+                    suite=suite,
+                    cutoff=cutoff,
+                )
+            finally:
+                # Restore stdout even when the run raises, and do it before the
+                # file is closed so sys.stdout never points at a closed handle.
+                sys.stdout = original_stdout
+
+    else:
+        exit_code = run_benchmark(
+            agent_benchmark_config_path=agent_config,
+            maintain=maintain,
+            improve=improve,
+            explore=explore,
+            mock=mock,
+            no_dep=no_dep,
+            nc=nc,
+            category=category,
+            skip_category=skip_category,
+            test=test,
+            suite=suite,
+            cutoff=cutoff,
+        )
+
+        sys.exit(exit_code)
+
+
+@cli.command()
+def version():
+    """Print the version of the benchmark tool."""
+    # pyproject.toml is looked up one level above this module's directory.
+    pyproject_path = Path(__file__).resolve().parent / ".." / "pyproject.toml"
+    poetry_section = toml.load(pyproject_path)["tool"]["poetry"]
+    print(f"Benchmark Tool Version {poetry_section['version']}")
+
+
+# Invoke the click command group when this module is executed as a script.
+if __name__ == "__main__":
+    cli()
--- a/benchmark/agbenchmark/agent_api_interface.py
+++ b/benchmark/agbenchmark/agent_api_interface.py
@@ -5,8 +5,8 @@ from typing import Any, Dict, Optional

 from agent_protocol_client import AgentApi, ApiClient, Configuration, TaskRequestBody

-from agbenchmark.agent_interface import get_list_of_file_paths
-from agbenchmark.utils.data_types import ChallengeData
+from benchmark.agent_interface import get_list_of_file_paths
+from benchmark.utils.data_types import ChallengeData


 async def run_api_agent(
@@ -51,7 +51,6 @@ async def run_api_agent(

        artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
        for artifact in artifacts:
-
            if artifact.relative_path:
                folder_path = os.path.join(config["workspace"], artifact.relative_path)
            else:
--- a/benchmark/agbenchmark/agent_interface.py
+++ b/benchmark/agbenchmark/agent_interface.py
@@ -12,8 +12,6 @@ from typing import Any, List
 import psutil
 from dotenv import load_dotenv

-import agbenchmark.start_benchmark
-
 load_dotenv()

 helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS")
@@ -77,7 +75,7 @@ def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
 def run_agent(task: str, timeout: int) -> None:
    """Calling to get a response"""

-    entry_path = "agbenchmark.benchmarks"
+    entry_path = "benchmark.benchmarks"

    print(f"Running '{entry_path}' with timeout {timeout}")

@@ -87,7 +85,7 @@ def run_agent(task: str, timeout: int) -> None:
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
-        cwd=agbenchmark.start_benchmark.HOME_DIRECTORY,
+        cwd=benchmark.start_benchmark.HOME_DIRECTORY,
        bufsize=1,
    )

@@ -109,7 +107,7 @@ def get_list_of_file_paths(
 ) -> List[str]:
    # this file is at agbenchmark\agent_interface.py
    source_dir = os.path.join(
-        agbenchmark.start_benchmark.CURRENT_DIRECTORY,
+        benchmark.start_benchmark.CURRENT_DIRECTORY,
        "..",
        challenge_dir_path,
        artifact_folder_name,
--- a/benchmark/agbenchmark/app.py
+++ b/benchmark/agbenchmark/app.py
--- a/benchmark/agbenchmark/challenges/CHALLENGE.md
+++ b/benchmark/agbenchmark/challenges/CHALLENGE.md
--- a/benchmark/agbenchmark/challenges/README.md
+++ b/benchmark/agbenchmark/challenges/README.md
--- a/benchmark/agbenchmark/challenges/SUITES.md
+++ b/benchmark/agbenchmark/challenges/SUITES.md
--- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/init.py
+++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/init.py
--- a/benchmark/agbenchmark/challenges/abilities/read_file/artifacts_in/file_to_read.txt
+++ b/benchmark/agbenchmark/challenges/abilities/read_file/artifacts_in/file_to_read.txt
--- a/benchmark/agbenchmark/challenges/abilities/read_file/artifacts_out/file_to_check.txt
+++ b/benchmark/agbenchmark/challenges/abilities/read_file/artifacts_out/file_to_check.txt
--- a/benchmark/agbenchmark/challenges/abilities/read_file/artifacts_out/output.txt
+++ b/benchmark/agbenchmark/challenges/abilities/read_file/artifacts_out/output.txt
--- a/benchmark/agbenchmark/challenges/abilities/read_file/data.json
+++ b/benchmark/agbenchmark/challenges/abilities/read_file/data.json
--- a/benchmark/agbenchmark/challenges/abilities/write_file/artifacts_out/random_file.txt
+++ b/benchmark/agbenchmark/challenges/abilities/write_file/artifacts_out/random_file.txt
--- a/benchmark/agbenchmark/challenges/abilities/write_file/data.json
+++ b/benchmark/agbenchmark/challenges/abilities/write_file/data.json
--- a/benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/artifacts_in/instructions.txt
+++ b/benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/artifacts_in/instructions.txt
--- a/benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/artifacts_out/goal.txt
+++ b/benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/artifacts_out/goal.txt
--- a/benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/data.json
+++ b/benchmark/agbenchmark/challenges/alignment/goal_loss/1_distraction/data.json
--- a/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions.txt
+++ b/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions.txt
--- a/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions_2.txt
+++ b/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_in/instructions_2.txt
--- a/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_out/goal.txt
+++ b/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/artifacts_out/goal.txt
--- a/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/data.json
+++ b/benchmark/agbenchmark/challenges/alignment/goal_loss/2_injection/data.json
--- a/benchmark/agbenchmark/challenges/alignment/goal_loss/suite.json
+++ b/benchmark/agbenchmark/challenges/alignment/goal_loss/suite.json
--- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/init.py
+++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/init.py
--- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/sample_code.py
+++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/sample_code.py
--- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/test.py
+++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_in/test.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/init.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/init.py
--- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/sample_code.py
+++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/sample_code.py
--- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/test.py
+++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/artifacts_out/test.py
--- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json
+++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a1_debug/data.json
--- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/artifacts_out/random_file.txt
+++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/artifacts_out/random_file.txt
--- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json
+++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a2_tesla_revenue/data.json
--- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/artifacts_out/random_file.txt
+++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/artifacts_out/random_file.txt
--- a/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json
+++ b/benchmark/agbenchmark/challenges/deprecated/adapatability/a3_book_price/data.json
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/init.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/init.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/test.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_in/test.py
--- a/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/init.py
+++ b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/init.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/test.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/artifacts_out/test.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/data.json
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/1_return/data.json
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/init.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/init.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/test.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_in/test.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/init.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/init.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/test.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/artifacts_out/test.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/data.json
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/2_write/data.json
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/init.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/init.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/test.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_in/test.py
--- a/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/init.py
+++ b/benchmark/benchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/init.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/test.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/artifacts_out/test.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/data.json
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/3_modify/data.json
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/init.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/init.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/init.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c2_debug_suite/d2.1_guided/artifacts_in/init.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/custom_python/test.py
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/custom_python/test.py
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/data.json
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/4_tests/data.json
--- a/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/suite.json
+++ b/benchmark/agbenchmark/challenges/deprecated/code/c1_writing_suite_1/suite.json
--- a/Show More
+++ b/Show More