diff --git a/.github/workflows/autogpt-ci.yml b/.github/workflows/autogpt-ci.yml
index 375a433a..1701325d 100644
--- a/.github/workflows/autogpt-ci.yml
+++ b/.github/workflows/autogpt-ci.yml
@@ -4,16 +4,13 @@ on:
   push:
     branches: [ master, development, ci-test* ]
     paths:
+      - '.github/workflows/autogpt-ci.yml'
       - 'autogpts/autogpt/**'
       - '!autogpts/autogpt/tests/vcr_cassettes'
   pull_request:
     branches: [ master, development, release-* ]
     paths:
-      - 'autogpts/autogpt/**'
-      - '!autogpts/autogpt/tests/vcr_cassettes'
-  pull_request_target:
-    branches: [ master, development, release-*, ci-test* ]
-    paths:
+      - '.github/workflows/autogpt-ci.yml'
       - 'autogpts/autogpt/**'
       - '!autogpts/autogpt/tests/vcr_cassettes'
 
@@ -27,9 +24,6 @@ defaults:
 
 jobs:
   lint:
-    # eliminate duplicate runs
-    if: github.event_name == 'push' || (github.event.pull_request.head.repo.fork == (github.event_name == 'pull_request_target'))
-
     runs-on: ubuntu-latest
     env:
       min-python-version: "3.10"
@@ -39,8 +33,6 @@ jobs:
         uses: actions/checkout@v3
         with:
           fetch-depth: 0
-          ref: ${{ github.event.pull_request.head.ref }}
-          repository: ${{ github.event.pull_request.head.repo.full_name }}
 
       - name: Set up Python ${{ env.min-python-version }}
         uses: actions/setup-python@v4
@@ -83,17 +75,8 @@ jobs:
           poetry run $cmd --check || (echo "You have unused imports or pass statements, please run '${cmd} --in-place'" && exit 1)
 
   test:
-    # eliminate duplicate runs
-    if: github.event_name == 'push' || (github.event.pull_request.head.repo.fork == (github.event_name == 'pull_request_target'))
-
     permissions:
-      # Gives the action the necessary permissions for publishing new
-      # comments in pull requests.
-      pull-requests: write
-      # Gives the action the necessary permissions for pushing data to the
-      # python-coverage-comment-action branch, and for editing existing
-      # comments (to avoid publishing multiple comments in the same PR)
-      contents: write
+      contents: read
     runs-on: ubuntu-latest
     timeout-minutes: 30
     strategy:
@@ -105,8 +88,6 @@ jobs:
         uses: actions/checkout@v3
         with:
           fetch-depth: 0
-          ref: ${{ github.event.pull_request.head.ref }}
-          repository: ${{ github.event.pull_request.head.repo.full_name }}
           submodules: true
 
       - name: Configure git user Auto-GPT-Bot
@@ -116,9 +97,13 @@ jobs:
 
       - name: Checkout cassettes
         if: ${{ startsWith(github.event_name, 'pull_request') }}
+        env:
+          PR_BASE: ${{ github.event.pull_request.base.ref }}
+          PR_BRANCH: ${{ github.event.pull_request.head.ref }}
+          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
         run: |
-          cassette_branch="${{ github.event.pull_request.user.login }}-${{ github.event.pull_request.head.ref }}"
-          cassette_base_branch="${{ github.event.pull_request.base.ref }}"
+          cassette_branch="${PR_AUTHOR}-${PR_BRANCH}"
+          cassette_base_branch="${PR_BASE}"
           cd tests/vcr_cassettes
 
           if ! git ls-remote --exit-code --heads origin $cassette_base_branch ; then
@@ -169,10 +154,7 @@ jobs:
             tests/unit tests/integration
         env:
           CI: true
-          PROXY: ${{ github.event_name == 'pull_request_target' && secrets.PROXY || '' }}
-          AGENT_MODE: ${{ github.event_name == 'pull_request_target' && secrets.AGENT_MODE || '' }}
-          AGENT_TYPE: ${{ github.event_name == 'pull_request_target' && secrets.AGENT_TYPE || '' }}
-          OPENAI_API_KEY: ${{ github.event_name != 'pull_request_target' && secrets.OPENAI_API_KEY || '' }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           PLAIN_OUTPUT: True
 
       - name: Upload coverage reports to Codecov
@@ -198,11 +180,14 @@ jobs:
       - id: push_cassettes
         name: Push updated cassettes
         # For pull requests, push updated cassettes even when tests fail
-        if: github.event_name == 'push' || success() || failure()
+        if: github.event_name == 'push' || (! github.event.pull_request.head.repo.fork && (success() || failure()))
+        env:
+          PR_BRANCH: ${{ github.event.pull_request.head.ref }}
+          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
         run: |
           if [ "${{ startsWith(github.event_name, 'pull_request') }}" = "true" ]; then
             is_pull_request=true
-            cassette_branch="${{ github.event.pull_request.user.login }}-${{ github.event.pull_request.head.ref }}"
+            cassette_branch="${PR_AUTHOR}-${PR_BRANCH}"
           else
             cassette_branch="${{ github.ref_name }}"
           fi
@@ -234,20 +219,15 @@ jobs:
       - name: Apply "behaviour change" label and comment on PR
         if: ${{ startsWith(github.event_name, 'pull_request') }}
+        env:
+          # Pass the secret and PR number via env instead of templating them
+          # into the script; gh authenticates from GH_TOKEN without a login step.
+          GH_TOKEN: ${{ secrets.PAT_REVIEW }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
-          PR_NUMBER=${{ github.event.pull_request.number }}
-          TOKEN=${{ secrets.PAT_REVIEW }}
-          REPO=${{ github.repository }}
-
           if [[ "${{ steps.push_cassettes.outputs.updated }}" == "true" ]]; then
             echo "Adding label and comment..."
-            curl -X POST \
-            -H "Authorization: Bearer $TOKEN" \
-            -H "Accept: application/vnd.github.v3+json" \
-            https://api.github.com/repos/$REPO/issues/$PR_NUMBER/labels \
-            -d '{"labels":["behaviour change"]}'
-
-            echo $TOKEN | gh auth login --with-token
-            gh api repos/$REPO/issues/$PR_NUMBER/comments -X POST -F body="You changed AutoGPT's behaviour. The cassettes have been updated and will be merged to the submodule when this Pull Request gets merged."
+            gh issue edit "$PR_NUMBER" --add-label "behaviour change"
+            gh issue comment "$PR_NUMBER" --body "You changed AutoGPT's behaviour. The cassettes have been updated and will be merged to the submodule when this Pull Request gets merged."
           fi
 
       - name: Upload logs to artifact
diff --git a/.github/workflows/autogpt-docker-ci.yml b/.github/workflows/autogpt-docker-ci.yml
index a6f336a7..f9b8143e 100644
--- a/.github/workflows/autogpt-docker-ci.yml
+++ b/.github/workflows/autogpt-docker-ci.yml
@@ -4,11 +4,13 @@ on:
   push:
     branches: [ master, development ]
     paths:
+      - '.github/workflows/autogpt-docker-ci.yml'
       - 'autogpts/autogpt/**'
       - '!autogpts/autogpt/tests/vcr_cassettes'
   pull_request:
     branches: [ master, development, release-* ]
     paths:
+      - '.github/workflows/autogpt-docker-ci.yml'
       - 'autogpts/autogpt/**'
       - '!autogpts/autogpt/tests/vcr_cassettes'
 
diff --git a/.github/workflows/autogpts-ci.yml b/.github/workflows/autogpts-ci.yml
index 4f89fb7c..0385470a 100644
--- a/.github/workflows/autogpts-ci.yml
+++ b/.github/workflows/autogpts-ci.yml
@@ -1,4 +1,4 @@
-name: Valid AutoGPTs
+name: AutoGPTs smoke test CI
 
 on:
   workflow_dispatch:
@@ -6,8 +6,24 @@ on:
     - cron: '0 8 * * *'
   push:
     branches: [ master, development, ci-test* ]
+    paths:
+      - '.github/workflows/autogpts-ci.yml'
+      - 'autogpts/**'
+      - 'benchmark/**'
+      - 'run'
+      - 'cli.py'
+      - 'setup.py'
+      - '!**/*.md'
   pull_request:
     branches: [ master, development, release-* ]
+    paths:
+      - '.github/workflows/autogpts-ci.yml'
+      - 'autogpts/**'
+      - 'benchmark/**'
+      - 'run'
+      - 'cli.py'
+      - 'setup.py'
+      - '!**/*.md'
 
 jobs:
   run-tests:
@@ -17,13 +33,13 @@ jobs:
         agent-name: [ autogpt, forge ]
       fail-fast: false
     timeout-minutes: 20
+    env:
+      min-python-version: '3.10'
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
         with:
           fetch-depth: 0
-          ref: ${{ github.event.pull_request.head.ref }}
-          repository: ${{ github.event.pull_request.head.repo.full_name }}
           submodules: true
 
       - name: Set up Python ${{ env.min-python-version }}
diff --git a/.github/workflows/benchmark-ci.yml b/.github/workflows/benchmark-ci.yml
index fd8020d2..5f95dd02 100644
--- a/.github/workflows/benchmark-ci.yml
+++ b/.github/workflows/benchmark-ci.yml
@@ -14,19 +14,18 @@ on:
       - '!benchmark/reports/**'
       - .github/workflows/benchmark-ci.yml
 
+env:
+  min-python-version: '3.10'
+
 jobs:
   lint:
     runs-on: ubuntu-latest
-    env:
-      min-python-version: '3.10'
 
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
         with:
           fetch-depth: 0
-          ref: ${{ github.event.pull_request.head.ref }}
-          repository: ${{ github.event.pull_request.head.repo.full_name }}
 
       - name: Set up Python ${{ env.min-python-version }}
         uses: actions/setup-python@v2
@@ -82,8 +81,6 @@ jobs:
         uses: actions/checkout@v3
         with:
           fetch-depth: 0
-          ref: ${{ github.event.pull_request.head.ref }}
-          repository: ${{ github.event.pull_request.head.repo.full_name }}
           submodules: true
 
       - name: Set up Python ${{ env.min-python-version }}
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
deleted file mode 100644
index af2383a6..00000000
--- a/.github/workflows/benchmarks.yml
+++ /dev/null
@@ -1,73 +0,0 @@
-name: Benchmarks (legacy)
-
-on:
-  schedule:
-    - cron: '0 8 * * *'
-  workflow_dispatch:
-
-jobs:
-  Benchmark:
-    name: ${{ matrix.config.task-name }}
-    runs-on: ubuntu-latest
-    timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-          - python-version: "3.10"
-            task: "tests/challenges"
-            task-name: "Mandatory Tasks"
-          - python-version: "3.10"
-            task: "--beat-challenges -ra tests/challenges"
-            task-name: "Challenging Tasks"
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v3
-        with:
-          ref: master
-
-      - name: Set up Python ${{ matrix.config.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.config.python-version }}
-
-      - id: get_date
-        name: Get date
-        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
-
-      - name: Set up Python dependency cache
-        uses: actions/cache@v3
-        with:
-          path: ~/.cache/pypoetry
-          key: ${{ runner.os }}-poetry-${{ hashFiles('autogpts/autogpt/pyproject.toml') }}-${{ steps.get_date.outputs.date }}
-
-      - name: Install Python dependencies
-        run: |
-          curl -sSL https://install.python-poetry.org | python3 -
-          poetry install
-
-      - name: Run pytest with coverage
-        run: |
-          rm -rf tests/vcr_cassettes
-          pytest -n auto --record-mode=all ${{ matrix.config.task }}
-        env:
-          CI: true
-          PROXY: ${{ secrets.PROXY }}
-          AGENT_MODE: ${{ secrets.AGENT_MODE }}
-          AGENT_TYPE: ${{ secrets.AGENT_TYPE }}
-          PLAIN_OUTPUT: True
-
-      - name: Upload logs as artifact
-        if: always()
-        uses: actions/upload-artifact@v3
-        with:
-          name: test-logs-${{ matrix.config.task-name }}
-          path: logs/
-
-      - name: Upload cassettes as artifact
-        if: always()
-        uses: actions/upload-artifact@v3
-        with:
-          name: cassettes-${{ matrix.config.task-name }}
-          path: tests/vcr_cassettes/
diff --git a/.github/workflows/hackathon.yml b/.github/workflows/hackathon.yml
index 9bd5d5f1..a8b592eb 100644
--- a/.github/workflows/hackathon.yml
+++ b/.github/workflows/hackathon.yml
@@ -2,7 +2,6 @@ name: Hackathon
 
 on:
   workflow_dispatch:
-    branches: [hackathon]
     inputs:
       agents:
         description: "Agents to run (comma-separated)"
@@ -92,8 +91,6 @@ jobs:
         uses: actions/checkout@v3
         with:
           fetch-depth: 0
-          ref: ${{ github.event.pull_request.head.ref }}
-          repository: ${{ github.event.pull_request.head.repo.full_name }}
           submodules: true
 
       - name: Set up Python ${{ env.min-python-version }}
@@ -133,4 +130,4 @@ jobs:
           WEAVIATE_URL: ${{ secrets.WEAVIATE_URL }}
           GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
           GOOGLE_CUSTOM_SEARCH_ENGINE_ID: ${{ secrets.GOOGLE_CUSTOM_SEARCH_ENGINE_ID }}
-          AGENT_NAME: ${{ matrix.agent-name }}
\ No newline at end of file
+          AGENT_NAME: ${{ matrix.agent-name }}
diff --git a/benchmark/agbenchmark/challenges/deprecated/code/1_password_generator/data.json b/benchmark/agbenchmark/challenges/deprecated/code/1_password_generator/data.json
index 01dd0afc..0f30ea87 100644
--- a/benchmark/agbenchmark/challenges/deprecated/code/1_password_generator/data.json
+++ b/benchmark/agbenchmark/challenges/deprecated/code/1_password_generator/data.json
@@ -24,5 +24,5 @@
         "side_effects": []
     },
     "name": "PasswordGenerator",
-    "task": "Create a random password generator. The password should have between 8 and 16 characters and should contain letters, numbers and symbols. The password should be printed to the console. The entry point will be a python file that can be run this way: python password_generator.py [--len x] where x is the length of the password. If no length is specified, the password should be 8 characters long. The password_generator can also be imported as a module and called as password = password_generator.generate_password(len=x). Any invalid input should raise a ValueError."
+    "task": "Create a random password generator. The password should have between 8 and 16 characters and should contain letters, numbers and symbols. The password should be printed to the console. The entry point will be a python file that can be run this way: python password_generator.py [--len x] where x is the length of the password. The password_generator can also be imported as a module and called as password = password_generator.generate_password(len=x). If no length is specified, the password should be 8 characters long. Any invalid input should raise a ValueError."
 }
diff --git a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py b/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py
index 514ec43a..5797ebcb 100644
--- a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py
+++ b/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/artifacts_out/password_generator.py
@@ -1,8 +1,9 @@
 import random
 import string
+import sys
 
 
-def generate_password(length: int) -> str:
+def generate_password(length: int = 8) -> str:
     if length < 8 or length > 16:
         raise ValueError("Password length must be between 8 and 16 characters.")
 
@@ -19,5 +20,12 @@ def generate_password(length: int) -> str:
 
 
 if __name__ == "__main__":
-    password_length = random.randint(8, 16)
+    # CLI usage: python password_generator.py [--length x]; default is 8.
+    password_length = 8
+    if "--length" in sys.argv:
+        value_index = sys.argv.index("--length") + 1
+        if value_index >= len(sys.argv):
+            # Missing value is invalid input: ValueError, not IndexError.
+            raise ValueError("--length requires a value.")
+        password_length = int(sys.argv[value_index])
     print(generate_password(password_length))
diff --git a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json b/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json
index 8b2e1d91..14bbe453 100644
--- a/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json
+++ b/benchmark/agbenchmark/challenges/verticals/code/2_password_generator/data.json
@@ -24,5 +24,5 @@
         "side_effects": []
     },
     "name": "PasswordGenerator",
-    "task": "Create a random password generator. The password should have between 8 and 16 characters and should contain at least one letter, number and symbol. The password should be printed to the console. The entry point will be a python file that can be run this way: python password_generator.py [--len x] where x is the length of the password. If no length is specified, the password should be 8 characters long. The password_generator can also be imported as a module and called as password = password_generator.generate_password(len=x). Any invalid input should raise a ValueError."
+    "task": "Create a random password generator. The password should have between 8 and 16 characters and should contain at least one letter, number and symbol. The password should be printed to the console. The entry point will be a python file that can be run this way: python password_generator.py [--length x] where x is the length of the password. If no length is specified, the password should be 8 characters long. The password_generator can also be imported as a module and called as password = password_generator.generate_password(length=x). Any invalid input should raise a ValueError."
 }
diff --git a/benchmark/agbenchmark/reports/processing/report_types.py b/benchmark/agbenchmark/reports/processing/report_types.py
index d2fc8dea..3ba9e6c6 100644
--- a/benchmark/agbenchmark/reports/processing/report_types.py
+++ b/benchmark/agbenchmark/reports/processing/report_types.py
@@ -54,7 +54,6 @@ class Test(BaseModelBenchmark):
     metadata: Any
 
 
-
 class ReportBase(BaseModelBenchmark):
     command: str
     completion_time: str | None
@@ -70,7 +69,6 @@ class Report(ReportBase):
     tests: Dict[str, Test]
 
 
-
 class ReportV2(Test, ReportBase):
     test_name: str
     run_id: str | None
diff --git a/benchmark/agbenchmark/utils/challenge.py b/benchmark/agbenchmark/utils/challenge.py
index 20353f68..a32ab6cf 100644
--- a/benchmark/agbenchmark/utils/challenge.py
+++ b/benchmark/agbenchmark/utils/challenge.py
@@ -123,7 +123,7 @@ class Challenge(ABC):
         print("\033[1;34mScoring content:\033[0m", content)
         if ground.should_contain:
             for should_contain_word in ground.should_contain:
-                if not getattr(ground, 'case_sensitive', True):
+                if not getattr(ground, "case_sensitive", True):
                     should_contain_word = should_contain_word.lower()
                     content = content.lower()
                 print_content = (
@@ -137,7 +137,7 @@ class Challenge(ABC):
 
         if ground.should_not_contain:
             for should_not_contain_word in ground.should_not_contain:
-                if not getattr(ground, 'case_sensitive', True):
+                if not getattr(ground, "case_sensitive", True):
                     should_not_contain_word = should_not_contain_word.lower()
                     content = content.lower()
                 print_content = f"\033[1;34mWord that should not exist\033[0m - {should_not_contain_word}:"