Add benchmark CI

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
2025-12-19 06:54:22 +01:00 · 2023-09-06 19:11:07 -07:00
parent 3ccffdab19
commit bc14028294
5 changed files with 302 additions and 45 deletions
--- a/.github/workflows/benchmark-ci.yml
+++ b/.github/workflows/benchmark-ci.yml
@@ -0,0 +1,301 @@
 name: Benchmark CI
 on:
  workflow_dispatch:
    branches: [master]
    inputs:
      agents:
        description: 'Agents to run (comma-separated)'
        required: false
        default: 'gpt-engineer,smol-developer,Auto-GPT,mini-agi,beebot,BabyAGI,PolyGPT,Turbo' # Default agents if none are specified
  schedule:
    - cron: '0 8 * * *'
  push:
    branches: [master, ci-test*]
    paths:
      - 'benchmark/**'
      - '!benchmark/reports/**'
  pull_request:
    branches: [stable, master, release-*]
 jobs:
  lint:
    runs-on: ubuntu-latest
    env:
      min-python-version: '3.10'
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          ref: ${{ github.event.pull_request.head.ref }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
      - name: Set up Python ${{ env.min-python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ env.min-python-version }}
      - id: get_date
        name: Get date
        working-directory: ./benchmark/
        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
      - name: Install Poetry
        working-directory: ./benchmark/
        run: |
          curl -sSL https://install.python-poetry.org | python -
      - name: Install dependencies
        working-directory: ./benchmark/
        run: |
          export POETRY_VIRTUALENVS_IN_PROJECT=true
          poetry install -vvv
      - name: Lint with flake8
        working-directory: ./benchmark/
        run: poetry run flake8
      - name: Check black formatting
        working-directory: ./benchmark/
        run: poetry run  black . --exclude test.py --check
        if: success() || failure()
      - name: Check isort formatting
        working-directory: ./benchmark/
        run: poetry run  isort . --check
        if: success() || failure()
      - name: Check for unused imports and pass statements
        working-directory: ./benchmark/
        run: |
          cmd="poetry run autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring agbenchmark"
          $cmd --check || (echo "You have unused imports or pass statements, please run '${cmd} --in-place'" && exit 1)
        if: success() || failure()
  matrix-setup:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      env-name: ${{ steps.set-matrix.outputs.env-name }}
    steps:
      - id: set-matrix
        run: |
          if [ "${{ github.event_name }}" == "schedule" ]; then
            echo "::set-output name=env-name::production"
            echo "::set-output name=matrix::[ 'gpt-engineer', 'smol-developer', 'Auto-GPT', 'mini-agi', 'beebot', 'BabyAGI', 'PolyGPT', 'Turbo' ]"
          elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
            IFS=',' read -ra matrix_array <<< "${{ github.event.inputs.agents }}"
            matrix_string="[ \"$(echo "${matrix_array[@]}" | sed 's/ /", "/g')\" ]"
            echo "::set-output name=env-name::production"
            echo "::set-output name=matrix::$matrix_string"
          else
            echo "::set-output name=env-name::develop"
            echo "::set-output name=matrix::[ 'mini-agi' ]"
          fi
  tests:
    environment:
      name: '${{ needs.matrix-setup.outputs.env-name }}'
    needs: matrix-setup
    env:
      min-python-version: '3.10'
    name: '${{ matrix.agent-name }}'
    runs-on: ubuntu-latest
    timeout-minutes: 50
    strategy:
      fail-fast: false
      matrix:
        agent-name: ${{fromJson(needs.matrix-setup.outputs.matrix)}}
    steps:
      - name: Print Environment Name
        run: |
          echo "Matrix Setup Environment Name: ${{ needs.matrix-setup.outputs.env-name }}"
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          ref: ${{ github.event.pull_request.head.ref }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          token: ${{ secrets.PAT_REVIEW }}
      - name: Set up Python ${{ env.min-python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ env.min-python-version }}
      - id: get_date
        name: Get date
        working-directory: ./benchmark/
        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
      - name: Install Poetry
        run: |
          curl -sSL https://install.python-poetry.org | python -
      - name: Install dependencies
        working-directory: ./benchmark/
        run: |
          poetry install -vvv
          poetry build
      - name: Run regression tests
        working-directory: ./benchmark/
        run: |
          mkdir agent
          cd agent
          git clone https://github.com/SilenNaihin/mini-agi -b benchmark-integration
          cd $AGENT_NAME
          prefix=""
          if [ "$AGENT_NAME" == "gpt-engineer" ]; then
            make install
            source venv/bin/activate
          elif [ "$AGENT_NAME" == "Auto-GPT" ]; then
            python -m venv venv
            source venv/bin/activate
            pip install -r requirements.txt
            pip uninstall agbenchmark -y
          elif [ "$AGENT_NAME" == "mini-agi" ]; then
            python -m venv venv
            source venv/bin/activate
            pip install -r requirements.txt
            cp .env_example .env
          elif [ "$AGENT_NAME" == "smol-developer" ]; then
            python -m venv venv
            source venv/bin/activate
            pip install -r requirements.txt
          elif [ "$AGENT_NAME" == "BabyAGI" ]; then
            python -m venv venv
            source venv/bin/activate
            pip install -r requirements.txt
          elif [ "$AGENT_NAME" == "SuperAGI" ]; then
            cp config_template.yaml config.yaml
            sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml
            docker-compose up -d --build
          elif [ "$AGENT_NAME" == "beebot" ]; then
            poetry install
            poetry run playwright install
            poetry run uvicorn beebot.initiator.api:create_app --factory  --timeout-graceful-shutdown=1 &
            prefix="poetry run "
          elif [ "$AGENT_NAME" == "PolyGPT" ]; then
            cp .env.template .env
            curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.1/install.sh | bash
            export NVM_DIR=$HOME/.nvm
            source $NVM_DIR/nvm.sh
            nvm install && nvm use
            yarn install
            export NODE_TLS_REJECT_UNAUTHORIZED=0
          elif [ "$AGENT_NAME" == "Turbo" ]; then
            python -m venv venv
            source venv/bin/activate
            pip install -r requirements.txt
            cp .env.template .env
            sed -i 's/your-openai-api-key/${{ secrets.OPENAI_API_KEY }}/g' .env
          else
            echo "Unknown agent name: $AGENT_NAME"
            exit 1
          fi
          pip install ../../dist/*.whl
          bash -c "$(curl -fsSL https://raw.githubusercontent.com/merwanehamadi/helicone/b7ab4bc53e51d8ab29fff19ce5986ab7720970c6/mitmproxy.sh)" -s start
          cd ../..
          if [ "${GITHUB_EVENT_NAME}" == "pull_request" ] || [ "${{ github.event_name }}" == "push" ]; then            
            set +e # Ignore non-zero exit codes and continue execution
            echo "Running the following command: ${prefix}agbenchmark start --maintain --mock"
            ${prefix}agbenchmark start --maintain --mock
            EXIT_CODE=$?
            set -e  # Stop ignoring non-zero exit codes
            # Check if the exit code was 5, and if so, exit with 0 instead
            if [ $EXIT_CODE -eq 5 ]; then
              echo "regression_tests.json is empty."
            fi
            echo "Running the following command: ${prefix}agbenchmark start --mock"
            ${prefix}agbenchmark start --mock
            echo "Running the following command: ${prefix}agbenchmark start --mock --category=retrieval"
            ${prefix}agbenchmark start --mock --category=retrieval
            echo "Running the following command: ${prefix}agbenchmark start --mock --category=interface"
            ${prefix}agbenchmark start --mock --category=interface
            echo "Running the following command: ${prefix}agbenchmark start --mock --category=code"
            ${prefix}agbenchmark start --mock --category=code
            echo "Running the following command: ${prefix}agbenchmark start --mock --category=memory"
            ${prefix}agbenchmark start --mock --category=memory
            echo "Running the following command: ${prefix}agbenchmark start --mock --suite TestRevenueRetrieval"
            ${prefix}agbenchmark start --mock --suite TestRevenueRetrieval
            echo "Running the following command: ${prefix}agbenchmark start --test=TestWriteFile"
            ${prefix}agbenchmark start --test=TestWriteFile
            poetry install
            poetry run uvicorn server:app --reload &
            sleep 5
            export AGENT_NAME=mini-agi
          else
            echo "${prefix}agbenchmark start"
            ${prefix}agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved."
          fi
          cd ../..
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          AGENT_NAME: ${{ matrix.agent-name }}
          PROMPT_USER: false # For mini-agi. TODO: Remove this and put it in benchmarks.py
          HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }}
          BASERUN_API_KEY: ${{ secrets.BASERUN_API_KEY }}
          REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt
          HELICONE_CACHE_ENABLED: false
          HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }}
          REPORT_LOCATION: ${{ format('../../reports/{0}', matrix.agent-name) }}
          WOLFRAM_ALPHA_APPID: ${{ secrets.WOLFRAM_ALPHA_APPID }}
          SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }}
          BING_SUBSCRIPTION_KEY: ${{ secrets.BING_SUBSCRIPTION_KEY }}
      - name: Upload reports
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: ${{ matrix.agent-name }}
          path: reports/${{ matrix.agent-name }}
      - name: Authenticate and Push to Branch
        working-directory: ./benchmark/
        if: (success() || failure()) && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
        run: |
          git config --global user.email "github-bot@agpt.co"
          git config --global user.name "Auto-GPT-Bot"
          git add reports/* || echo "nothing to commit"
          commit_message="${{ matrix.agent-name }}-$(date +'%Y%m%d%H%M%S')"
          git commit -m "${commit_message}"
          git stash
          current_branch=${{ github.ref_name }}
          attempts=0
          max_attempts=3
          while [ $attempts -lt $max_attempts ]; do
              git fetch origin $current_branch
              git rebase origin/$current_branch
              if git push origin HEAD; then
                  echo "Success!"
                  poetry run python reports/send_to_googledrive.py || echo "Failed to upload to Google Drive"
                  exit 0
              else
                  echo "Attempt $(($attempts + 1)) failed. Retrying..."
                  attempts=$(($attempts + 1))
              fi
          done
          echo "Failed after $max_attempts attempts."
        env:
          GDRIVE_BASE64: ${{ secrets.GDRIVE_BASE64 }}
          GITHUB_REF_NAME: ${{ github.ref_name }}
--- a/benchmark/.gitmodules
+++ b/benchmark/.gitmodules
@@ -1,39 +0,0 @@
 [submodule "agent/Auto-GPT"]
 	path = agent/Auto-GPT
 	url = https://github.com/Significant-Gravitas/Auto-GPT
 	branch = master
 [submodule "agent/gpt-engineer"]
 	path = agent/gpt-engineer
 	url = https://github.com/merwanehamadi/gpt-engineer.git
 	branch = benchmark-integration
 [submodule "agent/mini-agi"]
 	path = agent/mini-agi
 	url = https://github.com/SilenNaihin/mini-agi.git
 	branch = benchmark-integration
 [submodule "agent/smol-developer"]
 	path = agent/smol-developer
 	url = https://github.com/e2b-dev/smol-developer.git
 	branch = benchmarks
 [submodule "agent/SuperAGI"]
 	path = agent/SuperAGI
 	url = https://github.com/SilenNaihin/SuperAGI.git
 	branch = benchmark-integration
 [submodule "agent/BabyAGI"]
 	path = agent/BabyAGI
 	url = https://github.com/SilenNaihin/babyagi.git
 	branch = benchmark-integration
 [submodule "agent/beebot"]
 	path = agent/beebot
 	url = https://github.com/AutoPackAI/beebot.git
 	branch = main
 [submodule "agent/PolyGPT"]
 	path = agent/PolyGPT
 	url = https://github.com/polywrap/PolyGPT.git
 	branch = nerfzael-use-local-wrap-library
 [submodule "frontend"]
 	path = frontend
 	url = https://github.com/agbenchmark/agbenchmark-frontend.git
 [submodule "agent/Turbo"]
 	path = agent/Turbo
 	url = https://github.com/lc0rp/Auto-GPT-Turbo.git
 	branch = main
--- a/benchmark/agbenchmark/init.py
+++ b/benchmark/agbenchmark/init.py
@@ -1,5 +0,0 @@
 import pydevd_pycharm
 pydevd_pycharm.settrace(
    "localhost", port=9739, stdoutToServer=True, stderrToServer=True
 )
--- a/benchmark/agbenchmark/challenges/library/ethereum/check_price/data_draft.json
+++ b/benchmark/agbenchmark/challenges/library/ethereum/check_price/data_draft.json
--- a/benchmark/paper/agent_action_regex.py
+++ b/benchmark/paper/agent_action_regex.py
@@ -1,5 +1,5 @@
 import re
 import json
 import re
 def is_action_auto_gpt(log):