goose/scripts/run-benchmarks.sh


#!/usr/bin/env bash
# run-benchmarks.sh - Script to run goose benchmarks across multiple provider:model pairs
set -e
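# Abort on the first failing command (commands tested in if/while conditions are exempt)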
# Display usage information
function show_usage() {
echo "Usage: $0 [options]"
echo ""
echo "Options:"
echo " -p, --provider-models Comma-separated list of provider:model pairs (e.g., 'openai:gpt-4o,anthropic:claude-3-5-sonnet')"
echo " -s, --suites Comma-separated list of benchmark suites to run (e.g., 'core,small_models')"
echo " -o, --output-dir Directory to store benchmark results (default: './benchmark-results')"
echo " -d, --debug Use debug build instead of release build"
echo " -t, --toolshim Enable toolshim mode by setting GOOSE_TOOLSHIM=1"
echo " -m, --toolshim-model Set the toolshim model (sets GOOSE_TOOLSHIM_MODEL)"
echo " -h, --help Show this help message"
echo ""
echo "Example:"
echo " $0 --provider-models 'openai:gpt-4o,anthropic:claude-3-5-sonnet' --suites 'core,small_models'"
}
# Parse command line arguments
PROVIDER_MODELS=""
SUITES=""
OUTPUT_DIR="./benchmark-results"
DEBUG_MODE=false
TOOLSHIM=false
TOOLSHIM_MODEL=""
while [[ $# -gt 0 ]]; do
case "$1" in
-p|--provider-models)
PROVIDER_MODELS="$2"
shift 2
;;
-s|--suites)
SUITES="$2"
shift 2
;;
-o|--output-dir)
OUTPUT_DIR="$2"
shift 2
;;
-d|--debug)
DEBUG_MODE=true
shift
;;
-t|--toolshim)
TOOLSHIM=true
shift
;;
-m|--toolshim-model)
TOOLSHIM_MODEL="$2"
shift 2
;;
-h|--help)
show_usage
exit 0
;;
*)
echo "Error: Unknown option: $1"
show_usage
exit 1
;;
esac
done
# Validate required parameters
if [[ -z "$PROVIDER_MODELS" ]]; then
echo "Error: Provider-model pairs must be specified"
show_usage
exit 1
fi
if [[ -z "$SUITES" ]]; then
echo "Error: Benchmark suites must be specified"
show_usage
exit 1
fi
# Create output directory
mkdir -p "$OUTPUT_DIR"
# Create a results summary file
SUMMARY_FILE="$OUTPUT_DIR/summary.md"
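# summary.md is a Markdown report: one "## Provider: ..., Model: ..." section per pair, with each pair's analysis appended below its result line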
echo "# Benchmark Results Summary" > "$SUMMARY_FILE"
echo "Run date: $(date)" >> "$SUMMARY_FILE"
echo "Suites: $SUITES" >> "$SUMMARY_FILE"
if [ "$DEBUG_MODE" = true ]; then
echo "Mode: Debug" >> "$SUMMARY_FILE"
else
echo "Mode: Release" >> "$SUMMARY_FILE"
fi
if [ "$TOOLSHIM" = true ]; then
echo "Toolshim: Enabled" >> "$SUMMARY_FILE"
if [[ -n "$TOOLSHIM_MODEL" ]]; then
echo "Toolshim Model: $TOOLSHIM_MODEL" >> "$SUMMARY_FILE"
fi
fi
echo "" >> "$SUMMARY_FILE"
# Determine which binary to use
GOOSE_CMD="goose"
if [ "$DEBUG_MODE" = true ]; then
if [ -f "./target/debug/goose" ]; then
GOOSE_CMD="./target/debug/goose"
echo "Using debug binary: $GOOSE_CMD"
else
echo "Warning: Debug binary not found at ./target/debug/goose. Falling back to system-installed goose."
fi
else
if [ -f "./target/release/goose" ]; then
GOOSE_CMD="./target/release/goose"
echo "Using release binary: $GOOSE_CMD"
else
echo "Warning: Release binary not found at ./target/release/goose. Falling back to system-installed goose."
fi
fi
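# At this point GOOSE_CMD is the local debug/release build if one was found, otherwise the `goose` on PATH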
# Parse provider:model pairs
PROVIDERS=()
MODELS=()
# Read provider:model pairs
IFS=',' read -ra PAIRS <<< "$PROVIDER_MODELS"
for pair in "${PAIRS[@]}"; do
    # Split by colon
    IFS=':' read -r provider model <<< "$pair"
    if [[ -n "$provider" && -n "$model" ]]; then
        PROVIDERS+=("$provider")
        MODELS+=("$model")
    else
        echo "Warning: Invalid provider:model pair: $pair. Skipping."
    fi
done
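# PROVIDERS and MODELS are parallel arrays; index i pairs them back up in the loop below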
# Track overall success
OVERALL_SUCCESS=true
COUNT=${#PROVIDERS[@]}
echo "Running benchmarks for $COUNT provider:model pairs..."
echo "Benchmark suites: $SUITES"
echo ""
# Loop through each provider-model pair
for ((i=0; i<$COUNT; i++)); do
provider="${PROVIDERS[i]}"
model="${MODELS[i]}"
echo "=========================================================="
echo "Provider: $provider, Model: $model"
echo "=========================================================="
echo "## Provider: $provider, Model: $model" >> "$SUMMARY_FILE"
# Set environment variables for this provider/model instead of using configure
export GOOSE_PROVIDER="$provider"
export GOOSE_MODEL="$model"
# Set toolshim environment variables if enabled
if [ "$TOOLSHIM" = true ]; then
export GOOSE_TOOLSHIM=1
if [[ -n "$TOOLSHIM_MODEL" ]]; then
export GOOSE_TOOLSHIM_OLLAMA_MODEL="$TOOLSHIM_MODEL"
fi
fi
# Run the benchmark and save results to JSON
echo "Running benchmark for $provider/$model with suites: $SUITES"
OUTPUT_FILE="$OUTPUT_DIR/${provider}-${model}.json"
ANALYSIS_FILE="$OUTPUT_DIR/${provider}-${model}-analysis.txt"
if $GOOSE_CMD bench --suites "$SUITES" --output "$OUTPUT_FILE" --format json; then
echo "✅ Benchmark completed successfully" | tee -a "$SUMMARY_FILE"
# Parse the JSON to check for failures
if [ -f "$OUTPUT_FILE" ]; then
# Check if jq is installed
if ! command -v jq &> /dev/null; then
echo "Warning: jq not found. Cannot parse JSON results."
echo "⚠️ Could not parse results (jq not installed)" >> "$SUMMARY_FILE"
else
# Basic validation of the JSON file
if jq empty "$OUTPUT_FILE" 2>/dev/null; then
# Extract basic information
PROVIDER_NAME=$(jq -r '.provider' "$OUTPUT_FILE")
START_TIME=$(jq -r '.start_time' "$OUTPUT_FILE")
SUITE_COUNT=$(jq '.suites | length' "$OUTPUT_FILE")
echo "Benchmark Results Analysis" > "$ANALYSIS_FILE"
echo "-------------------------" >> "$ANALYSIS_FILE"
echo "Provider: $PROVIDER_NAME" >> "$ANALYSIS_FILE"
echo "Start Time: $START_TIME" >> "$ANALYSIS_FILE"
echo "Number of Suites: $SUITE_COUNT" >> "$ANALYSIS_FILE"
echo "" >> "$ANALYSIS_FILE"
# Initialize counters
TOTAL_EVALS=0
TOTAL_METRICS=0
FAILED_METRICS=0
PASSED_METRICS=0
OTHER_METRICS=0
TOTAL_ERRORS=0
# Process each suite
for j in $(seq 0 $((SUITE_COUNT-1))); do
SUITE_NAME=$(jq -r ".suites[$j].name" "$OUTPUT_FILE")
EVAL_COUNT=$(jq ".suites[$j].evaluations | length" "$OUTPUT_FILE")
TOTAL_EVALS=$((TOTAL_EVALS + EVAL_COUNT))
echo "Suite: $SUITE_NAME ($EVAL_COUNT evaluations)" >> "$ANALYSIS_FILE"
# Process each evaluation in this suite
for k in $(seq 0 $((EVAL_COUNT-1))); do
EVAL_NAME=$(jq -r ".suites[$j].evaluations[$k].name" "$OUTPUT_FILE")
METRIC_COUNT=$(jq ".suites[$j].evaluations[$k].metrics | length" "$OUTPUT_FILE")
TOTAL_METRICS=$((TOTAL_METRICS + METRIC_COUNT))
# Check for errors in this evaluation
ERROR_COUNT=$(jq ".suites[$j].evaluations[$k].errors | length" "$OUTPUT_FILE")
TOTAL_ERRORS=$((TOTAL_ERRORS + ERROR_COUNT))
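                            # Each metric is serialized as a [name, value] pair; boolean results
                            # carry their value under a "Boolean" key (the shape assumed by the
                            # jq filters below)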
                            # Count boolean metrics (passed and failed)
                            BOOLEAN_COUNT=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
                                select(.[1].Boolean != null) | .[0]" "$OUTPUT_FILE" | wc -l | tr -d ' ')
                            # Count failed boolean metrics
                            FAILURES=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
                                select(
                                    .[1].Boolean == false or .[1].Boolean == \"false\" or .[1].Boolean == 0 or .[1].Boolean == \"0\"
                                ) | .[0]" "$OUTPUT_FILE" | wc -l | tr -d ' ')
                            # Count passed boolean metrics
                            PASSES=$((BOOLEAN_COUNT - FAILURES))
                            # Count non-boolean metrics
                            NON_BOOLEAN=$((METRIC_COUNT - BOOLEAN_COUNT))
                            # Update global counters
                            FAILED_METRICS=$((FAILED_METRICS + FAILURES))
                            PASSED_METRICS=$((PASSED_METRICS + PASSES))
                            OTHER_METRICS=$((OTHER_METRICS + NON_BOOLEAN))
                            if [ "$FAILURES" -gt 0 ] || [ "$ERROR_COUNT" -gt 0 ]; then
                                echo "$EVAL_NAME:" >> "$ANALYSIS_FILE"
                                if [ "$FAILURES" -gt 0 ]; then
                                    echo " - $FAILURES metric failures detected" >> "$ANALYSIS_FILE"
                                    # Print the specific failing metrics
                                    FAILING_METRICS=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
                                        select(
                                            .[1].Boolean == false or .[1].Boolean == \"false\" or .[1].Boolean == 0 or .[1].Boolean == \"0\"
                                        ) | .[0]" "$OUTPUT_FILE")
                                    echo " Failed metrics:" >> "$ANALYSIS_FILE"
                                    echo "$FAILING_METRICS" | sed 's/^/ - /' >> "$ANALYSIS_FILE"
                                fi
                                if [ "$ERROR_COUNT" -gt 0 ]; then
                                    echo " - $ERROR_COUNT errors detected" >> "$ANALYSIS_FILE"
                                    # Print the errors
                                    jq -r ".suites[$j].evaluations[$k].errors[] | \" [\(.level)] \(.message)\"" "$OUTPUT_FILE" >> "$ANALYSIS_FILE"
                                fi
                            else
                                # No metric failures or errors for this evaluation
                                echo "$EVAL_NAME: All metrics passed, no errors" >> "$ANALYSIS_FILE"
                            fi
                        done
                        echo "" >> "$ANALYSIS_FILE"
                    done
                    # Print summary
                    echo "Summary:" >> "$ANALYSIS_FILE"
                    echo "-------" >> "$ANALYSIS_FILE"
                    echo "Total Evaluations: $TOTAL_EVALS" >> "$ANALYSIS_FILE"
                    echo "Total Metrics: $TOTAL_METRICS" >> "$ANALYSIS_FILE"
                    echo "Passed Metrics: $PASSED_METRICS" >> "$ANALYSIS_FILE"
                    echo "Failed Metrics: $FAILED_METRICS" >> "$ANALYSIS_FILE"
                    echo "Other Metrics: $OTHER_METRICS" >> "$ANALYSIS_FILE"
                    echo "Total Errors: $TOTAL_ERRORS" >> "$ANALYSIS_FILE"
                    # Verification of metrics counting
                    COUNTED_METRICS=$((PASSED_METRICS + FAILED_METRICS + OTHER_METRICS))
                    if [ "$COUNTED_METRICS" -ne "$TOTAL_METRICS" ]; then
                        echo "⚠️ Metrics counting discrepancy: $COUNTED_METRICS counted vs $TOTAL_METRICS total" >> "$ANALYSIS_FILE"
                    fi
                    # Determine success/failure
                    if [ "$FAILED_METRICS" -gt 0 ] || [ "$TOTAL_ERRORS" -gt 0 ]; then
                        if [ "$FAILED_METRICS" -gt 0 ]; then
                            echo "❌ Benchmark has $FAILED_METRICS failed metrics" >> "$ANALYSIS_FILE"
                        fi
                        if [ "$TOTAL_ERRORS" -gt 0 ]; then
                            echo "❌ Benchmark has $TOTAL_ERRORS errors" >> "$ANALYSIS_FILE"
                        fi
                        echo "❌ Tests failed for $provider/$model" | tee -a "$SUMMARY_FILE"
                        cat "$ANALYSIS_FILE" >> "$SUMMARY_FILE"
                        OVERALL_SUCCESS=false
                    else
                        echo "✅ All metrics passed successfully, no errors" >> "$ANALYSIS_FILE"
                        echo "✅ All tests passed for $provider/$model" | tee -a "$SUMMARY_FILE"
                        cat "$ANALYSIS_FILE" >> "$SUMMARY_FILE"
                    fi
                else
                    echo "❌ Invalid JSON in benchmark output" | tee -a "$SUMMARY_FILE"
                    OVERALL_SUCCESS=false
                fi
            fi
        else
            echo "❌ Benchmark output file not found" | tee -a "$SUMMARY_FILE"
            OVERALL_SUCCESS=false
        fi
    else
        echo "❌ Benchmark failed to run" | tee -a "$SUMMARY_FILE"
        OVERALL_SUCCESS=false
    fi
    echo "" >> "$SUMMARY_FILE"
    echo ""
done
echo "=========================================================="
echo "Benchmark run completed"
echo "Results saved to: $OUTPUT_DIR"
echo "Summary file: $SUMMARY_FILE"
# Output final status
if [ "$OVERALL_SUCCESS" = false ]; then
echo "❌ Some benchmarks failed. Check the summary for details."
exit 1
else
echo "✅ All benchmarks completed successfully."
exit 0
fi