goose/scripts/run-benchmarks.sh


#!/usr/bin/env bash
# run-benchmarks.sh - Script to run goose benchmarks across multiple provider:model pairs
set -e
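# Abort on the first failing command (commands tested in if/while conditions are exempt)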
# Display usage information
function show_usage() {
echo "Usage: $0 [options]"
echo ""
echo "Options:"
echo " -p, --provider-models Comma-separated list of provider:model pairs (e.g., 'openai:gpt-4o,anthropic:claude-3-5-sonnet')"
echo " -s, --suites Comma-separated list of benchmark suites to run (e.g., 'core,small_models')"
echo " -o, --output-dir Directory to store benchmark results (default: './benchmark-results')"
echo " -d, --debug Use debug build instead of release build"
echo " -t, --toolshim Enable toolshim mode by setting GOOSE_TOOLSHIM=1"
echo " -m, --toolshim-model Set the toolshim model (sets GOOSE_TOOLSHIM_MODEL)"
echo " -h, --help Show this help message"
echo ""
echo "Example:"
echo " $0 --provider-models 'openai:gpt-4o,anthropic:claude-3-5-sonnet' --suites 'core,small_models'"
}
# Parse command line arguments
PROVIDER_MODELS=""
SUITES=""
OUTPUT_DIR="./benchmark-results"
DEBUG_MODE=false
TOOLSHIM=false
TOOLSHIM_MODEL=""
while [[ $# -gt 0 ]]; do
case "$1" in
-p|--provider-models)
PROVIDER_MODELS="$2"
shift 2
;;
-s|--suites)
SUITES="$2"
shift 2
;;
-o|--output-dir)
OUTPUT_DIR="$2"
shift 2
;;
-d|--debug)
DEBUG_MODE=true
shift
;;
-t|--toolshim)
TOOLSHIM=true
shift
;;
-m|--toolshim-model)
TOOLSHIM_MODEL="$2"
shift 2
;;
-h|--help)
show_usage
exit 0
;;
*)
echo "Error: Unknown option: $1"
show_usage
exit 1
;;
esac
done
# Validate required parameters
if [[ -z "$PROVIDER_MODELS" ]]; then
echo "Error: Provider-model pairs must be specified"
show_usage
exit 1
fi
if [[ -z "$SUITES" ]]; then
echo "Error: Benchmark suites must be specified"
show_usage
exit 1
fi
# Create output directory
mkdir -p "$OUTPUT_DIR"
# Create a results summary file
SUMMARY_FILE="$OUTPUT_DIR/summary.md"
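# summary.md is a Markdown report: one "## Provider: ..., Model: ..." section per pair, with each pair's analysis appended below its result line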
echo "# Benchmark Results Summary" > "$SUMMARY_FILE"
echo "Run date: $(date)" >> "$SUMMARY_FILE"
echo "Suites: $SUITES" >> "$SUMMARY_FILE"
if [ "$DEBUG_MODE" = true ]; then
echo "Mode: Debug" >> "$SUMMARY_FILE"
else
echo "Mode: Release" >> "$SUMMARY_FILE"
fi
if [ "$TOOLSHIM" = true ]; then
echo "Toolshim: Enabled" >> "$SUMMARY_FILE"
if [[ -n "$TOOLSHIM_MODEL" ]]; then
echo "Toolshim Model: $TOOLSHIM_MODEL" >> "$SUMMARY_FILE"
fi
fi
echo "" >> "$SUMMARY_FILE"
# Determine which binary to use
GOOSE_CMD="goose"
if [ "$DEBUG_MODE" = true ]; then
if [ -f "./target/debug/goose" ]; then
GOOSE_CMD="./target/debug/goose"
echo "Using debug binary: $GOOSE_CMD"
else
echo "Warning: Debug binary not found at ./target/debug/goose. Falling back to system-installed goose."
fi
else
if [ -f "./target/release/goose" ]; then
GOOSE_CMD="./target/release/goose"
echo "Using release binary: $GOOSE_CMD"
else
echo "Warning: Release binary not found at ./target/release/goose. Falling back to system-installed goose."
fi
fi
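# At this point GOOSE_CMD is the local debug/release build if one was found, otherwise the `goose` on PATH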
# Parse provider:model pairs
PROVIDERS=()
MODELS=()
# Read provider:model pairs
IFS=',' read -ra PAIRS <<< "$PROVIDER_MODELS"
for pair in "${PAIRS[@]}"; do
    # Split by colon
    IFS=':' read -r provider model <<< "$pair"
    if [[ -n "$provider" && -n "$model" ]]; then
        PROVIDERS+=("$provider")
        MODELS+=("$model")
    else
        echo "Warning: Invalid provider:model pair: $pair. Skipping."
    fi
done
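# PROVIDERS and MODELS are parallel arrays; index i pairs them back up in the loop below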
# Track overall success
OVERALL_SUCCESS=true
COUNT=${#PROVIDERS[@]}
echo "Running benchmarks for $COUNT provider:model pairs..."
echo "Benchmark suites: $SUITES"
echo ""
# Loop through each provider-model pair
for ((i=0; i<$COUNT; i++)); do
provider="${PROVIDERS[i]}"
model="${MODELS[i]}"
echo "=========================================================="
echo "Provider: $provider, Model: $model"
echo "=========================================================="
echo "## Provider: $provider, Model: $model" >> "$SUMMARY_FILE"
# Set environment variables for this provider/model instead of using configure
export GOOSE_PROVIDER="$provider"
export GOOSE_MODEL="$model"
# Set toolshim environment variables if enabled
if [ "$TOOLSHIM" = true ]; then
export GOOSE_TOOLSHIM=1
if [[ -n "$TOOLSHIM_MODEL" ]]; then
export GOOSE_TOOLSHIM_OLLAMA_MODEL="$TOOLSHIM_MODEL"
fi
fi
# Run the benchmark and save results to JSON
echo "Running benchmark for $provider/$model with suites: $SUITES"
OUTPUT_FILE="$OUTPUT_DIR/${provider}-${model}.json"
ANALYSIS_FILE="$OUTPUT_DIR/${provider}-${model}-analysis.txt"
if $GOOSE_CMD bench --suites "$SUITES" --output "$OUTPUT_FILE" --format json; then
echo "✅ Benchmark completed successfully" | tee -a "$SUMMARY_FILE"
# Parse the JSON to check for failures
if [ -f "$OUTPUT_FILE" ]; then
# Check if jq is installed
if ! command -v jq &> /dev/null; then
echo "Warning: jq not found. Cannot parse JSON results."
echo "⚠️ Could not parse results (jq not installed)" >> "$SUMMARY_FILE"
else
# Basic validation of the JSON file
if jq empty "$OUTPUT_FILE" 2>/dev/null; then
# Extract basic information
PROVIDER_NAME=$(jq -r '.provider' "$OUTPUT_FILE")
START_TIME=$(jq -r '.start_time' "$OUTPUT_FILE")
SUITE_COUNT=$(jq '.suites | length' "$OUTPUT_FILE")
echo "Benchmark Results Analysis" > "$ANALYSIS_FILE"
echo "-------------------------" >> "$ANALYSIS_FILE"
echo "Provider: $PROVIDER_NAME" >> "$ANALYSIS_FILE"
echo "Start Time: $START_TIME" >> "$ANALYSIS_FILE"
echo "Number of Suites: $SUITE_COUNT" >> "$ANALYSIS_FILE"
echo "" >> "$ANALYSIS_FILE"
# Initialize counters
TOTAL_EVALS=0
TOTAL_METRICS=0
FAILED_METRICS=0
PASSED_METRICS=0
OTHER_METRICS=0
TOTAL_ERRORS=0
# Process each suite
for j in $(seq 0 $((SUITE_COUNT-1))); do
SUITE_NAME=$(jq -r ".suites[$j].name" "$OUTPUT_FILE")
EVAL_COUNT=$(jq ".suites[$j].evaluations | length" "$OUTPUT_FILE")
TOTAL_EVALS=$((TOTAL_EVALS + EVAL_COUNT))
echo "Suite: $SUITE_NAME ($EVAL_COUNT evaluations)" >> "$ANALYSIS_FILE"
# Process each evaluation in this suite
for k in $(seq 0 $((EVAL_COUNT-1))); do
EVAL_NAME=$(jq -r ".suites[$j].evaluations[$k].name" "$OUTPUT_FILE")
METRIC_COUNT=$(jq ".suites[$j].evaluations[$k].metrics | length" "$OUTPUT_FILE")
TOTAL_METRICS=$((TOTAL_METRICS + METRIC_COUNT))
# Check for errors in this evaluation
ERROR_COUNT=$(jq ".suites[$j].evaluations[$k].errors | length" "$OUTPUT_FILE")
TOTAL_ERRORS=$((TOTAL_ERRORS + ERROR_COUNT))
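                            # Each metric is serialized as a [name, value] pair; boolean results
                            # carry their value under a "Boolean" key (the shape assumed by the
                            # jq filters below)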
                            # Count boolean metrics (passed and failed)
                            BOOLEAN_COUNT=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
                                select(.[1].Boolean != null) | .[0]" "$OUTPUT_FILE" | wc -l | tr -d ' ')
                            # Count failed boolean metrics
                            FAILURES=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
                                select(
                                    .[1].Boolean == false or .[1].Boolean == \"false\" or .[1].Boolean == 0 or .[1].Boolean == \"0\"
                                ) | .[0]" "$OUTPUT_FILE" | wc -l | tr -d ' ')
                            # Count passed boolean metrics
                            PASSES=$((BOOLEAN_COUNT - FAILURES))
                            # Count non-boolean metrics
                            NON_BOOLEAN=$((METRIC_COUNT - BOOLEAN_COUNT))
                            # Update global counters
                            FAILED_METRICS=$((FAILED_METRICS + FAILURES))
                            PASSED_METRICS=$((PASSED_METRICS + PASSES))
                            OTHER_METRICS=$((OTHER_METRICS + NON_BOOLEAN))
                            if [ "$FAILURES" -gt 0 ] || [ "$ERROR_COUNT" -gt 0 ]; then
                                echo "$EVAL_NAME:" >> "$ANALYSIS_FILE"
                                if [ "$FAILURES" -gt 0 ]; then
                                    echo " - $FAILURES metric failures detected" >> "$ANALYSIS_FILE"
                                    # Print the specific failing metrics
                                    FAILING_METRICS=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
                                        select(
                                            .[1].Boolean == false or .[1].Boolean == \"false\" or .[1].Boolean == 0 or .[1].Boolean == \"0\"
                                        ) | .[0]" "$OUTPUT_FILE")
                                    echo " Failed metrics:" >> "$ANALYSIS_FILE"
                                    echo "$FAILING_METRICS" | sed 's/^/ - /' >> "$ANALYSIS_FILE"
                                fi
                                if [ "$ERROR_COUNT" -gt 0 ]; then
                                    echo " - $ERROR_COUNT errors detected" >> "$ANALYSIS_FILE"
                                    # Print the errors
                                    jq -r ".suites[$j].evaluations[$k].errors[] | \" [\(.level)] \(.message)\"" "$OUTPUT_FILE" >> "$ANALYSIS_FILE"
                                fi
                            else
                                # No metric failures or errors for this evaluation
                                echo "$EVAL_NAME: All metrics passed, no errors" >> "$ANALYSIS_FILE"
                            fi
                        done
                        echo "" >> "$ANALYSIS_FILE"
                    done
                    # Print summary
                    echo "Summary:" >> "$ANALYSIS_FILE"
                    echo "-------" >> "$ANALYSIS_FILE"
                    echo "Total Evaluations: $TOTAL_EVALS" >> "$ANALYSIS_FILE"
                    echo "Total Metrics: $TOTAL_METRICS" >> "$ANALYSIS_FILE"
                    echo "Passed Metrics: $PASSED_METRICS" >> "$ANALYSIS_FILE"
                    echo "Failed Metrics: $FAILED_METRICS" >> "$ANALYSIS_FILE"
                    echo "Other Metrics: $OTHER_METRICS" >> "$ANALYSIS_FILE"
                    echo "Total Errors: $TOTAL_ERRORS" >> "$ANALYSIS_FILE"
                    # Verification of metrics counting
                    COUNTED_METRICS=$((PASSED_METRICS + FAILED_METRICS + OTHER_METRICS))
                    if [ "$COUNTED_METRICS" -ne "$TOTAL_METRICS" ]; then
                        echo "⚠️ Metrics counting discrepancy: $COUNTED_METRICS counted vs $TOTAL_METRICS total" >> "$ANALYSIS_FILE"
                    fi
                    # Determine success/failure
                    if [ "$FAILED_METRICS" -gt 0 ] || [ "$TOTAL_ERRORS" -gt 0 ]; then
                        if [ "$FAILED_METRICS" -gt 0 ]; then
                            echo "❌ Benchmark has $FAILED_METRICS failed metrics" >> "$ANALYSIS_FILE"
                        fi
                        if [ "$TOTAL_ERRORS" -gt 0 ]; then
                            echo "❌ Benchmark has $TOTAL_ERRORS errors" >> "$ANALYSIS_FILE"
                        fi
                        echo "❌ Tests failed for $provider/$model" | tee -a "$SUMMARY_FILE"
                        cat "$ANALYSIS_FILE" >> "$SUMMARY_FILE"
                        OVERALL_SUCCESS=false
                    else
                        echo "✅ All metrics passed successfully, no errors" >> "$ANALYSIS_FILE"
                        echo "✅ All tests passed for $provider/$model" | tee -a "$SUMMARY_FILE"
                        cat "$ANALYSIS_FILE" >> "$SUMMARY_FILE"
                    fi
                else
                    echo "❌ Invalid JSON in benchmark output" | tee -a "$SUMMARY_FILE"
                    OVERALL_SUCCESS=false
                fi
            fi
        else
            echo "❌ Benchmark output file not found" | tee -a "$SUMMARY_FILE"
            OVERALL_SUCCESS=false
        fi
    else
        echo "❌ Benchmark failed to run" | tee -a "$SUMMARY_FILE"
        OVERALL_SUCCESS=false
    fi
    echo "" >> "$SUMMARY_FILE"
    echo ""
done
echo "=========================================================="
echo "Benchmark run completed"
echo "Results saved to: $OUTPUT_DIR"
echo "Summary file: $SUMMARY_FILE"
# Output final status
if [ "$OVERALL_SUCCESS" = false ]; then
echo "❌ Some benchmarks failed. Check the summary for details."
exit 1
else
echo "✅ All benchmarks completed successfully."
exit 0
fi