#!/usr/bin/env bash
# run-benchmarks.sh - Script to run goose benchmarks across multiple provider:model pairs

set -e

# Display usage information
function show_usage() {
  echo "Usage: $0 [options]"
  echo ""
  echo "Options:"
  echo " -p, --provider-models Comma-separated list of provider:model pairs (e.g., 'openai:gpt-4o,anthropic:claude-3-5-sonnet')"
  echo " -s, --suites Comma-separated list of benchmark suites to run (e.g., 'core,small_models')"
  echo " -o, --output-dir Directory to store benchmark results (default: './benchmark-results')"
  echo " -d, --debug Use debug build instead of release build"
  echo " -t, --toolshim Enable toolshim mode by setting GOOSE_TOOLSHIM=1"
  echo " -m, --toolshim-model Set the toolshim model (sets GOOSE_TOOLSHIM_OLLAMA_MODEL)"
  echo " -h, --help Show this help message"
  echo ""
  echo "Example:"
  echo " $0 --provider-models 'openai:gpt-4o,anthropic:claude-3-5-sonnet' --suites 'core,small_models'"
}

# Parse command line arguments
PROVIDER_MODELS=""
SUITES=""
OUTPUT_DIR="./benchmark-results"
DEBUG_MODE=false
TOOLSHIM=false
TOOLSHIM_MODEL=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    -p|--provider-models)
      PROVIDER_MODELS="$2"
      shift 2
      ;;
    -s|--suites)
      SUITES="$2"
      shift 2
      ;;
    -o|--output-dir)
      OUTPUT_DIR="$2"
      shift 2
      ;;
    -d|--debug)
      DEBUG_MODE=true
      shift
      ;;
    -t|--toolshim)
      TOOLSHIM=true
      shift
      ;;
    -m|--toolshim-model)
      TOOLSHIM_MODEL="$2"
      shift 2
      ;;
    -h|--help)
      show_usage
      exit 0
      ;;
    *)
      echo "Error: Unknown option: $1"
      show_usage
      exit 1
      ;;
  esac
done

# Validate required parameters
if [[ -z "$PROVIDER_MODELS" ]]; then
  echo "Error: Provider-model pairs must be specified"
  show_usage
  exit 1
fi

if [[ -z "$SUITES" ]]; then
  echo "Error: Benchmark suites must be specified"
  show_usage
  exit 1
fi

# Create output directory
mkdir -p "$OUTPUT_DIR"

# Create a results summary file
SUMMARY_FILE="$OUTPUT_DIR/summary.md"
echo "# Benchmark Results Summary" > "$SUMMARY_FILE"
echo "Run date: $(date)" >> "$SUMMARY_FILE"
echo "Suites: $SUITES" >> "$SUMMARY_FILE"
if [ "$DEBUG_MODE" = true ]; then
  echo "Mode: Debug" >> "$SUMMARY_FILE"
else
  echo "Mode: Release" >> "$SUMMARY_FILE"
fi
if [ "$TOOLSHIM" = true ]; then
  echo "Toolshim: Enabled" >> "$SUMMARY_FILE"
  if [[ -n "$TOOLSHIM_MODEL" ]]; then
    echo "Toolshim Model: $TOOLSHIM_MODEL" >> "$SUMMARY_FILE"
  fi
fi
echo "" >> "$SUMMARY_FILE"

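# The header of summary.md therefore starts out roughly like (values differ per run):
#   # Benchmark Results Summary
#   Run date: <output of date>
#   Suites: core,small_models
#   Mode: Release
#   Toolshim: Enabled        (only when --toolshim is passed)
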
# Determine which binary to use
GOOSE_CMD="goose"
if [ "$DEBUG_MODE" = true ]; then
  if [ -f "./target/debug/goose" ]; then
    GOOSE_CMD="./target/debug/goose"
    echo "Using debug binary: $GOOSE_CMD"
  else
    echo "Warning: Debug binary not found at ./target/debug/goose. Falling back to system-installed goose."
  fi
else
  if [ -f "./target/release/goose" ]; then
    GOOSE_CMD="./target/release/goose"
    echo "Using release binary: $GOOSE_CMD"
  else
    echo "Warning: Release binary not found at ./target/release/goose. Falling back to system-installed goose."
  fi
fi
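# Note: the ./target/debug and ./target/release paths assume a standard Cargo layout;
# build the local binary first (e.g. `cargo build --release`, or a plain `cargo build`
# when using --debug) if you want it picked up instead of whatever `goose` is on PATH.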

# Parse provider:model pairs
PROVIDERS=()
MODELS=()

# Read provider:model pairs
IFS=',' read -ra PAIRS <<< "$PROVIDER_MODELS"
for pair in "${PAIRS[@]}"; do
  # Split by colon
  IFS=':' read -r provider model <<< "$pair"
  if [[ -n "$provider" && -n "$model" ]]; then
    PROVIDERS+=("$provider")
    MODELS+=("$model")
  else
    echo "Warning: Invalid provider:model pair: $pair. Skipping."
  fi
done
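# Example: --provider-models 'openai:gpt-4o,anthropic:claude-3-5-sonnet' yields
#   PROVIDERS=(openai anthropic) and MODELS=(gpt-4o claude-3-5-sonnet)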

# Track overall success
OVERALL_SUCCESS=true
COUNT=${#PROVIDERS[@]}

echo "Running benchmarks for $COUNT provider:model pairs..."
echo "Benchmark suites: $SUITES"
echo ""

# Loop through each provider-model pair
for ((i = 0; i < COUNT; i++)); do
  provider="${PROVIDERS[i]}"
  model="${MODELS[i]}"

  echo "=========================================================="
  echo "Provider: $provider, Model: $model"
  echo "=========================================================="

  echo "## Provider: $provider, Model: $model" >> "$SUMMARY_FILE"

  # Set environment variables for this provider/model instead of using configure
  export GOOSE_PROVIDER="$provider"
  export GOOSE_MODEL="$model"

  # Set toolshim environment variables if enabled
  if [ "$TOOLSHIM" = true ]; then
    export GOOSE_TOOLSHIM=1
    if [[ -n "$TOOLSHIM_MODEL" ]]; then
      export GOOSE_TOOLSHIM_OLLAMA_MODEL="$TOOLSHIM_MODEL"
    fi
  fi
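
  # (GOOSE_PROVIDER, GOOSE_MODEL, GOOSE_TOOLSHIM and GOOSE_TOOLSHIM_OLLAMA_MODEL are only
  # exported here; the goose binary is assumed to read them in place of `goose configure`.)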

  # Run the benchmark and save results to JSON
  echo "Running benchmark for $provider/$model with suites: $SUITES"
  OUTPUT_FILE="$OUTPUT_DIR/${provider}-${model}.json"
  ANALYSIS_FILE="$OUTPUT_DIR/${provider}-${model}-analysis.txt"

  if $GOOSE_CMD bench --suites "$SUITES" --output "$OUTPUT_FILE" --format json; then
    echo "✅ Benchmark completed successfully" | tee -a "$SUMMARY_FILE"

    # Parse the JSON to check for failures
    if [ -f "$OUTPUT_FILE" ]; then
      # Check if jq is installed
      if ! command -v jq &> /dev/null; then
        echo "Warning: jq not found. Cannot parse JSON results."
        echo "⚠️ Could not parse results (jq not installed)" >> "$SUMMARY_FILE"
      else
        # Basic validation of the JSON file
        if jq empty "$OUTPUT_FILE" 2>/dev/null; then
          # Extract basic information
          PROVIDER_NAME=$(jq -r '.provider' "$OUTPUT_FILE")
          START_TIME=$(jq -r '.start_time' "$OUTPUT_FILE")
          SUITE_COUNT=$(jq '.suites | length' "$OUTPUT_FILE")

          echo "Benchmark Results Analysis" > "$ANALYSIS_FILE"
          echo "-------------------------" >> "$ANALYSIS_FILE"
          echo "Provider: $PROVIDER_NAME" >> "$ANALYSIS_FILE"
          echo "Start Time: $START_TIME" >> "$ANALYSIS_FILE"
          echo "Number of Suites: $SUITE_COUNT" >> "$ANALYSIS_FILE"
          echo "" >> "$ANALYSIS_FILE"

          # Initialize counters
          TOTAL_EVALS=0
          TOTAL_METRICS=0
          FAILED_METRICS=0
          PASSED_METRICS=0
          OTHER_METRICS=0
          TOTAL_ERRORS=0

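          # Assumed shape of the benchmark JSON, inferred from the fields read below:
          #   { "provider": ..., "start_time": ..., "suites": [ { "name": ...,
          #       "evaluations": [ { "name": ..., "errors": [ ... ],
          #         "metrics": [ [ "metric_name", { "Boolean": true } ], ... ] } ] } ] }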
          # Process each suite
          for j in $(seq 0 $((SUITE_COUNT-1))); do
            SUITE_NAME=$(jq -r ".suites[$j].name" "$OUTPUT_FILE")
            EVAL_COUNT=$(jq ".suites[$j].evaluations | length" "$OUTPUT_FILE")
            TOTAL_EVALS=$((TOTAL_EVALS + EVAL_COUNT))

            echo "Suite: $SUITE_NAME ($EVAL_COUNT evaluations)" >> "$ANALYSIS_FILE"

            # Process each evaluation in this suite
            for k in $(seq 0 $((EVAL_COUNT-1))); do
              EVAL_NAME=$(jq -r ".suites[$j].evaluations[$k].name" "$OUTPUT_FILE")
              METRIC_COUNT=$(jq ".suites[$j].evaluations[$k].metrics | length" "$OUTPUT_FILE")
              TOTAL_METRICS=$((TOTAL_METRICS + METRIC_COUNT))

              # Check for errors in this evaluation
              ERROR_COUNT=$(jq ".suites[$j].evaluations[$k].errors | length" "$OUTPUT_FILE")
              TOTAL_ERRORS=$((TOTAL_ERRORS + ERROR_COUNT))

              # Count boolean metrics (passed and failed)
              BOOLEAN_COUNT=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
                select(.[1].Boolean != null) | .[0]" "$OUTPUT_FILE" | wc -l | tr -d ' ')

              # Count failed boolean metrics
              FAILURES=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
                select(
                  .[1].Boolean == false or .[1].Boolean == \"false\" or .[1].Boolean == 0 or .[1].Boolean == \"0\"
                ) | .[0]" "$OUTPUT_FILE" | wc -l | tr -d ' ')

              # Count passed boolean metrics
              PASSES=$((BOOLEAN_COUNT - FAILURES))

              # Count non-boolean metrics
              NON_BOOLEAN=$((METRIC_COUNT - BOOLEAN_COUNT))

              # Update global counters
              FAILED_METRICS=$((FAILED_METRICS + FAILURES))
              PASSED_METRICS=$((PASSED_METRICS + PASSES))
              OTHER_METRICS=$((OTHER_METRICS + NON_BOOLEAN))

if [ "$FAILURES" -gt 0 ] || [ "$ERROR_COUNT" -gt 0 ]; then
|
|
echo " ❌ $EVAL_NAME:" >> "$ANALYSIS_FILE"
|
|
|
|
if [ "$FAILURES" -gt 0 ]; then
|
|
echo " - $FAILURES metric failures detected" >> "$ANALYSIS_FILE"
|
|
# Print the specific failing metrics
|
|
FAILING_METRICS=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
|
|
select(
|
|
.[1].Boolean == false or .[1].Boolean == \"false\" or .[1].Boolean == 0 or .[1].Boolean == \"0\"
|
|
) | .[0]" "$OUTPUT_FILE")
|
|
echo " Failed metrics:" >> "$ANALYSIS_FILE"
|
|
echo "$FAILING_METRICS" | sed 's/^/ - /' >> "$ANALYSIS_FILE"
|
|
fi
|
|
|
|
if [ "$ERROR_COUNT" -gt 0 ]; then
|
|
echo " - $ERROR_COUNT errors detected" >> "$ANALYSIS_FILE"
|
|
# Print the errors
|
|
jq -r ".suites[$j].evaluations[$k].errors[] | \" [\(.level)] \(.message)\"" "$OUTPUT_FILE" >> "$ANALYSIS_FILE"
|
|
fi
|
|
else
|
|
# This line is no longer needed since we count passes/fails/others individually
|
|
echo " ✅ $EVAL_NAME: All metrics passed, no errors" >> "$ANALYSIS_FILE"
|
|
fi
|
|
done
|
|
echo "" >> "$ANALYSIS_FILE"
|
|
done

          # Print summary
          echo "Summary:" >> "$ANALYSIS_FILE"
          echo "-------" >> "$ANALYSIS_FILE"
          echo "Total Evaluations: $TOTAL_EVALS" >> "$ANALYSIS_FILE"
          echo "Total Metrics: $TOTAL_METRICS" >> "$ANALYSIS_FILE"
          echo "Passed Metrics: $PASSED_METRICS" >> "$ANALYSIS_FILE"
          echo "Failed Metrics: $FAILED_METRICS" >> "$ANALYSIS_FILE"
          echo "Other Metrics: $OTHER_METRICS" >> "$ANALYSIS_FILE"
          echo "Total Errors: $TOTAL_ERRORS" >> "$ANALYSIS_FILE"

          # Verification of metrics counting
          COUNTED_METRICS=$((PASSED_METRICS + FAILED_METRICS + OTHER_METRICS))
          if [ "$COUNTED_METRICS" -ne "$TOTAL_METRICS" ]; then
            echo "⚠️ Metrics counting discrepancy: $COUNTED_METRICS counted vs $TOTAL_METRICS total" >> "$ANALYSIS_FILE"
          fi

          # Determine success/failure
          if [ "$FAILED_METRICS" -gt 0 ] || [ "$TOTAL_ERRORS" -gt 0 ]; then
            if [ "$FAILED_METRICS" -gt 0 ]; then
              echo "❌ Benchmark has $FAILED_METRICS failed metrics" >> "$ANALYSIS_FILE"
            fi
            if [ "$TOTAL_ERRORS" -gt 0 ]; then
              echo "❌ Benchmark has $TOTAL_ERRORS errors" >> "$ANALYSIS_FILE"
            fi
            echo "❌ Tests failed for $provider/$model" | tee -a "$SUMMARY_FILE"
            cat "$ANALYSIS_FILE" >> "$SUMMARY_FILE"
            OVERALL_SUCCESS=false
          else
            echo "✅ All metrics passed successfully, no errors" >> "$ANALYSIS_FILE"
            echo "✅ All tests passed for $provider/$model" | tee -a "$SUMMARY_FILE"
            cat "$ANALYSIS_FILE" >> "$SUMMARY_FILE"
          fi
        else
          echo "❌ Invalid JSON in benchmark output" | tee -a "$SUMMARY_FILE"
          OVERALL_SUCCESS=false
        fi
      fi
    else
      echo "❌ Benchmark output file not found" | tee -a "$SUMMARY_FILE"
      OVERALL_SUCCESS=false
    fi
  else
    echo "❌ Benchmark failed to run" | tee -a "$SUMMARY_FILE"
    OVERALL_SUCCESS=false
  fi

  echo "" >> "$SUMMARY_FILE"
  echo ""
done


echo "=========================================================="
echo "Benchmark run completed"
echo "Results saved to: $OUTPUT_DIR"
echo "Summary file: $SUMMARY_FILE"

# Output final status
if [ "$OVERALL_SUCCESS" = false ]; then
  echo "❌ Some benchmarks failed. Check the summary for details."
  exit 1
else
  echo "✅ All benchmarks completed successfully."
  exit 0
fi