Files
goose/scripts/parse-benchmark-results.sh
2025-03-05 21:23:00 -05:00

93 lines
2.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# Script to parse goose-bench results and check for failures
#
# Usage: parse-benchmark-results.sh <benchmark-result-json-file>
#
# Requires: jq
# Exit status: 0 when no failing metric is found, 1 on bad usage, a missing
# result file, a missing jq, or when at least one metric failed.

# Strict mode:
#   -e           abort on any unhandled command failure
#   -u           treat use of an unset variable as an error
#   -o pipefail  a pipeline fails if ANY stage fails, so a failing command is
#                not masked by the exit status of the pipeline's last stage
set -euo pipefail

# Exactly one argument is expected: the path of the JSON result file.
if [ "$#" -ne 1 ]; then
  echo "Usage: $0 <benchmark-result-json-file>" >&2
  exit 1
fi

RESULT_FILE="$1"

if [ ! -f "$RESULT_FILE" ]; then
  echo "Error: Result file not found: $RESULT_FILE" >&2
  exit 1
fi

# Every query below goes through jq; fail early with a clear message
# instead of a bare "command not found" from the first query.
if ! command -v jq >/dev/null 2>&1; then
  echo "Error: jq is required but was not found in PATH" >&2
  exit 1
fi
# Read the run-level fields from the result file.
# --raw-output prints JSON strings without their surrounding quotes.
PROVIDER="$(jq --raw-output '.provider' "$RESULT_FILE")"
START_TIME="$(jq --raw-output '.start_time' "$RESULT_FILE")"
SUITE_COUNT="$(jq '.suites | length' "$RESULT_FILE")"

# Report header: one argument per output line, closed by a blank line.
printf '%s\n' \
  "Benchmark Results Analysis" \
  "-------------------------" \
  "Provider: $PROVIDER" \
  "Start Time: $START_TIME" \
  "Number of Suites: $SUITE_COUNT" \
  ""
# Initialize counters
TOTAL_EVALS=0
TOTAL_METRICS=0
FAILED_METRICS=0
PASSED_METRICS=0

# jq filter that prints one " - <metric name>" line per failing metric of
# evaluation $e in suite $s (both are supplied with --argjson).
# Each metric is read as a [name, value] pair. A metric counts as a failure
# when its name contains "success", "pass", or "correct" (case-insensitive)
# and its value is false, "false", 0, or "0".
# A missing/null metrics list is treated as empty, matching the 0 that
# `length` reports for it.
# shellcheck disable=SC2016 # $s and $e are jq variables, not shell ones
FAILING_METRICS_FILTER='
  (.suites[$s].evaluations[$e].metrics // [])[]
  | select(
      (.[0] | test("success|pass|correct"; "i")) and
      (.[1] == false or .[1] == "false" or .[1] == 0 or .[1] == "0")
    )
  | " - " + .[0]'

# Process each suite.
# A C-style loop is used instead of `seq 0 $((N-1))`: with N=0 that becomes
# `seq 0 -1`, which counts DOWN on BSD/macOS and yields bogus indices 0, -1.
for ((i = 0; i < SUITE_COUNT; i++)); do
  SUITE_NAME=$(jq -r --argjson s "$i" '.suites[$s].name' "$RESULT_FILE")
  EVAL_COUNT=$(jq --argjson s "$i" '.suites[$s].evaluations | length' "$RESULT_FILE")
  TOTAL_EVALS=$((TOTAL_EVALS + EVAL_COUNT))
  echo "Suite: $SUITE_NAME ($EVAL_COUNT evaluations)"

  # Process each evaluation in this suite
  for ((j = 0; j < EVAL_COUNT; j++)); do
    EVAL_NAME=$(jq -r --argjson s "$i" --argjson e "$j" \
      '.suites[$s].evaluations[$e].name' "$RESULT_FILE")
    METRIC_COUNT=$(jq --argjson s "$i" --argjson e "$j" \
      '.suites[$s].evaluations[$e].metrics | length' "$RESULT_FILE")
    TOTAL_METRICS=$((TOTAL_METRICS + METRIC_COUNT))

    # Collect the failing metrics with a single jq call. The plain assignment
    # (no pipeline) lets `set -e` abort the script if jq rejects the data,
    # instead of a downstream `wc -l` turning the error into "0 failures".
    FAILING_METRICS=$(jq -r --argjson s "$i" --argjson e "$j" \
      "$FAILING_METRICS_FILTER" "$RESULT_FILE")

    if [[ -n "$FAILING_METRICS" ]]; then
      # One output line per failing metric; the here-string supplies the
      # final newline, and tr strips the padding some wc versions add.
      FAILURES=$(wc -l <<<"$FAILING_METRICS" | tr -d ' ')
      FAILED_METRICS=$((FAILED_METRICS + FAILURES))
      # The remaining metrics of this evaluation did not fail, so they count
      # as passed; this keeps Passed + Failed equal to Total in the summary.
      PASSED_METRICS=$((PASSED_METRICS + METRIC_COUNT - FAILURES))
      echo "$EVAL_NAME: $FAILURES failures detected"
      # Print the specific failing metrics
      echo "$FAILING_METRICS"
    else
      PASSED_METRICS=$((PASSED_METRICS + METRIC_COUNT))
      echo "$EVAL_NAME: All metrics passed"
    fi
  done
  echo ""
done
# Report the aggregated totals, one line per counter.
printf '%s\n' \
  "Summary:" \
  "-------" \
  "Total Evaluations: $TOTAL_EVALS" \
  "Total Metrics: $TOTAL_METRICS" \
  "Passed Metrics: $PASSED_METRICS" \
  "Failed Metrics: $FAILED_METRICS"

# Exit status mirrors the result: 0 unless at least one metric failed.
# The success case is handled first as a guard; the negated -gt test keeps
# the same outcome as an if/else on "failed > 0".
if ! [ "$FAILED_METRICS" -gt 0 ]; then
  printf '%s\n' "✅ All metrics passed successfully"
  exit 0
fi

printf '%s\n' "❌ Benchmark has $FAILED_METRICS failures"
exit 1