mirror of
https://github.com/aljazceru/goose.git
synced 2025-12-17 22:24:21 +01:00
[feat] goosebenchv2 additions for eval post-processing (#2619)
Co-authored-by: Alice Hau <ahau@squareup.com>
This commit is contained in:
50
scripts/bench-postprocess-scripts/llm-judges/run_vibes_judge.sh
Executable file
50
scripts/bench-postprocess-scripts/llm-judges/run_vibes_judge.sh
Executable file
@@ -0,0 +1,50 @@
|
||||
#!/bin/bash
# Wrapper script for LLM judge post-processing and final score calculation.
# Called by the benchmark runner with the eval results file as its single
# argument, e.g.:
#   run_vibes_judge.sh path/to/eval-results.json
#
# Requires: python3 on PATH; llm_judge.py, calculate_final_scores_vibes.py,
# and the per-eval prompt files living next to this script.

# Fail fast: exit on errors, unset variables, and pipeline failures.
set -euo pipefail

# Get the directory where this script is located (prompts and helper
# scripts are resolved relative to it).
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Require the eval results file path as the first argument.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <eval-results-file>" >&2
    exit 1
fi
EVAL_RESULTS_FILE="$1"

if [ ! -f "$EVAL_RESULTS_FILE" ]; then
    echo "Error: eval results file not found: $EVAL_RESULTS_FILE" >&2
    exit 1
fi

# The eval's working directory is wherever the results file lives.
WORKING_DIR="$(dirname "$EVAL_RESULTS_FILE")"

# Change to the working directory; abort if it is inaccessible.
cd "$WORKING_DIR" || { echo "Error: cannot cd to $WORKING_DIR" >&2; exit 1; }

# Determine the evaluation name from eval-results.json.
# NOTE(review): the filename is hard-coded here (not derived from $1's
# basename), matching the original contract with the benchmark runner.
EVAL_NAME=$(python3 -c "import json; print(json.load(open('eval-results.json'))['name'])")

# Map the evaluation name to its agent-output file and judge prompt.
case "$EVAL_NAME" in
    blog_summary)
        OUTPUT_FILE="blog_summary_output.txt"
        PROMPT_FILE="$SCRIPT_DIR/blog_summary_prompt.txt"
        ;;
    restaurant_research)
        OUTPUT_FILE="restaurant_research_output.txt"
        PROMPT_FILE="$SCRIPT_DIR/restaurant_research_prompt.txt"
        ;;
    *)
        echo "Error: Unknown evaluation name: $EVAL_NAME" >&2
        exit 1
        ;;
esac

# Run the LLM judge; abort on failure.
if ! python3 "$SCRIPT_DIR/llm_judge.py" "$OUTPUT_FILE" --prompt-file "$PROMPT_FILE"; then
    echo "Error: LLM judge failed" >&2
    exit 1
fi

# Calculate the final score; abort on failure.
if ! python3 "$SCRIPT_DIR/calculate_final_scores_vibes.py" "$EVAL_NAME"; then
    echo "Error: Final score calculation failed" >&2
    exit 1
fi

echo "Successfully completed post-processing for $EVAL_NAME"
|
||||
Reference in New Issue
Block a user