mirror of
https://github.com/aljazceru/goose.git
synced 2025-12-17 22:24:21 +01:00
[feat] goosebenchv2 additions for eval post-processing (#2619)
Co-authored-by: Alice Hau <ahau@squareup.com>
This commit is contained in:
50
scripts/bench-postprocess-scripts/llm-judges/run_vibes_judge.sh
Executable file
50
scripts/bench-postprocess-scripts/llm-judges/run_vibes_judge.sh
Executable file
@@ -0,0 +1,50 @@
|
||||
#!/bin/bash
# Wrapper script for LLM judge post-processing and final score calculation.
# Called by the benchmark runner with the eval results file as its single
# argument, e.g.:
#   run_vibes_judge.sh path/to/eval-results.json
#
# Requires: python3 on PATH; llm_judge.py, calculate_final_scores_vibes.py,
# and the per-eval prompt files living next to this script.

# Fail fast: exit on errors, unset variables, and pipeline failures.
set -euo pipefail

# Get the directory where this script is located (prompts and helper
# scripts are resolved relative to it).
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Require the eval results file path as the first argument.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <eval-results-file>" >&2
    exit 1
fi
EVAL_RESULTS_FILE="$1"

if [ ! -f "$EVAL_RESULTS_FILE" ]; then
    echo "Error: eval results file not found: $EVAL_RESULTS_FILE" >&2
    exit 1
fi

# The eval's working directory is wherever the results file lives.
WORKING_DIR="$(dirname "$EVAL_RESULTS_FILE")"

# Change to the working directory; abort if it is inaccessible.
cd "$WORKING_DIR" || { echo "Error: cannot cd to $WORKING_DIR" >&2; exit 1; }

# Determine the evaluation name from eval-results.json.
# NOTE(review): the filename is hard-coded here (not derived from $1's
# basename), matching the original contract with the benchmark runner.
EVAL_NAME=$(python3 -c "import json; print(json.load(open('eval-results.json'))['name'])")

# Map the evaluation name to its agent-output file and judge prompt.
case "$EVAL_NAME" in
    blog_summary)
        OUTPUT_FILE="blog_summary_output.txt"
        PROMPT_FILE="$SCRIPT_DIR/blog_summary_prompt.txt"
        ;;
    restaurant_research)
        OUTPUT_FILE="restaurant_research_output.txt"
        PROMPT_FILE="$SCRIPT_DIR/restaurant_research_prompt.txt"
        ;;
    *)
        echo "Error: Unknown evaluation name: $EVAL_NAME" >&2
        exit 1
        ;;
esac

# Run the LLM judge; abort on failure.
if ! python3 "$SCRIPT_DIR/llm_judge.py" "$OUTPUT_FILE" --prompt-file "$PROMPT_FILE"; then
    echo "Error: LLM judge failed" >&2
    exit 1
fi

# Calculate the final score; abort on failure.
if ! python3 "$SCRIPT_DIR/calculate_final_scores_vibes.py" "$EVAL_NAME"; then
    echo "Error: Final score calculation failed" >&2
    exit 1
fi

echo "Successfully completed post-processing for $EVAL_NAME"
|
||||
Reference in New Issue
Block a user