#!/bin/bash

# Wrapper script for LLM judge post-processing and final score calculation
# This script is called by the benchmark runner with the eval results file as an argument
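#
# Example invocation (illustrative only; the wrapper's actual filename depends on
# the benchmark setup, so "judge_wrapper.sh" is just a placeholder):
#   ./judge_wrapper.sh /path/to/run_dir/eval-results.json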

# Get the directory where this script is located
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Get the eval results file path from the first argument
EVAL_RESULTS_FILE="$1"
if [ -z "$EVAL_RESULTS_FILE" ]; then
    echo "Error: No eval results file provided"
    exit 1
fi

# Extract the working directory from the eval results file path
WORKING_DIR="$(dirname "$EVAL_RESULTS_FILE")"

# Change to the working directory (fail fast if it does not exist)
cd "$WORKING_DIR" || { echo "Error: Could not cd to $WORKING_DIR"; exit 1; }

# Determine the evaluation name from the eval-results.json
EVAL_NAME=$(python3 -c "import json; print(json.load(open('eval-results.json'))['name'])")
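# The command above assumes eval-results.json contains at least a top-level
# "name" field, e.g. {"name": "blog_summary", ...}; nothing else in the file
# is read here.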

# Set the output file name and prompt file based on the evaluation
if [ "$EVAL_NAME" = "blog_summary" ]; then
    OUTPUT_FILE="blog_summary_output.txt"
    PROMPT_FILE="$SCRIPT_DIR/blog_summary_prompt.txt"
elif [ "$EVAL_NAME" = "restaurant_research" ]; then
    OUTPUT_FILE="restaurant_research_output.txt"
    PROMPT_FILE="$SCRIPT_DIR/restaurant_research_prompt.txt"
else
    echo "Error: Unknown evaluation name: $EVAL_NAME"
    exit 1
fi
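# Note: judging by the pattern above, supporting a new evaluation would presumably
# mean adding another branch with its own <eval_name>_output.txt and a matching
# <eval_name>_prompt.txt next to this script (an inference, not documented behavior).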

# Run the LLM judge script with the appropriate arguments
python3 "$SCRIPT_DIR/llm_judge.py" "$OUTPUT_FILE" --prompt-file "$PROMPT_FILE"
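# llm_judge.py is assumed to report failure via a non-zero exit status; the
# check below relies on that.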

# Check if LLM judge succeeded
if [ $? -ne 0 ]; then
    echo "Error: LLM judge failed"
    exit 1
fi

# Calculate the final score
python3 "$SCRIPT_DIR/calculate_final_scores_vibes.py" "$EVAL_NAME"

# Check if score calculation succeeded
if [ $? -ne 0 ]; then
    echo "Error: Final score calculation failed"
    exit 1
fi

echo "Successfully completed post-processing for $EVAL_NAME"