[feat] goosebenchv2 additions for eval post-processing (#2619)

Co-authored-by: Alice Hau <ahau@squareup.com>
2026-02-23 15:34:27 +01:00 · 2025-05-21 15:00:13 -04:00
parent 8fade6b320
commit be09849128
18 changed files with 1471 additions and 106 deletions
--- a/scripts/bench-postprocess-scripts/llm-judges/blog_summary_prompt.txt
+++ b/scripts/bench-postprocess-scripts/llm-judges/blog_summary_prompt.txt
@@ -0,0 +1,8 @@
+You are evaluating a response to a summarization task and will give a score of 0, 1, or 2. The instructions were:
+
+'What are the top 5 most counterintuitive insights from this blog post? https://huyenchip.com/2025/01/07/agents.html'
+
+Does the response below appropriately answer the query (ignore formatting)?
+0 = does not provide any insights at all
+1 = provides some insights, but not all 5
+2 = provides all 5 insights
--- a/scripts/bench-postprocess-scripts/llm-judges/calculate_final_scores_vibes.py
+++ b/scripts/bench-postprocess-scripts/llm-judges/calculate_final_scores_vibes.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""
+Calculate final score for vibes evaluations.
+This script combines the LLM judge score with other metrics to produce a final score.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+
+def get_metric_value(metrics, metric_name):
+    """Return the first usable value stored under metric_name as a float, or None."""
+    for entry in metrics:
+        if entry[0] != metric_name:
+            continue
+        payload = entry[1]
+        for tag in ("Float", "Integer"):
+            if tag in payload:
+                return float(payload[tag])
+        if "Boolean" in payload:
+            return 1.0 if payload["Boolean"] else 0.0
+    return None
+
+
+def calculate_score(eval_name, metrics):
+    """Combine the judge score with the fetch-tool and markdown checks into one score.
+
+    Raises ValueError when llm_judge_score is absent or eval_name is not recognised.
+    """
+    judge = get_metric_value(metrics, "llm_judge_score")
+    fetched = get_metric_value(metrics, "used_fetch_tool")
+    markdown_ok = get_metric_value(metrics, "valid_markdown_format")
+
+    if judge is None:
+        raise ValueError("llm_judge_score not found in metrics")
+
+    # A missing or zero-valued check contributes 0, anything truthy contributes 1
+    fetch_points = 1.0 if fetched else 0.0
+    markdown_points = 1.0 if markdown_ok else 0.0
+
+    if eval_name not in ("blog_summary", "restaurant_research"):
+        raise ValueError(f"Unknown evaluation type: {eval_name}")
+
+    # Both evals share one formula: with a judge score in [0, 2] and two checks
+    # in [0, 1] the sum is at most 4.0
+    return (judge + fetch_points + markdown_points) / 4.0
+
+
+def main():
+    """Append the combined "score" metric to eval-results.json in the current directory.
+
+    Expects exactly one CLI argument, the eval name; exits with status 1 on any failure.
+    """
+    if len(sys.argv) != 2:
+        print("Usage: calculate_final_scores_vibes.py <eval_name>")
+        sys.exit(1)
+
+    eval_name = sys.argv[1]
+
+    eval_results_path = Path("eval-results.json")
+    if not eval_results_path.exists():
+        print("Error: eval-results.json not found in current directory")
+        sys.exit(1)
+
+    # UTF-8 is pinned so decoding does not depend on the platform locale
+    with open(eval_results_path, 'r', encoding="utf-8") as f:
+        eval_results = json.load(f)
+
+    try:
+        metrics = eval_results["metrics"]
+        score = calculate_score(eval_name, metrics)
+
+        # Drop any "score" left by an earlier run so re-runs do not stack duplicates
+        metrics[:] = [m for m in metrics if m[0] != "score"]
+        metrics.append(["score", {"Float": score}])
+
+        with open(eval_results_path, 'w', encoding="utf-8") as f:
+            json.dump(eval_results, f, indent=2)
+
+        print(f"Successfully added final score: {score}")
+    except Exception as e:
+        print(f"Error calculating final score: {str(e)}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/bench-postprocess-scripts/llm-judges/llm_judge.py
+++ b/scripts/bench-postprocess-scripts/llm-judges/llm_judge.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""
+LLM Judge post-processing script for Goose benchmarks.
+
+This script evaluates benchmark results using OpenAI's API as a judge.
+It reads the eval-results.json file and a specified output file, then uses
+OpenAI to score the output based on a provided rubric.
+
+Usage:
+    python llm_judge.py <output_file> [--rubric-max-score N] [--prompt-file PATH]
+    
+Arguments:
+    output_file: Name of the file containing the output to evaluate (e.g., blog_summary_output.txt)
+    --rubric-max-score: Maximum score for the rubric (default: 2)
+    --prompt-file: Path to custom evaluation prompt file
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+from collections import Counter
+from pathlib import Path
+from typing import Dict, Any
+
+try:
+    from openai import OpenAI
+except ImportError:
+    print("Error: openai package not found. Please install it with: pip install openai")
+    sys.exit(1)
+
+
+def evaluate_with_openai(prompt: str, text: str, rubric_max_score: int = 2) -> float:
+    """Score a response with an OpenAI judge and return the most common score.
+
+    The judge is sampled three times; when all three scores differ, a fourth
+    tie-breaker sample is drawn before the most frequent score is chosen.
+    Each sample is requested from gpt-4o at temperature 0.9; unparseable replies
+    are retried, while API errors abort the evaluation.
+
+    Args:
+        prompt: Evaluation rubric/instructions for the judge
+        text: Text to evaluate
+        rubric_max_score: Maximum score for the rubric (default: 2)
+
+    Returns:
+        float: The most common score, clamped to the range 0 to rubric_max_score
+
+    Raises:
+        ValueError: If OPENAI_API_KEY is not set, if the API call fails, or if
+            the judge's replies cannot be parsed after all retries
+    """
+    print("Starting OpenAI evaluation...")
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        print("No OpenAI API key found!")
+        raise ValueError("OPENAI_API_KEY environment variable is not set, but is needed to run this evaluation.")
+
+    try:
+        client = OpenAI(api_key=api_key)
+
+        # Append output instructions to the rubric prompt
+        output_instructions = f"""
+Output Instructions:
+Return your evaluation as a JSON object in the following format:
+{{
+    "reasoning": "Your brief reasoning for the score",
+    "score": <integer between 0 and {rubric_max_score}>
+}}
+
+IMPORTANT: 
+- Do not use any markdown formatting (no ```json blocks)
+- Do not include any additional text before or after the JSON
+- Return only the raw JSON object
+- The score must be an integer between 0 and {rubric_max_score}"""
+
+        input_prompt = f"{prompt} {output_instructions}\nResponse to evaluate: {text}"
+
+        # Sample the judge three times and collect the scores
+        scores = []
+        for i in range(3):
+            score = _request_judge_score(client, input_prompt, rubric_max_score)
+            scores.append(score)
+            print(f"Run {i+1} score: {score}")
+
+        # Three distinct scores means there is no majority: draw one more sample
+        if max(Counter(scores).values()) == 1:
+            print("No majority score found. Running tie-breaker...")
+            score = _request_judge_score(client, input_prompt, rubric_max_score, tie_breaker=True)
+            scores.append(score)
+            print(f"Tie-breaker score: {score}")
+
+        # Get the most common score
+        score_counts = Counter(scores)
+        most_common_score = score_counts.most_common(1)[0][0]
+        print(f"Most common score: {most_common_score} (occurred {score_counts[most_common_score]} times)")
+        return most_common_score
+
+    except Exception as e:
+        if "OPENAI_API_KEY" in str(e):
+            raise  # Re-raise API key errors
+        print(f"Error evaluating with OpenAI: {str(e)}")
+        raise ValueError(f"OpenAI evaluation failed: {str(e)}") from e
+
+
+def _request_judge_score(client, input_prompt, rubric_max_score, tie_breaker=False, max_retries=5):
+    """Request one judge sample and return its clamped score.
+
+    Replies that cannot be parsed are retried up to max_retries times, one
+    second apart; API errors are re-raised immediately without a retry.
+
+    Raises:
+        ValueError: If every attempt returned an unparseable reply
+    """
+    # The log wording differs between regular samples and the tie-breaker
+    subject = "tie-breaker" if tie_breaker else "OpenAI"
+    failure_subject = "tie-breaker" if tie_breaker else "OpenAI evaluation"
+    api_error_prefix = "API error in tie-breaker" if tie_breaker else "API error"
+    retry_message = "Retrying tie-breaker..." if tie_breaker else "Retrying..."
+
+    for attempt in range(1, max_retries + 1):
+        try:
+            response = client.chat.completions.create(
+                model="gpt-4o",
+                messages=[
+                    {"role": "user", "content": input_prompt}
+                ],
+                temperature=0.9
+            )
+        except Exception as e:
+            # Only the API call is guarded here, so parse failures are never mislabelled
+            print(f"{api_error_prefix}: {str(e)}")
+            raise
+
+        # content may be None; an empty string then fails parsing and is retried
+        response_text = (response.choices[0].message.content or "").strip()
+        try:
+            return _parse_judge_score(response_text, rubric_max_score)
+        except (json.JSONDecodeError, ValueError, TypeError, KeyError) as e:
+            print(f"Error parsing {subject} response as JSON (attempt {attempt}/{max_retries}): {str(e)}")
+            print(f"Response text: {response_text}")
+            if attempt == max_retries:
+                raise ValueError(f"Failed to parse {failure_subject} response after {max_retries} attempts: {str(e)}") from e
+            print(retry_message)
+            time.sleep(1)  # Wait 1 second before retrying
+
+    raise ValueError(f"Failed to parse {failure_subject} response: max_retries must be at least 1")
+
+
+def _parse_judge_score(response_text, rubric_max_score):
+    """Parse the judge's JSON reply and return its score clamped to the rubric range.
+
+    Raises:
+        json.JSONDecodeError: If the reply is not valid JSON
+        KeyError: If the JSON object has no "score" field
+        TypeError, ValueError: If the reply is not an object or the score is not numeric
+    """
+    evaluation = json.loads(response_text)
+    # A reply without a score is rejected instead of silently counting as 0
+    score = float(evaluation["score"])
+
+    # Clamp against a float bound so the result is always a float
+    return max(0.0, min(score, float(rubric_max_score)))
+
+
+def load_eval_results(working_dir: Path) -> Dict[str, Any]:
+    """Parse eval-results.json from working_dir; FileNotFoundError if it is missing."""
+    results_file = working_dir / "eval-results.json"
+
+    if results_file.exists():
+        # Decoded with the default text encoding
+        return json.loads(results_file.read_text())
+    raise FileNotFoundError(f"eval-results.json not found in {working_dir}")
+
+
+def load_output_file(working_dir: Path, output_file: str) -> str:
+    """Return the stripped text of output_file in working_dir; FileNotFoundError if absent."""
+    target = working_dir / output_file
+
+    if target.exists():
+        # Leading and trailing whitespace is trimmed from the returned text
+        return target.read_text().strip()
+    raise FileNotFoundError(f"Output file not found: {target}")
+
+
+def load_evaluation_prompt(working_dir: Path) -> str:
+    """Return the judge prompt: prompt.txt from working_dir, else a built-in default.
+
+    The file's contents are stripped of surrounding whitespace; the fallback is
+    a generic 0-2 quality rubric.
+    """
+    custom_prompt = working_dir / "prompt.txt"
+    if custom_prompt.exists():
+        return custom_prompt.read_text().strip()
+
+    # No prompt.txt present: use the built-in rubric
+    fallback = """You are an expert evaluator assessing the quality of AI responses.
+Evaluate the response based on the following criteria:
+- Accuracy and correctness
+- Completeness of the answer
+- Clarity and coherence
+- Helpfulness to the user
+
+Score the response on a scale from 0 to 2:
+0 = Poor response (incorrect, incomplete, or unhelpful)
+1 = Acceptable response (partially correct but with issues)
+2 = Excellent response (correct, complete, and helpful)"""
+    return fallback
+
+
+def main():
+    """Judge one benchmark output and record the result in eval-results.json.
+
+    Reads eval-results.json and the named output file from the current working
+    directory, scores the output with the OpenAI judge, and stores the score as
+    the "llm_judge_score" metric. Exits with status 1 on any error.
+    """
+    parser = argparse.ArgumentParser(description="LLM Judge post-processing script for Goose benchmarks")
+    parser.add_argument("output_file", type=str, help="Name of the output file to evaluate (e.g., blog_summary_output.txt)")
+    parser.add_argument("--rubric-max-score", type=int, default=2, help="Maximum score for the rubric (default: 2)")
+    parser.add_argument("--prompt-file", type=str, help="Path to custom evaluation prompt file")
+
+    args = parser.parse_args()
+
+    # Use current working directory
+    working_dir = Path.cwd()
+
+    try:
+        eval_results = load_eval_results(working_dir)
+        response_text = load_output_file(working_dir, args.output_file)
+
+        # An explicit --prompt-file wins over prompt.txt and the built-in default
+        if args.prompt_file:
+            with open(args.prompt_file, 'r') as f:
+                evaluation_prompt = f.read().strip()
+        else:
+            evaluation_prompt = load_evaluation_prompt(working_dir)
+
+        score = evaluate_with_openai(evaluation_prompt, response_text, args.rubric_max_score)
+
+        # Replace any score from an earlier run: a reader that takes the first
+        # "llm_judge_score" entry would otherwise keep seeing the stale value
+        metrics = eval_results["metrics"]
+        metrics[:] = [m for m in metrics if m[0] != "llm_judge_score"]
+        metrics.append(["llm_judge_score", {"Float": score}])
+
+        eval_results_path = working_dir / "eval-results.json"
+        with open(eval_results_path, 'w') as f:
+            json.dump(eval_results, f, indent=2)
+
+        print(f"Successfully updated eval-results.json with LLM judge score: {score}")
+    except Exception as e:
+        print(f"Error: {str(e)}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/bench-postprocess-scripts/llm-judges/restaurant_research_prompt.txt
+++ b/scripts/bench-postprocess-scripts/llm-judges/restaurant_research_prompt.txt
@@ -0,0 +1,15 @@
+You are evaluating an AI assistant's response to a restaurant research task. The instructions were:
+
+'Search the internet for and provide a current, detailed list of the best Sichuanese restaurants specifically in the East Village neighborhood of NYC. Format your response in Markdown using bullet points (either - or *) for each restaurant. For each restaurant include:
+- Restaurant name and what they're known for
+- Signature dishes
+- Atmosphere/setting
+- Any relevant details about reservations or dining experience
+- What distinguishes them from others
+
+Present the information in order of significance or quality. Focus specifically on Sichuanese establishments, not general Chinese restaurants. If you encounter a page you cannot access, try another one. Do not ask me for confirmation just conduct the searches yourself until you find the needed information. Remember to use your tools if applicable.'
+
+Give a score of 0, 1, or 2:
+0 = does not provide any restaurants at all
+1 = provides some restaurants, but not all are Sichuanese or in the East Village NYC
+2 = every restaurant provided is Sichuanese and in the East Village, probably including Mala Project, Szechuan Mountain House, or Uluh. Use your memory/knowledge of East Village NYC restaurants to double-check for any restaurant that is not actually in the East Village.
--- a/scripts/bench-postprocess-scripts/llm-judges/run_vibes_judge.sh
+++ b/scripts/bench-postprocess-scripts/llm-judges/run_vibes_judge.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Wrapper script for LLM judge post-processing and final score calculation
+# This script is called by the benchmark runner with the eval results file as an argument
+
+# Get the directory where this script is located
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+# The eval results file path is required as the first argument
+EVAL_RESULTS_FILE="$1"
+if [ -z "$EVAL_RESULTS_FILE" ]; then
+    echo "Usage: $(basename "$0") <path/to/eval-results.json>"
+    exit 1
+fi
+
+# Work from the directory holding the results; stop if it cannot be entered,
+# otherwise the steps below would run against whatever directory we started in
+WORKING_DIR="$(dirname "$EVAL_RESULTS_FILE")"
+cd "$WORKING_DIR" || { echo "Error: cannot change to directory $WORKING_DIR"; exit 1; }
+
+# Determine the evaluation name from the eval-results.json
+EVAL_NAME=$(python3 -c "import json; print(json.load(open('eval-results.json'))['name'])") || {
+    echo "Error: could not read the evaluation name from eval-results.json"
+    exit 1
+}
+
+# Set the output file name and prompt file based on the evaluation
+if [ "$EVAL_NAME" = "blog_summary" ]; then
+    OUTPUT_FILE="blog_summary_output.txt"
+    PROMPT_FILE="$SCRIPT_DIR/blog_summary_prompt.txt"
+elif [ "$EVAL_NAME" = "restaurant_research" ]; then
+    OUTPUT_FILE="restaurant_research_output.txt"
+    PROMPT_FILE="$SCRIPT_DIR/restaurant_research_prompt.txt"
+else
+    echo "Error: Unknown evaluation name: $EVAL_NAME"
+    exit 1
+fi
+
+# Run the LLM judge script with the appropriate arguments
+if ! python3 "$SCRIPT_DIR/llm_judge.py" "$OUTPUT_FILE" --prompt-file "$PROMPT_FILE"; then
+    echo "Error: LLM judge failed"
+    exit 1
+fi
+
+# Calculate the final score
+if ! python3 "$SCRIPT_DIR/calculate_final_scores_vibes.py" "$EVAL_NAME"; then
+    echo "Error: Final score calculation failed"
+    exit 1
+fi
+
+echo "Successfully completed post-processing for $EVAL_NAME"