Mirror of https://github.com/aljazceru/goose.git, synced 2025-12-18 22:54:24 +01:00
[feat] goosebenchv2 additions for eval post-processing (#2619)
Co-authored-by: Alice Hau <ahau@squareup.com>
scripts/bench-postprocess-scripts/llm-judges/llm_judge.py (new executable file, 259 lines added)
@@ -0,0 +1,259 @@
#!/usr/bin/env python3
"""
LLM Judge post-processing script for Goose benchmarks.

This script evaluates benchmark results using OpenAI's API as a judge.
It reads the eval-results.json file and a specified output file, then uses
OpenAI to score the output based on a provided rubric.

Usage:
    python llm_judge.py <output_file> [--rubric-max-score N] [--prompt-file PATH]

Arguments:
    output_file: Name of the file containing the output to evaluate (e.g., blog_summary_output.txt)
    --rubric-max-score: Maximum score for the rubric (default: 2)
    --prompt-file: Path to custom evaluation prompt file
"""

import argparse
import json
import os
import sys
import time
from collections import Counter
from pathlib import Path
from typing import Dict, Any

try:
    from openai import OpenAI
except ImportError:
    print("Error: openai package not found. Please install it with: pip install openai")
    sys.exit(1)


def evaluate_with_openai(prompt: str, text: str, rubric_max_score: int = 2) -> float:
    """Evaluate response using OpenAI's API.

    Args:
        prompt: System prompt for evaluation
        text: Text to evaluate
        rubric_max_score: Maximum score for the rubric (default: 2)

    Returns:
        float: Evaluation score (0 to rubric_max_score)

    Raises:
        ValueError: If OPENAI_API_KEY environment variable is not set
    """
    print("Starting OpenAI evaluation...")
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("No OpenAI API key found!")
        raise ValueError("OPENAI_API_KEY environment variable is not set, but is needed to run this evaluation.")

    try:
        client = OpenAI(api_key=api_key)

        # Append output instructions to system prompt
        output_instructions = f"""
Output Instructions:
Return your evaluation as a JSON object in the following format:
{{
    "reasoning": "Your brief reasoning for the score",
    "score": <integer between 0 and {rubric_max_score}>
}}

IMPORTANT:
- Do not use any markdown formatting (no ```json blocks)
- Do not include any additional text before or after the JSON
- Return only the raw JSON object
- The score must be an integer between 0 and {rubric_max_score}"""

        input_prompt = f"{prompt} {output_instructions}\nResponse to evaluate: {text}"

        # Run the chat completion 3 times and collect scores
        scores = []
        for i in range(3):
            max_retries = 5
            retry_count = 0

            while retry_count < max_retries:
                try:
                    response = client.chat.completions.create(
                        model="gpt-4o",
                        messages=[
                            {"role": "user", "content": input_prompt}
                        ],
                        temperature=0.9
                    )

                    # Extract and parse JSON from response
                    response_text = response.choices[0].message.content.strip()
                    try:
                        evaluation = json.loads(response_text)
                        score = float(evaluation.get("score", 0.0))
                        score = max(0.0, min(score, rubric_max_score))
                        scores.append(score)
                        print(f"Run {i+1} score: {score}")
                        break  # Successfully parsed, exit retry loop
                    except (json.JSONDecodeError, ValueError) as e:
                        retry_count += 1
                        print(f"Error parsing OpenAI response as JSON (attempt {retry_count}/{max_retries}): {str(e)}")
                        print(f"Response text: {response_text}")
                        if retry_count == max_retries:
                            raise ValueError(f"Failed to parse OpenAI evaluation response after {max_retries} attempts: {str(e)}")
                        print("Retrying...")
                        time.sleep(1)  # Wait 1 second before retrying
                        continue
                except Exception as e:
                    # For other exceptions (API errors, etc.), raise immediately
                    print(f"API error: {str(e)}")
                    raise

        # Count occurrences of each score
        score_counts = Counter(scores)

        # If there's no single most common score (all scores are different), run one more time
        if len(scores) == 3 and max(score_counts.values()) == 1:
            print("No majority score found. Running tie-breaker...")
            max_retries = 5
            retry_count = 0

            while retry_count < max_retries:
                try:
                    response = client.chat.completions.create(
                        model="gpt-4o",
                        messages=[
                            {"role": "user", "content": input_prompt}
                        ],
                        temperature=0.9
                    )

                    response_text = response.choices[0].message.content.strip()
                    try:
                        evaluation = json.loads(response_text)
                        score = float(evaluation.get("score", 0.0))
                        score = max(0.0, min(score, rubric_max_score))
                        scores.append(score)
                        print(f"Tie-breaker score: {score}")
                        score_counts = Counter(scores)
                        break  # Successfully parsed, exit retry loop
                    except (json.JSONDecodeError, ValueError) as e:
                        retry_count += 1
                        print(f"Error parsing tie-breaker response as JSON (attempt {retry_count}/{max_retries}): {str(e)}")
                        print(f"Response text: {response_text}")
                        if retry_count == max_retries:
                            raise ValueError(f"Failed to parse tie-breaker response after {max_retries} attempts: {str(e)}")
                        print("Retrying tie-breaker...")
                        time.sleep(1)  # Wait 1 second before retrying
                        continue
                except Exception as e:
                    # For other exceptions (API errors, etc.), raise immediately
                    print(f"API error in tie-breaker: {str(e)}")
                    raise

        # Get the most common score
        most_common_score = score_counts.most_common(1)[0][0]
        print(f"Most common score: {most_common_score} (occurred {score_counts[most_common_score]} times)")
        return most_common_score

    except Exception as e:
        if "OPENAI_API_KEY" in str(e):
            raise  # Re-raise API key errors
        print(f"Error evaluating with OpenAI: {str(e)}")
        raise ValueError(f"OpenAI evaluation failed: {str(e)}")


def load_eval_results(working_dir: Path) -> Dict[str, Any]:
    """Load the eval-results.json file from the working directory."""
    eval_results_path = working_dir / "eval-results.json"
    if not eval_results_path.exists():
        raise FileNotFoundError(f"eval-results.json not found in {working_dir}")

    with open(eval_results_path, 'r') as f:
        return json.load(f)


def load_output_file(working_dir: Path, output_file: str) -> str:
    """Load the output file to evaluate from the working directory."""
    output_path = working_dir / output_file
    if not output_path.exists():
        raise FileNotFoundError(f"Output file not found: {output_path}")

    with open(output_path, 'r') as f:
        return f.read().strip()


def load_evaluation_prompt(working_dir: Path) -> str:
    """Load the evaluation prompt from a file or use a default.

    This function looks for a prompt.txt file in the working directory.
    If not found, it returns a default evaluation prompt.
    """
    prompt_file = working_dir / "prompt.txt"
    if prompt_file.exists():
        with open(prompt_file, 'r') as f:
            return f.read().strip()

    # Default evaluation prompt
    return """You are an expert evaluator assessing the quality of AI responses.
Evaluate the response based on the following criteria:
- Accuracy and correctness
- Completeness of the answer
- Clarity and coherence
- Helpfulness to the user

Score the response on a scale from 0 to 2:
0 = Poor response (incorrect, incomplete, or unhelpful)
1 = Acceptable response (partially correct but with issues)
2 = Excellent response (correct, complete, and helpful)"""


def main():
    parser = argparse.ArgumentParser(description="LLM Judge post-processing script for Goose benchmarks")
    parser.add_argument("output_file", type=str, help="Name of the output file to evaluate (e.g., blog_summary_output.txt)")
    parser.add_argument("--rubric-max-score", type=int, default=2, help="Maximum score for the rubric (default: 2)")
    parser.add_argument("--prompt-file", type=str, help="Path to custom evaluation prompt file")

    args = parser.parse_args()

    # Use current working directory
    working_dir = Path.cwd()

    try:
        # Load eval results
        eval_results = load_eval_results(working_dir)

        # Load the output file to evaluate
        response_text = load_output_file(working_dir, args.output_file)

        # Load evaluation prompt
        if args.prompt_file:
            with open(args.prompt_file, 'r') as f:
                evaluation_prompt = f.read().strip()
        else:
            evaluation_prompt = load_evaluation_prompt(working_dir)

        # Evaluate with OpenAI
        score = evaluate_with_openai(evaluation_prompt, response_text, args.rubric_max_score)

        # Update eval results with the score
        eval_results["metrics"].append([
            "llm_judge_score",
            {"Float": score}
        ])

        # Save updated results
        eval_results_path = working_dir / "eval-results.json"
        with open(eval_results_path, 'w') as f:
            json.dump(eval_results, f, indent=2)

        print(f"Successfully updated eval-results.json with LLM judge score: {score}")

    except Exception as e:
        print(f"Error: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
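For reference, the scoring scheme in evaluate_with_openai is a small majority vote: three judge calls at temperature 0.9, plus one extra call as a tie-breaker when all three scores differ. The sketch below isolates that aggregation logic; the sample_score callable stands in for one OpenAI request plus JSON parse, and the helper name and structure are illustrative, not part of the committed script.

from collections import Counter
from typing import Callable, List


def majority_score(sample_score: Callable[[], float]) -> float:
    """Illustrative restatement of the vote used by evaluate_with_openai."""
    scores: List[float] = [sample_score() for _ in range(3)]
    counts = Counter(scores)

    # All three scores distinct: draw one extra sample as a tie-breaker.
    if max(counts.values()) == 1:
        scores.append(sample_score())
        counts = Counter(scores)

    # most_common(1) picks the highest count; if counts still tie,
    # the first score encountered wins.
    return counts.most_common(1)[0][0]


if __name__ == "__main__":
    # Deterministic fake judge for demonstration: yields 2, 1, 2.
    samples = iter([2.0, 1.0, 2.0, 1.0])
    print(majority_score(lambda: next(samples)))  # -> 2.0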
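Below is a minimal end-to-end sketch of how the script might be driven, assuming only what the code above relies on: an OPENAI_API_KEY in the environment, an eval-results.json with a top-level "metrics" list in the working directory, the output file to judge, and an optional prompt.txt rubric. The directory layout, file contents, and rubric text here are made up for illustration.

import json
import os
import subprocess
from pathlib import Path

workdir = Path("bench-run")  # hypothetical benchmark working directory
workdir.mkdir(exist_ok=True)

# Minimal eval-results.json: llm_judge.py only touches the top-level
# "metrics" list; any other fields produced by the benchmark are preserved.
(workdir / "eval-results.json").write_text(json.dumps({"metrics": []}, indent=2))

# The model output to be judged and an optional custom rubric.
(workdir / "blog_summary_output.txt").write_text("Example summary produced by the agent.")
(workdir / "prompt.txt").write_text(
    "Score the summary from 0 to 3 for factual accuracy and coverage."
)

# llm_judge.py reads everything from its current working directory.
assert os.environ.get("OPENAI_API_KEY"), "export OPENAI_API_KEY before running"
subprocess.run(
    ["python", "../scripts/bench-postprocess-scripts/llm-judges/llm_judge.py",
     "blog_summary_output.txt", "--rubric-max-score", "3"],
    cwd=workdir,
    check=True,
)

# On success the script appends ["llm_judge_score", {"Float": <score>}] to "metrics".
print(json.loads((workdir / "eval-results.json").read_text())["metrics"])

If the API key is missing, the output file is absent, or every judge call fails to return parseable JSON, the script prints the error and exits with status 1 without modifying eval-results.json.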