[feat] goosebenchv2 additions for eval post-processing (#2619)
Co-authored-by: Alice Hau <ahau@squareup.com>
scripts/bench-postprocess-scripts/generate_leaderboard.py (new executable file, 184 lines added)
@@ -0,0 +1,184 @@
#!/usr/bin/env python3
# Compatible with Python 3.6+
"""
Generate a leaderboard CSV from benchmark results, including server error information.

This script:
1. Looks for model folders in the benchmark directory
2. Finds eval-results/aggregate_metrics.csv in each model folder
3. Extracts key metrics (provider, model_name, eval_suite, eval_name, tool calls, execution time, tokens, score, prompt error, server error)
4. Creates a union of all CSVs with these columns
5. Creates a leaderboard.csv grouping by provider and model_name, averaging numeric columns

Usage:
    python generate_leaderboard.py --benchmark-dir /path/to/benchmark-dir
"""

import argparse
import pandas as pd
from pathlib import Path
import sys
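# A minimal sketch of the directory layout this script assumes (the model
# folder names themselves are arbitrary; only the nesting matters):
#
#   <benchmark-dir>/
#       <model-folder>/
#           eval-results/
#               aggregate_metrics.csv
#
# Each <model-folder>/eval-results/aggregate_metrics.csv that exists is pulled
# into the union CSV and the leaderboard, both written back to <benchmark-dir>.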

def find_aggregate_metrics_files(benchmark_dir: Path) -> list:
    """Find all aggregate_metrics.csv files in model subdirectories."""
    csv_files = []

    # Look for model directories in the benchmark directory
    for model_dir in benchmark_dir.iterdir():
        if model_dir.is_dir():
            # Look for eval-results/aggregate_metrics.csv in each model directory
            eval_results_dir = model_dir / "eval-results"
            if eval_results_dir.exists() and eval_results_dir.is_dir():
                csv_path = eval_results_dir / "aggregate_metrics.csv"
                if csv_path.exists():
                    csv_files.append(csv_path)

    return csv_files

def process_csv_files(csv_files: list) -> tuple:
    """
    Process all CSV files and create two dataframes:
    1. A union of all CSVs with selected columns
    2. A leaderboard grouping by provider and model_name with averaged metrics
    """
    selected_columns = [
        'provider',
        'model_name',
        'eval_suite',
        'eval_name',
        'total_tool_calls_mean',
        'prompt_execution_time_mean',
        'total_tokens_mean',
        'score_mean',
        'prompt_error_mean',
        'server_error_mean'
    ]

    all_data = []

    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)

            # Check which selected columns are available
            missing_columns = [col for col in selected_columns if col not in df.columns]
            if missing_columns:
                print(f"Warning: {csv_file} is missing columns: {missing_columns}")

                # For missing columns, add them with NaN values
                for col in missing_columns:
                    df[col] = float('nan')

            # Select only the columns we care about
            df_subset = df[selected_columns].copy()  # Create a copy to avoid SettingWithCopyWarning

            # Add model folder name as additional context
            model_folder = csv_file.parent.parent.name
            df_subset['model_folder'] = model_folder

            all_data.append(df_subset)

        except Exception as e:
            print(f"Error processing {csv_file}: {str(e)}")

    if not all_data:
        raise ValueError("No valid CSV files found with required columns")

    # Concatenate all dataframes to create a union
    union_df = pd.concat(all_data, ignore_index=True)

    # Create leaderboard by grouping and averaging numerical columns
    numeric_columns = [
        'total_tool_calls_mean',
        'prompt_execution_time_mean',
        'total_tokens_mean',
        'score_mean',
        'prompt_error_mean',
        'server_error_mean'
    ]

    # Group by provider and model_name, then calculate averages for numeric columns
    leaderboard_df = union_df.groupby(['provider', 'model_name'])[numeric_columns].mean().reset_index()

    # Sort by score_mean in descending order (highest scores first)
    leaderboard_df = leaderboard_df.sort_values('score_mean', ascending=False)

    return union_df, leaderboard_df
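# Illustrative sketch of the aggregation above (provider/model names and the
# numbers are made-up examples, not real results): two union rows such as
#   provider=openai, model_name=gpt-4o, eval_suite=core,    score_mean=0.90
#   provider=openai, model_name=gpt-4o, eval_suite=browser, score_mean=0.70
# collapse into a single leaderboard row with score_mean=0.80, i.e. a plain
# unweighted mean over that model's eval rows; the leaderboard is then sorted
# by score_mean in descending order.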

def main():
    parser = argparse.ArgumentParser(
        description="Generate a leaderboard CSV from benchmark results, including server error information"
    )
    parser.add_argument(
        "--benchmark-dir",
        type=str,
        required=True,
        help="Path to the benchmark directory containing model subdirectories"
    )
    parser.add_argument(
        "--union-output",
        type=str,
        default="all_metrics.csv",
        help="Output filename for the union of all CSVs (default: all_metrics.csv)"
    )
    parser.add_argument(
        "--leaderboard-output",
        type=str,
        default="leaderboard.csv",
        help="Output filename for the leaderboard (default: leaderboard.csv)"
    )

    args = parser.parse_args()

    benchmark_dir = Path(args.benchmark_dir)
    if not benchmark_dir.exists() or not benchmark_dir.is_dir():
        print(f"Error: Benchmark directory {benchmark_dir} does not exist or is not a directory")
        sys.exit(1)

    try:
        # Find all aggregate_metrics.csv files in model subdirectories
        csv_files = find_aggregate_metrics_files(benchmark_dir)

        if not csv_files:
            print(f"No aggregate_metrics.csv files found in any model directory under {benchmark_dir}")
            sys.exit(1)

        print(f"Found {len(csv_files)} aggregate_metrics.csv files in model directories")

        # Process and create the union and leaderboard dataframes
        union_df, leaderboard_df = process_csv_files(csv_files)

        # Save the union CSV to the benchmark directory
        union_output_path = benchmark_dir / args.union_output
        union_df.to_csv(union_output_path, index=False)
        print(f"Union CSV with all metrics saved to: {union_output_path}")

        # Save the leaderboard CSV to the benchmark directory
        leaderboard_output_path = benchmark_dir / args.leaderboard_output
        leaderboard_df.to_csv(leaderboard_output_path, index=False)
        print(f"Leaderboard CSV with averaged metrics saved to: {leaderboard_output_path}")

        # Print a summary of the leaderboard
        print("\nLeaderboard Summary:")
        pd.set_option('display.max_columns', None)  # Show all columns
        print(leaderboard_df.to_string(index=False))

        # Highlight models with server errors
        if 'server_error_mean' in leaderboard_df.columns:
            models_with_errors = leaderboard_df[leaderboard_df['server_error_mean'] > 0]
            if not models_with_errors.empty:
                print("\nWARNING - Models with server errors detected:")
                for _, row in models_with_errors.iterrows():
                    print(f"  * {row['provider']} {row['model_name']} - {row['server_error_mean']*100:.1f}% of evaluations had server errors")
                print("\nThese models may need to be re-run to get accurate results.")

    except Exception as e:
        print(f"Error: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
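Note on outputs (this just restates the defaults in the code above): running
    python generate_leaderboard.py --benchmark-dir /path/to/benchmark-dir
writes all_metrics.csv (the per-eval union across all model folders) and
leaderboard.csv (one row per provider/model_name pair, numeric columns averaged,
sorted by score_mean descending) into the benchmark directory; --union-output
and --leaderboard-output override the filenames.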