goose/scripts/bench-postprocess-scripts/generate_leaderboard.py

#!/usr/bin/env python3
# Compatible with Python 3.6+
"""
Generate a leaderboard CSV from benchmark results, including server error information.
This script:
1. Looks for model folders in the benchmark directory
2. Finds eval-results/aggregate_metrics.csv in each model folder
3. Extracts key metrics (provider, model_name, eval_suite, eval_name, tool calls, execution time, tokens, score, prompt error, server error)
4. Creates a union of all CSVs with these columns
5. Creates a leaderboard.csv grouping by provider and model_name, averaging numeric columns
Usage:
python generate_leaderboard.py --benchmark-dir /path/to/benchmark-dir
"""
import argparse
import sys
from pathlib import Path

import pandas as pd
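
# Directory layout this script expects, inferred from the lookup logic below
# (model folder names are simply whatever subdirectories exist in the benchmark
# directory; they are only recorded in the 'model_folder' column):
#
#   <benchmark-dir>/
#       <model-folder>/
#           eval-results/
#               aggregate_metrics.csv
#
# Each aggregate_metrics.csv should provide the columns listed in
# selected_columns inside process_csv_files(); any missing columns are
# filled with NaN and reported with a warning.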


def find_aggregate_metrics_files(benchmark_dir: Path) -> list:
    """Find all aggregate_metrics.csv files in model subdirectories."""
    csv_files = []
    # Look for model directories in the benchmark directory
    for model_dir in benchmark_dir.iterdir():
        if model_dir.is_dir():
            # Look for eval-results/aggregate_metrics.csv in each model directory
            eval_results_dir = model_dir / "eval-results"
            if eval_results_dir.exists() and eval_results_dir.is_dir():
                csv_path = eval_results_dir / "aggregate_metrics.csv"
                if csv_path.exists():
                    csv_files.append(csv_path)
    return csv_files


def process_csv_files(csv_files: list) -> tuple:
    """
    Process all CSV files and create two dataframes:

    1. A union of all CSVs with selected columns
    2. A leaderboard grouping by provider and model_name with averaged metrics
    """
    selected_columns = [
        'provider',
        'model_name',
        'eval_suite',
        'eval_name',
        'total_tool_calls_mean',
        'prompt_execution_time_mean',
        'total_tokens_mean',
        'score_mean',
        'prompt_error_mean',
        'server_error_mean'
    ]
    all_data = []
    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)
            # Check which selected columns are available
            missing_columns = [col for col in selected_columns if col not in df.columns]
            if missing_columns:
                print(f"Warning: {csv_file} is missing columns: {missing_columns}")
                # For missing columns, add them with NaN values
                for col in missing_columns:
                    df[col] = float('nan')
            # Select only the columns we care about
            df_subset = df[selected_columns].copy()  # Create a copy to avoid SettingWithCopyWarning
            # Add model folder name as additional context
            model_folder = csv_file.parent.parent.name
            df_subset['model_folder'] = model_folder
            all_data.append(df_subset)
        except Exception as e:
            print(f"Error processing {csv_file}: {str(e)}")
    if not all_data:
        raise ValueError("No valid CSV files found with required columns")
    # Concatenate all dataframes to create a union
    union_df = pd.concat(all_data, ignore_index=True)
    # Create leaderboard by grouping and averaging numeric columns
    numeric_columns = [
        'total_tool_calls_mean',
        'prompt_execution_time_mean',
        'total_tokens_mean',
        'score_mean',
        'prompt_error_mean',
        'server_error_mean'
    ]
    # Group by provider and model_name, then calculate averages for numeric columns
    leaderboard_df = union_df.groupby(['provider', 'model_name'])[numeric_columns].mean().reset_index()
    # Sort by score_mean in descending order (highest scores first)
    leaderboard_df = leaderboard_df.sort_values('score_mean', ascending=False)
    return union_df, leaderboard_df


def main():
    parser = argparse.ArgumentParser(
        description="Generate a leaderboard CSV from benchmark results, including server error information"
    )
    parser.add_argument(
        "--benchmark-dir",
        type=str,
        required=True,
        help="Path to the benchmark directory containing model subdirectories"
    )
    parser.add_argument(
        "--union-output",
        type=str,
        default="all_metrics.csv",
        help="Output filename for the union of all CSVs (default: all_metrics.csv)"
    )
    parser.add_argument(
        "--leaderboard-output",
        type=str,
        default="leaderboard.csv",
        help="Output filename for the leaderboard (default: leaderboard.csv)"
    )
    args = parser.parse_args()

    benchmark_dir = Path(args.benchmark_dir)
    if not benchmark_dir.exists() or not benchmark_dir.is_dir():
        print(f"Error: Benchmark directory {benchmark_dir} does not exist or is not a directory")
        sys.exit(1)

    try:
        # Find all aggregate_metrics.csv files in model subdirectories
        csv_files = find_aggregate_metrics_files(benchmark_dir)
        if not csv_files:
            print(f"No aggregate_metrics.csv files found in any model directory under {benchmark_dir}")
            sys.exit(1)
        print(f"Found {len(csv_files)} aggregate_metrics.csv files in model directories")

        # Process and create the union and leaderboard dataframes
        union_df, leaderboard_df = process_csv_files(csv_files)

        # Save the union CSV to the benchmark directory
        union_output_path = benchmark_dir / args.union_output
        union_df.to_csv(union_output_path, index=False)
        print(f"Union CSV with all metrics saved to: {union_output_path}")

        # Save the leaderboard CSV to the benchmark directory
        leaderboard_output_path = benchmark_dir / args.leaderboard_output
        leaderboard_df.to_csv(leaderboard_output_path, index=False)
        print(f"Leaderboard CSV with averaged metrics saved to: {leaderboard_output_path}")

        # Print a summary of the leaderboard
        print("\nLeaderboard Summary:")
        pd.set_option('display.max_columns', None)  # Show all columns
        print(leaderboard_df.to_string(index=False))

        # Highlight models with server errors
        if 'server_error_mean' in leaderboard_df.columns:
            models_with_errors = leaderboard_df[leaderboard_df['server_error_mean'] > 0]
            if not models_with_errors.empty:
                print("\nWARNING - Models with server errors detected:")
                for _, row in models_with_errors.iterrows():
                    print(f"  * {row['provider']} {row['model_name']} - {row['server_error_mean']*100:.1f}% of evaluations had server errors")
                print("\nThese models may need to be re-run to get accurate results.")
    except Exception as e:
        print(f"Error: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
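
# Example run (the benchmark directory path below is illustrative):
#
#   python generate_leaderboard.py --benchmark-dir ./benchmark-results
#
# Two CSVs are written into the benchmark directory:
#   * all_metrics.csv  - row-level union of every aggregate_metrics.csv found
#   * leaderboard.csv  - per provider/model_name averages, sorted by score_mean (descending)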