#!/usr/bin/env python3
# Compatible with Python 3.6+
"""
Generate a leaderboard CSV from benchmark results, including server error information.

This script:
1. Looks for model folders in the benchmark directory
2. Finds eval-results/aggregate_metrics.csv in each model folder
3. Extracts key metrics (provider, model_name, eval_suite, eval_name, tool calls, execution time, tokens, score, prompt error, server error)
4. Creates a union of all CSVs with these columns
5. Creates a leaderboard.csv grouping by provider and model_name, averaging numeric columns

Usage:
    python generate_leaderboard.py --benchmark-dir /path/to/benchmark-dir
"""

import argparse
import pandas as pd
from pathlib import Path
import sys


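# Expected layout (the <...> names are placeholders; "eval-results" and
# "aggregate_metrics.csv" are the literal names this script looks for):
#   <benchmark-dir>/
#       <model-folder>/
#           eval-results/
#               aggregate_metrics.csv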
def find_aggregate_metrics_files(benchmark_dir: Path) -> list:
    """Find all aggregate_metrics.csv files in model subdirectories."""
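    # Model folders that lack an eval-results/aggregate_metrics.csv are skipped silently.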
    csv_files = []

    # Look for model directories in the benchmark directory
    for model_dir in benchmark_dir.iterdir():
        if model_dir.is_dir():
            # Look for eval-results/aggregate_metrics.csv in each model directory
            eval_results_dir = model_dir / "eval-results"
            if eval_results_dir.exists() and eval_results_dir.is_dir():
                csv_path = eval_results_dir / "aggregate_metrics.csv"
                if csv_path.exists():
                    csv_files.append(csv_path)

    return csv_files


def process_csv_files(csv_files: list) -> tuple:
    """
    Process all CSV files and create two dataframes:
    1. A union of all CSVs with selected columns
    2. A leaderboard grouping by provider and model_name with averaged metrics
    """
    selected_columns = [
        'provider',
        'model_name',
        'eval_suite',
        'eval_name',
        'total_tool_calls_mean',
        'prompt_execution_time_mean',
        'total_tokens_mean',
        'score_mean',
        'prompt_error_mean',
        'server_error_mean'
    ]

    all_data = []

    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)

            # Check which selected columns are available
            missing_columns = [col for col in selected_columns if col not in df.columns]
            if missing_columns:
                print(f"Warning: {csv_file} is missing columns: {missing_columns}")

                # For missing columns, add them with NaN values
                for col in missing_columns:
                    df[col] = float('nan')

            # Select only the columns we care about
            df_subset = df[selected_columns].copy()  # Create a copy to avoid SettingWithCopyWarning

            # Add model folder name as additional context
            model_folder = csv_file.parent.parent.name
            df_subset['model_folder'] = model_folder

            all_data.append(df_subset)

        except Exception as e:
            print(f"Error processing {csv_file}: {str(e)}")

    if not all_data:
        raise ValueError("No valid CSV files found with required columns")

    # Concatenate all dataframes to create a union
    union_df = pd.concat(all_data, ignore_index=True)

    # Create leaderboard by grouping and averaging numerical columns
    numeric_columns = [
        'total_tool_calls_mean',
        'prompt_execution_time_mean',
        'total_tokens_mean',
        'score_mean',
        'prompt_error_mean',
        'server_error_mean'
    ]

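    # Note: pandas excludes NaN values when computing group means, so metrics that were
    # missing from a model's CSV (filled with NaN above) do not pull its averages down.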
    # Group by provider and model_name, then calculate averages for numeric columns
    leaderboard_df = union_df.groupby(['provider', 'model_name'])[numeric_columns].mean().reset_index()

    # Sort by score_mean in descending order (highest scores first)
    leaderboard_df = leaderboard_df.sort_values('score_mean', ascending=False)

    return union_df, leaderboard_df


def main():
    parser = argparse.ArgumentParser(
        description="Generate a leaderboard CSV from benchmark results, including server error information"
    )
    parser.add_argument(
        "--benchmark-dir",
        type=str,
        required=True,
        help="Path to the benchmark directory containing model subdirectories"
    )
    parser.add_argument(
        "--union-output",
        type=str,
        default="all_metrics.csv",
        help="Output filename for the union of all CSVs (default: all_metrics.csv)"
    )
    parser.add_argument(
        "--leaderboard-output",
        type=str,
        default="leaderboard.csv",
        help="Output filename for the leaderboard (default: leaderboard.csv)"
    )

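    # Example invocation (the directory path below is a placeholder):
    #   python generate_leaderboard.py --benchmark-dir ./benchmark-results \
    #       --union-output all_metrics.csv --leaderboard-output leaderboard.csv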
    args = parser.parse_args()

    benchmark_dir = Path(args.benchmark_dir)
    if not benchmark_dir.exists() or not benchmark_dir.is_dir():
        print(f"Error: Benchmark directory {benchmark_dir} does not exist or is not a directory")
        sys.exit(1)

    try:
        # Find all aggregate_metrics.csv files in model subdirectories
        csv_files = find_aggregate_metrics_files(benchmark_dir)

        if not csv_files:
            print(f"No aggregate_metrics.csv files found in any model directory under {benchmark_dir}")
            sys.exit(1)

        print(f"Found {len(csv_files)} aggregate_metrics.csv files in model directories")

        # Process and create the union and leaderboard dataframes
        union_df, leaderboard_df = process_csv_files(csv_files)

        # Save the union CSV to the benchmark directory
        union_output_path = benchmark_dir / args.union_output
        union_df.to_csv(union_output_path, index=False)
        print(f"Union CSV with all metrics saved to: {union_output_path}")

        # Save the leaderboard CSV to the benchmark directory
        leaderboard_output_path = benchmark_dir / args.leaderboard_output
        leaderboard_df.to_csv(leaderboard_output_path, index=False)
        print(f"Leaderboard CSV with averaged metrics saved to: {leaderboard_output_path}")

        # Print a summary of the leaderboard
        print("\nLeaderboard Summary:")
        pd.set_option('display.max_columns', None)  # Show all columns
        print(leaderboard_df.to_string(index=False))

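        # server_error_mean is assumed to be a fraction in [0, 1], which is why it is
        # multiplied by 100 below to report a percentage of affected evaluations.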
        # Highlight models with server errors
        if 'server_error_mean' in leaderboard_df.columns:
            models_with_errors = leaderboard_df[leaderboard_df['server_error_mean'] > 0]
            if not models_with_errors.empty:
                print("\nWARNING - Models with server errors detected:")
                for _, row in models_with_errors.iterrows():
                    print(f"  * {row['provider']} {row['model_name']} - {row['server_error_mean']*100:.1f}% of evaluations had server errors")
                print("\nThese models may need to be re-run to get accurate results.")

    except Exception as e:
        print(f"Error: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()