[feat] goosebenchv2 additions for eval post-processing (#2619)
Co-authored-by: Alice Hau <ahau@squareup.com>
scripts/bench-postprocess-scripts/generate_leaderboard.py (new executable file, 184 lines added)
@@ -0,0 +1,184 @@
#!/usr/bin/env python3
# Compatible with Python 3.6+
"""
Generate a leaderboard CSV from benchmark results, including server error information.

This script:
1. Looks for model folders in the benchmark directory
2. Finds eval-results/aggregate_metrics.csv in each model folder
3. Extracts key metrics (provider, model_name, eval_suite, eval_name, tool calls, execution time, tokens, score, prompt error, server error)
4. Creates a union of all CSVs with these columns
5. Creates a leaderboard.csv grouping by provider and model_name, averaging numeric columns

Usage:
    python generate_leaderboard.py --benchmark-dir /path/to/benchmark-dir
"""

import argparse
import pandas as pd
from pathlib import Path
import sys
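# A minimal sketch of the directory layout this script assumes (the model
# folder names themselves are arbitrary; only the nesting matters):
#
#   <benchmark-dir>/
#       <model-folder>/
#           eval-results/
#               aggregate_metrics.csv
#
# Each <model-folder>/eval-results/aggregate_metrics.csv that exists is pulled
# into the union CSV and the leaderboard, both written back to <benchmark-dir>.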

def find_aggregate_metrics_files(benchmark_dir: Path) -> list:
    """Find all aggregate_metrics.csv files in model subdirectories."""
    csv_files = []

    # Look for model directories in the benchmark directory
    for model_dir in benchmark_dir.iterdir():
        if model_dir.is_dir():
            # Look for eval-results/aggregate_metrics.csv in each model directory
            eval_results_dir = model_dir / "eval-results"
            if eval_results_dir.exists() and eval_results_dir.is_dir():
                csv_path = eval_results_dir / "aggregate_metrics.csv"
                if csv_path.exists():
                    csv_files.append(csv_path)

    return csv_files

def process_csv_files(csv_files: list) -> tuple:
    """
    Process all CSV files and create two dataframes:
    1. A union of all CSVs with selected columns
    2. A leaderboard grouping by provider and model_name with averaged metrics
    """
    selected_columns = [
        'provider',
        'model_name',
        'eval_suite',
        'eval_name',
        'total_tool_calls_mean',
        'prompt_execution_time_mean',
        'total_tokens_mean',
        'score_mean',
        'prompt_error_mean',
        'server_error_mean'
    ]

    all_data = []

    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)

            # Check which selected columns are available
            missing_columns = [col for col in selected_columns if col not in df.columns]
            if missing_columns:
                print(f"Warning: {csv_file} is missing columns: {missing_columns}")

                # For missing columns, add them with NaN values
                for col in missing_columns:
                    df[col] = float('nan')

            # Select only the columns we care about
            df_subset = df[selected_columns].copy()  # Create a copy to avoid SettingWithCopyWarning

            # Add model folder name as additional context
            model_folder = csv_file.parent.parent.name
            df_subset['model_folder'] = model_folder

            all_data.append(df_subset)

        except Exception as e:
            print(f"Error processing {csv_file}: {str(e)}")

    if not all_data:
        raise ValueError("No valid CSV files found with required columns")

    # Concatenate all dataframes to create a union
    union_df = pd.concat(all_data, ignore_index=True)

    # Create leaderboard by grouping and averaging numerical columns
    numeric_columns = [
        'total_tool_calls_mean',
        'prompt_execution_time_mean',
        'total_tokens_mean',
        'score_mean',
        'prompt_error_mean',
        'server_error_mean'
    ]

    # Group by provider and model_name, then calculate averages for numeric columns
    leaderboard_df = union_df.groupby(['provider', 'model_name'])[numeric_columns].mean().reset_index()

    # Sort by score_mean in descending order (highest scores first)
    leaderboard_df = leaderboard_df.sort_values('score_mean', ascending=False)

    return union_df, leaderboard_df
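# Illustrative sketch of the aggregation above (provider/model names and the
# numbers are made-up examples, not real results): two union rows such as
#   provider=openai, model_name=gpt-4o, eval_suite=core,    score_mean=0.90
#   provider=openai, model_name=gpt-4o, eval_suite=browser, score_mean=0.70
# collapse into a single leaderboard row with score_mean=0.80, i.e. a plain
# unweighted mean over that model's eval rows; the leaderboard is then sorted
# by score_mean in descending order.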

def main():
    parser = argparse.ArgumentParser(
        description="Generate a leaderboard CSV from benchmark results, including server error information"
    )
    parser.add_argument(
        "--benchmark-dir",
        type=str,
        required=True,
        help="Path to the benchmark directory containing model subdirectories"
    )
    parser.add_argument(
        "--union-output",
        type=str,
        default="all_metrics.csv",
        help="Output filename for the union of all CSVs (default: all_metrics.csv)"
    )
    parser.add_argument(
        "--leaderboard-output",
        type=str,
        default="leaderboard.csv",
        help="Output filename for the leaderboard (default: leaderboard.csv)"
    )

    args = parser.parse_args()

    benchmark_dir = Path(args.benchmark_dir)
    if not benchmark_dir.exists() or not benchmark_dir.is_dir():
        print(f"Error: Benchmark directory {benchmark_dir} does not exist or is not a directory")
        sys.exit(1)

    try:
        # Find all aggregate_metrics.csv files in model subdirectories
        csv_files = find_aggregate_metrics_files(benchmark_dir)

        if not csv_files:
            print(f"No aggregate_metrics.csv files found in any model directory under {benchmark_dir}")
            sys.exit(1)

        print(f"Found {len(csv_files)} aggregate_metrics.csv files in model directories")

        # Process and create the union and leaderboard dataframes
        union_df, leaderboard_df = process_csv_files(csv_files)

        # Save the union CSV to the benchmark directory
        union_output_path = benchmark_dir / args.union_output
        union_df.to_csv(union_output_path, index=False)
        print(f"Union CSV with all metrics saved to: {union_output_path}")

        # Save the leaderboard CSV to the benchmark directory
        leaderboard_output_path = benchmark_dir / args.leaderboard_output
        leaderboard_df.to_csv(leaderboard_output_path, index=False)
        print(f"Leaderboard CSV with averaged metrics saved to: {leaderboard_output_path}")

        # Print a summary of the leaderboard
        print("\nLeaderboard Summary:")
        pd.set_option('display.max_columns', None)  # Show all columns
        print(leaderboard_df.to_string(index=False))

        # Highlight models with server errors
        if 'server_error_mean' in leaderboard_df.columns:
            models_with_errors = leaderboard_df[leaderboard_df['server_error_mean'] > 0]
            if not models_with_errors.empty:
                print("\nWARNING - Models with server errors detected:")
                for _, row in models_with_errors.iterrows():
                    print(f"  * {row['provider']} {row['model_name']} - {row['server_error_mean']*100:.1f}% of evaluations had server errors")
                print("\nThese models may need to be re-run to get accurate results.")

    except Exception as e:
        print(f"Error: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
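Note on outputs (this just restates the defaults in the code above): running
    python generate_leaderboard.py --benchmark-dir /path/to/benchmark-dir
writes all_metrics.csv (the per-eval union across all model folders) and
leaderboard.csv (one row per provider/model_name pair, numeric columns averaged,
sorted by score_mean descending) into the benchmark directory; --union-output
and --leaderboard-output override the filenames.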