[feat] goosebenchv2 additions for eval post-processing (#2619)
Co-authored-by: Alice Hau <ahau@squareup.com>
7 Cargo.lock (generated)
@@ -1974,6 +1974,12 @@ version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f"

[[package]]
name = "dotenvy"
version = "0.15.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"

[[package]]
name = "downcast"
version = "0.11.0"
@@ -2578,6 +2584,7 @@ dependencies = [
 "async-trait",
 "chrono",
 "ctor",
 "dotenvy",
 "goose",
 "include_dir",
 "mcp-core",
@@ -25,6 +25,7 @@ include_dir = "0.7.4"
once_cell = "1.19"
regex = "1.11.1"
toml = "0.8.20"
dotenvy = "0.15.7"

[target.'cfg(target_os = "windows")'.dependencies]
winapi = { version = "0.3", features = ["wincred"] }
273 crates/goose-bench/README.md (new file)
@@ -0,0 +1,273 @@
# Goose Benchmarking Framework

The `goose-bench` crate provides a framework for benchmarking and evaluating LLMs with Goose. It helps quantify model performance across a range of tasks and generates structured reports.

## Features

- Run benchmark suites across multiple LLM models
- Execute evaluations in parallel when supported
- Generate structured JSON and CSV reports
- Process evaluation results with custom scripts
- Calculate aggregate metrics across evaluations
- Support for tool-shim evaluation
- Generate leaderboards and comparative metrics

## Prerequisites

- **Python Environment**: The `generate-leaderboard` command executes Python scripts and requires a working Python environment with the necessary dependencies (e.g., pandas)
- **OpenAI API Key**: For evaluations that use an LLM-as-judge (such as `blog_summary` and `restaurant_research`), the `OPENAI_API_KEY` environment variable must be set, as the judge uses OpenAI's GPT-4o model
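A quick way to confirm these prerequisites before kicking off a long run is a small check script. The snippet below is an illustrative sketch and is not shipped with goose-bench; it only verifies that the API key is exported and that pandas can be imported.

```python
#!/usr/bin/env python3
"""Sanity-check goose-bench prerequisites (illustrative sketch, not part of the crate)."""
import importlib.util
import os
import sys

missing = []

# LLM-as-judge evaluations call the OpenAI API, so the key must be exported.
if not os.getenv("OPENAI_API_KEY"):
    missing.append("OPENAI_API_KEY environment variable is not set")

# The leaderboard scripts rely on pandas.
if importlib.util.find_spec("pandas") is None:
    missing.append("pandas is not installed (pip install pandas)")

if missing:
    print("Prerequisite check failed:")
    for item in missing:
        print(f"  - {item}")
    sys.exit(1)

print("All goose-bench prerequisites look good.")
```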
## Benchmark Workflow

Running benchmarks is a two-step process:

### Step 1: Run Benchmarks

First, run the benchmark evaluations with your configuration:

```bash
goose bench run --config /path/to/your-config.json
```

This will execute all evaluations for all models specified in your configuration and create a benchmark directory with results.

### Step 2: Generate Leaderboard

After the benchmarks complete, generate the leaderboard and aggregated metrics:

```bash
goose bench generate-leaderboard --benchmark-dir /path/to/benchmark-output-directory
```

The benchmark directory path will be shown in the output of the previous command, typically in the format `benchmark-YYYY-MM-DD-HH:MM:SS`.

**Note**: This command requires a valid Python environment as it executes Python scripts for data aggregation and leaderboard generation.

## Configuration

Benchmark configuration is provided through a JSON file. Here's a sample configuration file (leaderboard-config.json) that you can use as a template:

```json
{
  "models": [
    {
      "provider": "databricks",
      "name": "gpt-4-1-mini",
      "parallel_safe": true,
      "tool_shim": {
        "use_tool_shim": false,
        "tool_shim_model": null
      }
    },
    {
      "provider": "databricks",
      "name": "claude-3-5-sonnet",
      "parallel_safe": true,
      "tool_shim": null
    },
    {
      "provider": "databricks",
      "name": "gpt-4o",
      "parallel_safe": true,
      "tool_shim": null
    }
  ],
  "evals": [
    {
      "selector": "core:developer",
      "post_process_cmd": null,
      "parallel_safe": true
    },
    {
      "selector": "core:developer_search_replace",
      "post_process_cmd": null,
      "parallel_safe": true
    },
    {
      "selector": "vibes:blog_summary",
      "post_process_cmd": "/Users/ahau/Development/goose-1.0/goose/scripts/bench-postprocess-scripts/llm-judges/run_vibes_judge.sh",
      "parallel_safe": true
    },
    {
      "selector": "vibes:restaurant_research",
      "post_process_cmd": "/Users/ahau/Development/goose-1.0/goose/scripts/bench-postprocess-scripts/llm-judges/run_vibes_judge.sh",
      "parallel_safe": true
    }
  ],
  "include_dirs": [],
  "repeat": 3,
  "run_id": null,
  "output_dir": "/path/to/output/directory",
  "eval_result_filename": "eval-results.json",
  "run_summary_filename": "run-results-summary.json",
  "env_file": "/path/to/.goosebench.env"
}
```

## Configuration Options

### Models

- `provider`: The LLM provider (e.g., "databricks", "openai")
- `name`: The model name
- `parallel_safe`: Whether the model can be run in parallel
- `tool_shim`: Configuration for tool-shim support
  - `use_tool_shim`: Whether to use tool-shim
  - `tool_shim_model`: Optional custom model for tool-shim

### Evaluations

- `selector`: The evaluation selector in format `suite:evaluation`
- `post_process_cmd`: Optional path to a post-processing script
- `parallel_safe`: Whether the evaluation can be run in parallel

### Global Configuration

- `include_dirs`: Additional directories to include in the benchmark environment
- `repeat`: Number of times to repeat evaluations (for statistical significance)
- `run_id`: Optional identifier for the run (defaults to timestamp)
- `output_dir`: Directory to store benchmark results (must be absolute path)
- `eval_result_filename`: Filename for individual evaluation results
- `run_summary_filename`: Filename for run summary
- `env_file`: Optional path to environment variables file
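Before launching a long run, it can help to sanity-check the config file against these expectations. The snippet below is a minimal sketch (not part of goose-bench); the field names simply mirror the sample configuration above, and the checks are only illustrative.

```python
#!/usr/bin/env python3
"""Lightweight pre-flight check for a goose-bench config (sketch; fields follow the sample above)."""
import json
import sys
from pathlib import Path

config_path = Path(sys.argv[1])
config = json.loads(config_path.read_text())

problems = []
if not config.get("models"):
    problems.append("no models specified")
if not config.get("evals"):
    problems.append("no evals specified")

# output_dir is documented as requiring an absolute path.
output_dir = config.get("output_dir", "")
if not Path(output_dir).is_absolute():
    problems.append(f"output_dir must be an absolute path, got: {output_dir!r}")

# Post-process commands are executed directly, so the paths should exist.
for eval_cfg in config.get("evals", []):
    cmd = eval_cfg.get("post_process_cmd")
    if cmd and not Path(cmd).exists():
        problems.append(f"post_process_cmd not found: {cmd}")

if problems:
    print("Config issues:\n  - " + "\n  - ".join(problems))
    sys.exit(1)
print(f"Config looks OK: {config_path}")
```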
## Environment Variables

You can provide environment variables through the `env_file` configuration option. This is useful for provider API keys and other sensitive information. Example `.goosebench.env` file:

```bash
OPENAI_API_KEY=your_openai_api_key_here
DATABRICKS_TOKEN=your_databricks_token_here
# Add other environment variables as needed
```

**Important**: For evaluations that use an LLM-as-judge (such as `blog_summary` and `restaurant_research`), you must set `OPENAI_API_KEY`, because the judging system uses OpenAI's GPT-4o model.

## Post-Processing

You can specify post-processing commands for evaluations; each command is executed after its evaluation completes and receives the path to the evaluation results file as its first argument.

For example, the `run_vibes_judge.sh` script processes outputs from the `blog_summary` and `restaurant_research` evaluations, using LLM-based judging to assign scores. A minimal post-process command is sketched below.
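The sketch below shows the shape of a custom post-process command: goose-bench passes the results file path as the first argument, and the bundled judge scripts append metrics as `[name, value]` pairs whose value variants mirror `EvalMetricValue` (for example `{"Float": ...}` or `{"Boolean": ...}`). This is an illustrative example, not a script shipped with the crate; the metric name used here is made up.

```python
#!/usr/bin/env python3
"""Minimal post-process command (sketch): append a custom metric to the results file."""
import json
import sys
from pathlib import Path

results_path = Path(sys.argv[1])  # goose-bench passes the eval results file here
results = json.loads(results_path.read_text())

# Metrics are stored as [name, value] pairs; the value encoding follows the
# bundled judge scripts (e.g. {"Float": ...}, {"Integer": ...}, {"Boolean": ...}).
results["metrics"].append(["custom_post_process_ran", {"Boolean": True}])

results_path.write_text(json.dumps(results, indent=2))
print(f"Added custom metric to {results_path}")
```

Because the command is spawned directly, the script must be executable (for example `chmod +x` with a shebang line, as above).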
## Output Structure

Results are organized in a directory structure that follows this pattern:

```
{benchmark_dir}/
├── config.cfg                          # Configuration used for the benchmark
├── {provider}-{model}/
│   ├── eval-results/
│   │   └── aggregate_metrics.csv       # Aggregated metrics for this model
│   └── run-{run_id}/
│       ├── {suite}/
│       │   └── {evaluation}/
│       │       ├── eval-results.json   # Individual evaluation results
│       │       ├── {eval_name}.jsonl   # Session logs
│       │       └── work_dir.json       # Info about evaluation working dir
│       └── run-results-summary.json    # Summary of all evaluations in this run
├── leaderboard.csv                     # Final leaderboard comparing all models
└── all_metrics.csv                     # Union of all metrics across all models
```

### Output Files Explained

#### Per-Model Files

- **`eval-results/aggregate_metrics.csv`**: Contains aggregated metrics for each evaluation, averaged across all runs. Includes metrics like `score_mean`, `total_tokens_mean`, `prompt_execution_time_seconds_mean`, etc.

#### Global Output Files

- **`leaderboard.csv`**: Final leaderboard ranking all models by their average performance across evaluations. Contains columns like:
  - `provider`, `model_name`: Model identification
  - `avg_score_mean`: Average score across all evaluations
  - `avg_prompt_execution_time_seconds_mean`: Average execution time
  - `avg_total_tool_calls_mean`: Average number of tool calls
  - `avg_total_tokens_mean`: Average token usage

- **`all_metrics.csv`**: Comprehensive dataset containing detailed metrics for every model-evaluation combination. This is a union of all individual model metrics, useful for detailed analysis and custom reporting.

Each model gets its own directory, containing run results and aggregated CSV files for analysis. The `generate-leaderboard` command processes all individual evaluation results and creates the comparative metrics files.
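For custom reporting, the leaderboard CSV can be loaded directly with pandas. The snippet below is only a sketch: column names may differ slightly between versions of the leaderboard script, so it picks whichever score column is present before sorting.

```python
#!/usr/bin/env python3
"""Inspect a generated leaderboard.csv (sketch; column names follow the list above)."""
import sys
from pathlib import Path

import pandas as pd

benchmark_dir = Path(sys.argv[1])
leaderboard = pd.read_csv(benchmark_dir / "leaderboard.csv")

# Sort by the average-score column if one of the expected names is present.
candidates = [c for c in ("avg_score_mean", "score_mean") if c in leaderboard.columns]
if candidates:
    leaderboard = leaderboard.sort_values(candidates[0], ascending=False)

print(leaderboard.to_string(index=False))
```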
## Error Handling and Troubleshooting

**Important**: The current version of goose-bench does not have robust error handling for common issues that can occur during evaluation runs, such as:

- Rate limiting from inference providers
- Network timeouts or connection errors
- Provider API errors that cause early session termination
- Resource exhaustion or memory issues

### Checking for Failed Evaluations

After running benchmarks, inspect the generated metrics files to identify any evaluations that may have failed or terminated early (a scripted version of these checks is sketched after this list):

1. **Check the `aggregate_metrics.csv` files** in each model's `eval-results/` directory for:
   - Missing evaluations (fewer rows than expected)
   - Unusually low scores or metrics
   - Zero or near-zero execution times
   - Missing or NaN values

2. **Look for the `server_error_mean` column** in the aggregate metrics; values greater than 0 indicate that server errors occurred during evaluation

3. **Review session logs** (`.jsonl` files) in individual evaluation directories for error messages like:
   - "Server error"
   - "Rate limit exceeded"
   - "TEMPORARILY_UNAVAILABLE"
   - Unexpected session terminations
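The check on `server_error_mean` can be automated across all model directories. This is a sketch that mirrors the directory layout described above (it is not part of goose-bench, and assumes the aggregate metrics have already been prepared).

```python
#!/usr/bin/env python3
"""Scan every model's aggregate metrics for server errors (illustrative sketch)."""
import sys
from pathlib import Path

import pandas as pd

benchmark_dir = Path(sys.argv[1])
for csv_path in benchmark_dir.glob("*/eval-results/aggregate_metrics.csv"):
    df = pd.read_csv(csv_path)
    if "server_error_mean" not in df.columns:
        continue
    flagged = df[df["server_error_mean"] > 0]
    for _, row in flagged.iterrows():
        model_folder = csv_path.parent.parent.name
        print(f"{model_folder}: {row.get('eval_name', '?')} had server errors "
              f"({row['server_error_mean'] * 100:.0f}% of runs)")
```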
### Re-running Failed Evaluations

If you identify failed evaluations, you may need to:

1. **Adjust rate limiting**: Add delays between requests or reduce parallel execution
2. **Update environment variables**: Ensure API keys and tokens are valid
3. **Re-run specific model/evaluation combinations**: Create a new config with only the failed combinations
4. **Check provider status**: Verify the inference provider is operational

Example of creating a config to re-run failed evaluations:

```json
{
  "models": [
    {
      "provider": "databricks",
      "name": "claude-3-5-sonnet",
      "parallel_safe": false
    }
  ],
  "evals": [
    {
      "selector": "vibes:blog_summary",
      "post_process_cmd": "/path/to/scripts/bench-postprocess-scripts/llm-judges/run_vibes_judge.sh",
      "parallel_safe": false
    }
  ],
  "repeat": 1,
  "output_dir": "/path/to/retry-benchmark"
}
```

We recommend monitoring evaluation progress and checking for errors regularly, especially when running large benchmark suites across multiple models.

## Available Commands

### List Evaluations

```bash
goose bench selectors --config /path/to/config.json
```

### Generate Initial Config

```bash
goose bench init-config --name my-benchmark-config.json
```

### Run Benchmarks

```bash
goose bench run --config /path/to/config.json
```

### Generate Leaderboard

```bash
goose bench generate-leaderboard --benchmark-dir /path/to/benchmark-output
```
@@ -3,17 +3,29 @@ use crate::bench_work_dir::BenchmarkWorkDir;
use anyhow::Result;
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::fmt;

pub type Model = (String, String);
pub type Extension = String;

#[derive(Debug, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize, Clone)]
pub enum EvalMetricValue {
    Integer(i64),
    Float(f64),
    String(String),
    Boolean(bool),
}

impl fmt::Display for EvalMetricValue {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            EvalMetricValue::Integer(i) => write!(f, "{}", i),
            EvalMetricValue::Float(fl) => write!(f, "{:.2}", fl),
            EvalMetricValue::String(s) => write!(f, "{}", s),
            EvalMetricValue::Boolean(b) => write!(f, "{}", b),
        }
    }
}
#[derive(Debug, Serialize)]
pub struct EvalMetric {
    pub name: String,
@@ -98,17 +98,6 @@ impl BenchmarkResults {
    }
}

impl fmt::Display for EvalMetricValue {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            EvalMetricValue::Integer(i) => write!(f, "{}", i),
            EvalMetricValue::Float(fl) => write!(f, "{:.2}", fl),
            EvalMetricValue::String(s) => write!(f, "{}", s),
            EvalMetricValue::Boolean(b) => write!(f, "{}", b),
        }
    }
}

impl fmt::Display for BenchmarkResults {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "Benchmark Results")?;
@@ -4,12 +4,14 @@ use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{EvaluationSuite, ExtensionRequirements};
use crate::reporting::EvaluationResult;
use crate::utilities::await_process_exits;
use anyhow::{bail, Context, Result};
use std::env;
use std::fs;
use std::future::Future;
use std::path::PathBuf;
use std::process::Command;
use std::time::{SystemTime, UNIX_EPOCH};
use tracing;

#[derive(Clone)]
pub struct EvalRunner {
@@ -17,13 +19,17 @@ pub struct EvalRunner {
}

impl EvalRunner {
    pub fn from(config: String) -> anyhow::Result<EvalRunner> {
        let config = BenchRunConfig::from_string(config)?;
    pub fn from(config: String) -> Result<EvalRunner> {
        let config = BenchRunConfig::from_string(config)
            .context("Failed to parse evaluation configuration")?;
        Ok(EvalRunner { config })
    }

    fn create_work_dir(&self, config: &BenchRunConfig) -> anyhow::Result<BenchmarkWorkDir> {
        let goose_model = config.models.first().unwrap();
    fn create_work_dir(&self, config: &BenchRunConfig) -> Result<BenchmarkWorkDir> {
        let goose_model = config
            .models
            .first()
            .context("No model specified in configuration")?;
        let model_name = goose_model.name.clone();
        let provider_name = goose_model.provider.clone();

@@ -48,13 +54,21 @@ impl EvalRunner {
        let work_dir = BenchmarkWorkDir::new(work_dir_name, include_dir);
        Ok(work_dir)
    }
    pub async fn run<F, Fut>(&mut self, agent_generator: F) -> anyhow::Result<()>

    pub async fn run<F, Fut>(&mut self, agent_generator: F) -> Result<()>
    where
        F: Fn(ExtensionRequirements, String) -> Fut,
        Fut: Future<Output = BenchAgent> + Send,
    {
        let mut work_dir = self.create_work_dir(&self.config)?;
        let bench_eval = self.config.evals.first().unwrap();
        let mut work_dir = self
            .create_work_dir(&self.config)
            .context("Failed to create evaluation work directory")?;

        let bench_eval = self
            .config
            .evals
            .first()
            .context("No evaluations specified in configuration")?;

        let run_id = &self
            .config
@@ -65,41 +79,89 @@ impl EvalRunner {

        // create entire dir subtree for eval and cd into dir for running eval
        work_dir.set_eval(&bench_eval.selector, run_id);
        tracing::info!("Set evaluation directory for {}", bench_eval.selector);

        if let Some(eval) = EvaluationSuite::from(&bench_eval.selector) {
            let now_stamp = SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos();
            let now_stamp = SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .context("Failed to get current timestamp")?
                .as_nanos();

            let session_id = format!("{}-{}", bench_eval.selector.clone(), now_stamp);
            let mut agent = agent_generator(eval.required_extensions(), session_id).await;
            tracing::info!("Agent created for {}", eval.name());

            let mut result = EvaluationResult::new(eval.name().to_string());

            if let Ok(metrics) = eval.run(&mut agent, &mut work_dir).await {
            match eval.run(&mut agent, &mut work_dir).await {
                Ok(metrics) => {
                    tracing::info!("Evaluation run successful with {} metrics", metrics.len());
                    for (name, metric) in metrics {
                        result.add_metric(name, metric);
                    }
                }
                Err(e) => {
                    tracing::error!("Evaluation run failed: {}", e);
                }
            }

            // Add any errors that occurred
            for error in agent.get_errors().await {
            let errors = agent.get_errors().await;
            tracing::info!("Agent reported {} errors", errors.len());
            for error in errors {
                result.add_error(error);
            }
        }

            let eval_results = serde_json::to_string_pretty(&result)?;
            // Write results to file
            let eval_results = serde_json::to_string_pretty(&result)
                .context("Failed to serialize evaluation results to JSON")?;

            let eval_results_file = env::current_dir()
                .context("Failed to get current directory")?
                .join(&self.config.eval_result_filename);

            fs::write(&eval_results_file, &eval_results).with_context(|| {
                format!(
                    "Failed to write evaluation results to {}",
                    eval_results_file.display()
                )
            })?;

            tracing::info!(
                "Wrote evaluation results to {}",
                eval_results_file.display()
            );

            let eval_results_file = env::current_dir()?.join(&self.config.eval_result_filename);
            fs::write(&eval_results_file, &eval_results)?;
            self.config.save("config.cfg".to_string());
            work_dir.save();

            // handle running post-process cmd if configured
            if let Some(cmd) = &bench_eval.post_process_cmd {
                let handle = Command::new(cmd).arg(&eval_results_file).spawn()?;
                tracing::info!("Running post-process command: {:?}", cmd);

                let handle = Command::new(cmd)
                    .arg(&eval_results_file)
                    .spawn()
                    .with_context(|| {
                        format!("Failed to execute post-process command: {:?}", cmd)
                    })?;

                await_process_exits(&mut [handle], Vec::new());
            }

            // copy session file into eval-dir
            let here = env::current_dir()?.canonicalize()?;
            BenchmarkWorkDir::deep_copy(agent.session_file().as_path(), here.as_path(), false)?;
            let here = env::current_dir()
                .context("Failed to get current directory")?
                .canonicalize()
                .context("Failed to canonicalize current directory path")?;

            BenchmarkWorkDir::deep_copy(agent.session_file().as_path(), here.as_path(), false)
                .context("Failed to copy session file to evaluation directory")?;

            tracing::info!("Evaluation completed successfully");
        } else {
            tracing::error!("No evaluation found for selector: {}", bench_eval.selector);
            bail!("No evaluation found for selector: {}", bench_eval.selector);
        }

        Ok(())
81 crates/goose-bench/src/runners/metric_aggregator.rs (new file)
@@ -0,0 +1,81 @@
use anyhow::{bail, ensure, Context, Result};
use std::path::PathBuf;
use tracing;

pub struct MetricAggregator;

impl MetricAggregator {
    /// Generate leaderboard and aggregated metrics CSV files from benchmark directory
    pub fn generate_csv_from_benchmark_dir(benchmark_dir: &PathBuf) -> Result<()> {
        use std::process::Command;

        // Step 1: Run prepare_aggregate_metrics.py to create aggregate_metrics.csv files
        let prepare_script_path = std::env::current_dir()
            .context("Failed to get current working directory")?
            .join("scripts")
            .join("bench-postprocess-scripts")
            .join("prepare_aggregate_metrics.py");

        ensure!(
            prepare_script_path.exists(),
            "Prepare script not found: {}",
            prepare_script_path.display()
        );

        tracing::info!(
            "Preparing aggregate metrics from benchmark directory: {}",
            benchmark_dir.display()
        );

        let output = Command::new(&prepare_script_path)
            .arg("--benchmark-dir")
            .arg(benchmark_dir)
            .output()
            .context("Failed to execute prepare_aggregate_metrics.py script")?;

        if !output.status.success() {
            let error_message = String::from_utf8_lossy(&output.stderr);
            bail!("Failed to prepare aggregate metrics: {}", error_message);
        }

        let success_message = String::from_utf8_lossy(&output.stdout);
        tracing::info!("{}", success_message);

        // Step 2: Run generate_leaderboard.py to create the final leaderboard
        let leaderboard_script_path = std::env::current_dir()
            .context("Failed to get current working directory")?
            .join("scripts")
            .join("bench-postprocess-scripts")
            .join("generate_leaderboard.py");

        ensure!(
            leaderboard_script_path.exists(),
            "Leaderboard script not found: {}",
            leaderboard_script_path.display()
        );

        tracing::info!(
            "Generating leaderboard from benchmark directory: {}",
            benchmark_dir.display()
        );

        let output = Command::new(&leaderboard_script_path)
            .arg("--benchmark-dir")
            .arg(benchmark_dir)
            .arg("--leaderboard-output")
            .arg("leaderboard.csv")
            .arg("--union-output")
            .arg("all_metrics.csv")
            .output()
            .context("Failed to execute generate_leaderboard.py script")?;

        if !output.status.success() {
            let error_message = String::from_utf8_lossy(&output.stderr);
            bail!("Failed to generate leaderboard: {}", error_message);
        }

        let success_message = String::from_utf8_lossy(&output.stdout);
        tracing::info!("{}", success_message);
        Ok(())
    }
}
@@ -1,3 +1,4 @@
pub mod bench_runner;
pub mod eval_runner;
pub mod metric_aggregator;
pub mod model_runner;
@@ -3,12 +3,14 @@ use crate::eval_suites::EvaluationSuite;
use crate::reporting::{BenchmarkResults, SuiteResult};
use crate::runners::eval_runner::EvalRunner;
use crate::utilities::{await_process_exits, parallel_bench_cmd};
use anyhow::{Context, Result};
use dotenvy::from_path_iter;
use std::collections::HashMap;
use std::fs::read_to_string;
use std::io::{self, BufRead};
use std::path::PathBuf;
use std::process::Child;
use std::thread;
use tracing;

#[derive(Clone)]
pub struct ModelRunner {
@@ -16,23 +18,27 @@ pub struct ModelRunner {
}

impl ModelRunner {
    pub fn from(config: String) -> anyhow::Result<ModelRunner> {
        let config = BenchRunConfig::from_string(config)?;
    pub fn from(config: String) -> Result<ModelRunner> {
        let config =
            BenchRunConfig::from_string(config).context("Failed to parse configuration")?;
        Ok(ModelRunner { config })
    }

    pub fn run(&self) -> anyhow::Result<()> {
        let model = self.config.models.first().unwrap();
    pub fn run(&self) -> Result<()> {
        let model = self
            .config
            .models
            .first()
            .context("No model specified in config")?;
        let suites = self.collect_evals_for_run();

        let mut handles = vec![];

        for i in 0..self.config.repeat.unwrap_or(1) {
            let mut self_copy = self.clone();
            let self_copy = self.clone();
            let model_clone = model.clone();
            let suites_clone = suites.clone();
            // create thread to handle launching parallel processes to run model's evals in parallel
            let handle = thread::spawn(move || {
            let handle = thread::spawn(move || -> Result<()> {
                self_copy.run_benchmark(&model_clone, suites_clone, i.to_string())
            });
            handles.push(handle);
@@ -41,55 +47,32 @@ impl ModelRunner {

        let mut all_runs_results: Vec<BenchmarkResults> = Vec::new();
        for i in 0..self.config.repeat.unwrap_or(1) {
            let run_results =
                self.collect_run_results(model.clone(), suites.clone(), i.to_string())?;
            all_runs_results.push(run_results);
            match self.collect_run_results(model.clone(), suites.clone(), i.to_string()) {
                Ok(run_results) => all_runs_results.push(run_results),
                Err(e) => {
                    tracing::error!("Failed to collect results for run {}: {}", i, e)
                }
            }
        }
        // write summary file

        Ok(())
    }

    fn load_env_file(&self, path: &PathBuf) -> anyhow::Result<Vec<(String, String)>> {
        let file = std::fs::File::open(path)?;
        let reader = io::BufReader::new(file);
        let mut env_vars = Vec::new();

        for line in reader.lines() {
            let line = line?;
            // Skip empty lines and comments
            if line.trim().is_empty() || line.trim_start().starts_with('#') {
                continue;
            }

            // Split on first '=' only
            if let Some((key, value)) = line.split_once('=') {
                let key = key.trim().to_string();
                // Remove quotes if present
                let value = value
                    .trim()
                    .trim_matches('"')
                    .trim_matches('\'')
                    .to_string();
                env_vars.push((key, value));
            }
        }

        Ok(env_vars)
    }

    fn run_benchmark(
        &mut self,
        &self,
        model: &BenchModel,
        suites: HashMap<String, Vec<BenchEval>>,
        run_id: String,
    ) -> anyhow::Result<()> {
    ) -> Result<()> {
        let mut results_handles = HashMap::<String, Vec<Child>>::new();

        // Load environment variables from file if specified
        let mut envs = self.toolshim_envs();
        if let Some(env_file) = &self.config.env_file {
            let env_vars = self.load_env_file(env_file)?;
            let env_vars = ModelRunner::load_env_file(env_file).context(format!(
                "Failed to load environment file: {}",
                env_file.display()
            ))?;
            envs.extend(env_vars);
        }
        envs.push(("GOOSE_MODEL".to_string(), model.clone().name));
@@ -116,9 +99,13 @@ impl ModelRunner {
        // Run parallel-safe evaluations in parallel
        if !parallel_evals.is_empty() {
            for eval_selector in &parallel_evals {
                self.config.run_id = Some(run_id.clone());
                self.config.evals = vec![(*eval_selector).clone()];
                let cfg = self.config.to_string()?;
                let mut config_copy = self.config.clone();
                config_copy.run_id = Some(run_id.clone());
                config_copy.evals = vec![(*eval_selector).clone()];
                let cfg = config_copy
                    .to_string()
                    .context("Failed to serialize configuration")?;

                let handle = parallel_bench_cmd("exec-eval".to_string(), cfg, envs.clone());
                results_handles.get_mut(suite).unwrap().push(handle);
            }
@@ -126,9 +113,13 @@ impl ModelRunner {

        // Run non-parallel-safe evaluations sequentially
        for eval_selector in &sequential_evals {
            self.config.run_id = Some(run_id.clone());
            self.config.evals = vec![(*eval_selector).clone()];
            let cfg = self.config.to_string()?;
            let mut config_copy = self.config.clone();
            config_copy.run_id = Some(run_id.clone());
            config_copy.evals = vec![(*eval_selector).clone()];
            let cfg = config_copy
                .to_string()
                .context("Failed to serialize configuration")?;

            let handle = parallel_bench_cmd("exec-eval".to_string(), cfg, envs.clone());

            // Wait for this process to complete before starting the next one
@@ -150,7 +141,7 @@ impl ModelRunner {
        model: BenchModel,
        suites: HashMap<String, Vec<BenchEval>>,
        run_id: String,
    ) -> anyhow::Result<BenchmarkResults> {
    ) -> Result<BenchmarkResults> {
        let mut results = BenchmarkResults::new(model.provider.clone());

        let mut summary_path: Option<PathBuf> = None;
@@ -161,7 +152,17 @@ impl ModelRunner {
                let mut eval_path =
                    EvalRunner::path_for_eval(&model, eval_selector, run_id.clone());
                eval_path.push(self.config.eval_result_filename.clone());
                let eval_result = serde_json::from_str(&read_to_string(&eval_path)?)?;

                let content = read_to_string(&eval_path).with_context(|| {
                    format!(
                        "Failed to read evaluation results from {}",
                        eval_path.display()
                    )
                })?;

                let eval_result = serde_json::from_str(&content)
                    .context("Failed to parse evaluation results JSON")?;

                suite_result.add_evaluation(eval_result);

                // use current eval to determine where the summary should be written
@@ -180,12 +181,21 @@ impl ModelRunner {
            results.add_suite(suite_result);
        }

        if let Some(path) = summary_path {
            let mut run_summary = PathBuf::new();
            run_summary.push(summary_path.clone().unwrap());
            run_summary.push(path);
            run_summary.push(&self.config.run_summary_filename);

            let output_str = serde_json::to_string_pretty(&results)?;
            std::fs::write(run_summary, &output_str)?;
            let output_str = serde_json::to_string_pretty(&results)
                .context("Failed to serialize benchmark results to JSON")?;

            std::fs::write(&run_summary, &output_str).with_context(|| {
                format!(
                    "Failed to write results summary to {}",
                    run_summary.display()
                )
            })?;
        }

        Ok(results)
    }
@@ -210,9 +220,8 @@ impl ModelRunner {

    fn toolshim_envs(&self) -> Vec<(String, String)> {
        // read tool-shim preference from config, set respective env vars accordingly
        let model = self.config.models.first().unwrap();

        let mut shim_envs: Vec<(String, String)> = Vec::new();
        if let Some(model) = self.config.models.first() {
            if let Some(shim_opt) = &model.tool_shim {
                if shim_opt.use_tool_shim {
                    shim_envs.push(("GOOSE_TOOLSHIM".to_string(), "true".to_string()));
@@ -224,6 +233,16 @@ impl ModelRunner {
                    }
                }
            }
        }
        shim_envs
    }

    fn load_env_file(path: &PathBuf) -> Result<Vec<(String, String)>> {
        let iter =
            from_path_iter(path).context("Failed to read environment variables from file")?;
        let env_vars = iter
            .map(|item| item.context("Failed to parse environment variable"))
            .collect::<Result<_, _>>()?;
        Ok(env_vars)
    }
}
@@ -1,15 +1,14 @@
use anyhow::Result;
use std::env;
use std::process::{Child, Command};
use std::thread::JoinHandle;
use tracing;

pub fn await_process_exits(
    child_processes: &mut [Child],
    handles: Vec<JoinHandle<anyhow::Result<()>>>,
) {
pub fn await_process_exits(child_processes: &mut [Child], handles: Vec<JoinHandle<Result<()>>>) {
    for child in child_processes.iter_mut() {
        match child.wait() {
            Ok(status) => println!("Child exited with status: {}", status),
            Err(e) => println!("Error waiting for child: {}", e),
            Ok(status) => tracing::info!("Child exited with status: {}", status),
            Err(e) => tracing::error!("Error waiting for child: {}", e),
        }
    }

@@ -18,7 +17,7 @@ pub fn await_process_exits(
            Ok(_res) => (),
            Err(e) => {
                // Handle thread panic
                println!("Thread panicked: {:?}", e);
                tracing::error!("Thread panicked: {:?}", e);
            }
        }
    }
@@ -17,6 +17,7 @@ use crate::session::{build_session, SessionBuilderConfig};
use goose_bench::bench_config::BenchRunConfig;
use goose_bench::runners::bench_runner::BenchRunner;
use goose_bench::runners::eval_runner::EvalRunner;
use goose_bench::runners::metric_aggregator::MetricAggregator;
use goose_bench::runners::model_runner::ModelRunner;
use std::io::Read;
use std::path::PathBuf;
@@ -142,6 +143,19 @@ pub enum BenchCommand {
        #[arg(short, long, help = "A serialized config file for the eval only.")]
        config: String,
    },

    #[command(
        name = "generate-leaderboard",
        about = "Generate a leaderboard CSV from benchmark results"
    )]
    GenerateLeaderboard {
        #[arg(
            short,
            long,
            help = "Path to the benchmark directory containing model evaluation results"
        )]
        benchmark_dir: PathBuf,
    },
}

#[derive(Subcommand)]
@@ -651,6 +665,9 @@ pub async fn cli() -> Result<()> {
            BenchCommand::ExecEval { config } => {
                EvalRunner::from(config)?.run(agent_generator).await?
            }
            BenchCommand::GenerateLeaderboard { benchmark_dir } => {
                MetricAggregator::generate_csv_from_benchmark_dir(&benchmark_dir)?
            }
        }
        return Ok(());
    }
184 scripts/bench-postprocess-scripts/generate_leaderboard.py (new executable file)
@@ -0,0 +1,184 @@
#!/usr/bin/env python3
# Compatible with Python 3.6+
"""
Generate a leaderboard CSV from benchmark results, including server error information.

This script:
1. Looks for model folders in the benchmark directory
2. Finds eval-results/aggregate_metrics.csv in each model folder
3. Extracts key metrics (provider, model_name, eval_suite, eval_name, tool calls, execution time, tokens, score, prompt error, server error)
4. Creates a union of all CSVs with these columns
5. Creates a leaderboard.csv grouping by provider and model_name, averaging numeric columns

Usage:
    python generate_leaderboard.py --benchmark-dir /path/to/benchmark-dir
"""

import argparse
import pandas as pd
from pathlib import Path
import sys


def find_aggregate_metrics_files(benchmark_dir: Path) -> list:
    """Find all aggregate_metrics.csv files in model subdirectories."""
    csv_files = []

    # Look for model directories in the benchmark directory
    for model_dir in benchmark_dir.iterdir():
        if model_dir.is_dir():
            # Look for eval-results/aggregate_metrics.csv in each model directory
            eval_results_dir = model_dir / "eval-results"
            if eval_results_dir.exists() and eval_results_dir.is_dir():
                csv_path = eval_results_dir / "aggregate_metrics.csv"
                if csv_path.exists():
                    csv_files.append(csv_path)

    return csv_files


def process_csv_files(csv_files: list) -> tuple:
    """
    Process all CSV files and create two dataframes:
    1. A union of all CSVs with selected columns
    2. A leaderboard grouping by provider and model_name with averaged metrics
    """
    selected_columns = [
        'provider',
        'model_name',
        'eval_suite',
        'eval_name',
        'total_tool_calls_mean',
        'prompt_execution_time_mean',
        'total_tokens_mean',
        'score_mean',
        'prompt_error_mean',
        'server_error_mean'
    ]

    all_data = []

    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)

            # Check which selected columns are available
            missing_columns = [col for col in selected_columns if col not in df.columns]
            if missing_columns:
                print(f"Warning: {csv_file} is missing columns: {missing_columns}")

                # For missing columns, add them with NaN values
                for col in missing_columns:
                    df[col] = float('nan')

            # Select only the columns we care about
            df_subset = df[selected_columns].copy()  # Create a copy to avoid SettingWithCopyWarning

            # Add model folder name as additional context
            model_folder = csv_file.parent.parent.name
            df_subset['model_folder'] = model_folder

            all_data.append(df_subset)

        except Exception as e:
            print(f"Error processing {csv_file}: {str(e)}")

    if not all_data:
        raise ValueError("No valid CSV files found with required columns")

    # Concatenate all dataframes to create a union
    union_df = pd.concat(all_data, ignore_index=True)

    # Create leaderboard by grouping and averaging numerical columns
    numeric_columns = [
        'total_tool_calls_mean',
        'prompt_execution_time_mean',
        'total_tokens_mean',
        'score_mean',
        'prompt_error_mean',
        'server_error_mean'
    ]

    # Group by provider and model_name, then calculate averages for numeric columns
    leaderboard_df = union_df.groupby(['provider', 'model_name'])[numeric_columns].mean().reset_index()

    # Sort by score_mean in descending order (highest scores first)
    leaderboard_df = leaderboard_df.sort_values('score_mean', ascending=False)

    return union_df, leaderboard_df


def main():
    parser = argparse.ArgumentParser(
        description="Generate a leaderboard CSV from benchmark results, including server error information"
    )
    parser.add_argument(
        "--benchmark-dir",
        type=str,
        required=True,
        help="Path to the benchmark directory containing model subdirectories"
    )
    parser.add_argument(
        "--union-output",
        type=str,
        default="all_metrics.csv",
        help="Output filename for the union of all CSVs (default: all_metrics.csv)"
    )
    parser.add_argument(
        "--leaderboard-output",
        type=str,
        default="leaderboard.csv",
        help="Output filename for the leaderboard (default: leaderboard.csv)"
    )

    args = parser.parse_args()

    benchmark_dir = Path(args.benchmark_dir)
    if not benchmark_dir.exists() or not benchmark_dir.is_dir():
        print(f"Error: Benchmark directory {benchmark_dir} does not exist or is not a directory")
        sys.exit(1)

    try:
        # Find all aggregate_metrics.csv files in model subdirectories
        csv_files = find_aggregate_metrics_files(benchmark_dir)

        if not csv_files:
            print(f"No aggregate_metrics.csv files found in any model directory under {benchmark_dir}")
            sys.exit(1)

        print(f"Found {len(csv_files)} aggregate_metrics.csv files in model directories")

        # Process and create the union and leaderboard dataframes
        union_df, leaderboard_df = process_csv_files(csv_files)

        # Save the union CSV to the benchmark directory
        union_output_path = benchmark_dir / args.union_output
        union_df.to_csv(union_output_path, index=False)
        print(f"Union CSV with all metrics saved to: {union_output_path}")

        # Save the leaderboard CSV to the benchmark directory
        leaderboard_output_path = benchmark_dir / args.leaderboard_output
        leaderboard_df.to_csv(leaderboard_output_path, index=False)
        print(f"Leaderboard CSV with averaged metrics saved to: {leaderboard_output_path}")

        # Print a summary of the leaderboard
        print("\nLeaderboard Summary:")
        pd.set_option('display.max_columns', None)  # Show all columns
        print(leaderboard_df.to_string(index=False))

        # Highlight models with server errors
        if 'server_error_mean' in leaderboard_df.columns:
            models_with_errors = leaderboard_df[leaderboard_df['server_error_mean'] > 0]
            if not models_with_errors.empty:
                print("\nWARNING - Models with server errors detected:")
                for _, row in models_with_errors.iterrows():
                    print(f"  * {row['provider']} {row['model_name']} - {row['server_error_mean']*100:.1f}% of evaluations had server errors")
                print("\nThese models may need to be re-run to get accurate results.")

    except Exception as e:
        print(f"Error: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,8 @@
You are evaluating a response to a summarization task and will give a score of 0, 1, or 2. The instructions were:

'What are the top 5 most counterintuitive insights from this blog post? https://huyenchip.com/2025/01/07/agents.html'

Does the response below appropriately answer the query (ignore formatting)?
0 = does not provide any insights at all
1 = provides some insights, but not all 5
2 = provides all 5 insights
88 scripts/bench-postprocess-scripts/llm-judges/calculate_final_scores_vibes.py (new executable file)
@@ -0,0 +1,88 @@
#!/usr/bin/env python3
"""
Calculate final score for vibes evaluations.
This script combines the LLM judge score with other metrics to produce a final score.
"""

import json
import sys
from pathlib import Path


def get_metric_value(metrics, metric_name):
    """Extract a metric value from the metrics array."""
    for metric in metrics:
        if metric[0] == metric_name:
            value = metric[1]
            if "Float" in value:
                return float(value["Float"])
            elif "Integer" in value:
                return float(value["Integer"])
            elif "Boolean" in value:
                return 1.0 if value["Boolean"] else 0.0
    return None


def calculate_score(eval_name, metrics):
    """Calculate the final score based on the evaluation type."""
    llm_judge_score = get_metric_value(metrics, "llm_judge_score")
    used_fetch_tool = get_metric_value(metrics, "used_fetch_tool")
    valid_markdown_format = get_metric_value(metrics, "valid_markdown_format")

    if llm_judge_score is None:
        raise ValueError("llm_judge_score not found in metrics")

    # Convert boolean metrics to 0/1 if needed
    used_fetch_tool = 1.0 if used_fetch_tool else 0.0
    valid_markdown_format = 1.0 if valid_markdown_format else 0.0

    if eval_name == "blog_summary":
        # max score is 4.0 as llm_judge_score is between [0,2] and used_fetch_tool/valid_markdown_format have values [0,1]
        score = (llm_judge_score + used_fetch_tool + valid_markdown_format) / 4.0
    elif eval_name == "restaurant_research":
        score = (llm_judge_score + valid_markdown_format + used_fetch_tool) / 4.0
    else:
        raise ValueError(f"Unknown evaluation type: {eval_name}")

    return score


def main():
    if len(sys.argv) != 2:
        print("Usage: calculate_final_score.py <eval_name>")
        sys.exit(1)

    eval_name = sys.argv[1]

    # Load eval results from current directory
    eval_results_path = Path("eval-results.json")
    if not eval_results_path.exists():
        print("Error: eval-results.json not found in current directory")
        sys.exit(1)

    with open(eval_results_path, 'r') as f:
        eval_results = json.load(f)

    try:
        # Calculate the final score
        score = calculate_score(eval_name, eval_results["metrics"])

        # Add the score metric
        eval_results["metrics"].append([
            "score",
            {"Float": score}
        ])

        # Save updated results
        with open(eval_results_path, 'w') as f:
            json.dump(eval_results, f, indent=2)

        print(f"Successfully added final score: {score}")

    except Exception as e:
        print(f"Error calculating final score: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
259 scripts/bench-postprocess-scripts/llm-judges/llm_judge.py (new executable file)
@@ -0,0 +1,259 @@
#!/usr/bin/env python3
"""
LLM Judge post-processing script for Goose benchmarks.

This script evaluates benchmark results using OpenAI's API as a judge.
It reads the eval-results.json file and a specified output file, then uses
OpenAI to score the output based on a provided rubric.

Usage:
    python llm_judge.py <output_file> [--rubric-max-score N] [--prompt-file PATH]

Arguments:
    output_file: Name of the file containing the output to evaluate (e.g., blog_summary_output.txt)
    --rubric-max-score: Maximum score for the rubric (default: 2)
    --prompt-file: Path to custom evaluation prompt file
"""

import argparse
import json
import os
import sys
import time
from collections import Counter
from pathlib import Path
from typing import Dict, Any

try:
    from openai import OpenAI
except ImportError:
    print("Error: openai package not found. Please install it with: pip install openai")
    sys.exit(1)


def evaluate_with_openai(prompt: str, text: str, rubric_max_score: int = 2) -> float:
    """Evaluate response using OpenAI's API.

    Args:
        prompt: System prompt for evaluation
        text: Text to evaluate
        rubric_max_score: Maximum score for the rubric (default: 2.0)

    Returns:
        float: Evaluation score (0 to rubric_max_score)

    Raises:
        ValueError: If OPENAI_API_KEY environment variable is not set
    """
    print("Starting OpenAI evaluation...")
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("No OpenAI API key found!")
        raise ValueError("OPENAI_API_KEY environment variable is not set, but is needed to run this evaluation.")

    try:
        client = OpenAI(api_key=api_key)

        # Append output instructions to system prompt
        output_instructions = f"""
Output Instructions:
Return your evaluation as a JSON object in the following format:
{{
    "reasoning": "Your brief reasoning for the score",
    "score": <integer between 0 and {rubric_max_score}>
}}

IMPORTANT:
- Do not use any markdown formatting (no ```json blocks)
- Do not include any additional text before or after the JSON
- Return only the raw JSON object
- The score must be an integer between 0 and {rubric_max_score}"""

        input_prompt = f"{prompt} {output_instructions}\nResponse to evaluate: {text}"

        # Run the chat completion 3 times and collect scores
        scores = []
        for i in range(3):
            max_retries = 5
            retry_count = 0

            while retry_count < max_retries:
                try:
                    response = client.chat.completions.create(
                        model="gpt-4o",
                        messages=[
                            {"role": "user", "content": input_prompt}
                        ],
                        temperature=0.9
                    )

                    # Extract and parse JSON from response
                    response_text = response.choices[0].message.content.strip()
                    try:
                        evaluation = json.loads(response_text)
                        score = float(evaluation.get("score", 0.0))
                        score = max(0.0, min(score, rubric_max_score))
                        scores.append(score)
                        print(f"Run {i+1} score: {score}")
                        break  # Successfully parsed, exit retry loop
                    except (json.JSONDecodeError, ValueError) as e:
                        retry_count += 1
                        print(f"Error parsing OpenAI response as JSON (attempt {retry_count}/{max_retries}): {str(e)}")
                        print(f"Response text: {response_text}")
                        if retry_count == max_retries:
                            raise ValueError(f"Failed to parse OpenAI evaluation response after {max_retries} attempts: {str(e)}")
                        print("Retrying...")
                        time.sleep(1)  # Wait 1 second before retrying
                        continue
                except Exception as e:
                    # For other exceptions (API errors, etc.), raise immediately
                    print(f"API error: {str(e)}")
                    raise

        # Count occurrences of each score
        score_counts = Counter(scores)

        # If there's no single most common score (all scores are different), run one more time
        if len(scores) == 3 and max(score_counts.values()) == 1:
            print("No majority score found. Running tie-breaker...")
            max_retries = 5
            retry_count = 0

            while retry_count < max_retries:
                try:
                    response = client.chat.completions.create(
                        model="gpt-4o",
                        messages=[
                            {"role": "user", "content": input_prompt}
                        ],
                        temperature=0.9
                    )

                    response_text = response.choices[0].message.content.strip()
                    try:
                        evaluation = json.loads(response_text)
                        score = float(evaluation.get("score", 0.0))
                        score = max(0.0, min(score, rubric_max_score))
                        scores.append(score)
                        print(f"Tie-breaker score: {score}")
                        score_counts = Counter(scores)
                        break  # Successfully parsed, exit retry loop
                    except (json.JSONDecodeError, ValueError) as e:
                        retry_count += 1
                        print(f"Error parsing tie-breaker response as JSON (attempt {retry_count}/{max_retries}): {str(e)}")
                        print(f"Response text: {response_text}")
                        if retry_count == max_retries:
                            raise ValueError(f"Failed to parse tie-breaker response after {max_retries} attempts: {str(e)}")
                        print("Retrying tie-breaker...")
                        time.sleep(1)  # Wait 1 second before retrying
                        continue
                except Exception as e:
                    # For other exceptions (API errors, etc.), raise immediately
                    print(f"API error in tie-breaker: {str(e)}")
                    raise

        # Get the most common score
        most_common_score = score_counts.most_common(1)[0][0]
        print(f"Most common score: {most_common_score} (occurred {score_counts[most_common_score]} times)")
        return most_common_score

    except Exception as e:
        if "OPENAI_API_KEY" in str(e):
            raise  # Re-raise API key errors
        print(f"Error evaluating with OpenAI: {str(e)}")
        raise ValueError(f"OpenAI evaluation failed: {str(e)}")


def load_eval_results(working_dir: Path) -> Dict[str, Any]:
    """Load the eval-results.json file from the working directory."""
    eval_results_path = working_dir / "eval-results.json"
    if not eval_results_path.exists():
        raise FileNotFoundError(f"eval-results.json not found in {working_dir}")

    with open(eval_results_path, 'r') as f:
        return json.load(f)


def load_output_file(working_dir: Path, output_file: str) -> str:
    """Load the output file to evaluate from the working directory."""
    output_path = working_dir / output_file
    if not output_path.exists():
        raise FileNotFoundError(f"Output file not found: {output_path}")

    with open(output_path, 'r') as f:
        return f.read().strip()


def load_evaluation_prompt(working_dir: Path) -> str:
    """Load the evaluation prompt from a file or use a default.

    This function looks for a prompt.txt file in the working directory.
    If not found, it returns a default evaluation prompt.
    """
    prompt_file = working_dir / "prompt.txt"
    if prompt_file.exists():
        with open(prompt_file, 'r') as f:
            return f.read().strip()

    # Default evaluation prompt
    return """You are an expert evaluator assessing the quality of AI responses.
Evaluate the response based on the following criteria:
- Accuracy and correctness
- Completeness of the answer
- Clarity and coherence
- Helpfulness to the user

Score the response on a scale from 0 to 2:
0 = Poor response (incorrect, incomplete, or unhelpful)
1 = Acceptable response (partially correct but with issues)
2 = Excellent response (correct, complete, and helpful)"""


def main():
    parser = argparse.ArgumentParser(description="LLM Judge post-processing script for Goose benchmarks")
    parser.add_argument("output_file", type=str, help="Name of the output file to evaluate (e.g., blog_summary_output.txt)")
    parser.add_argument("--rubric-max-score", type=int, default=2, help="Maximum score for the rubric (default: 2)")
    parser.add_argument("--prompt-file", type=str, help="Path to custom evaluation prompt file")

    args = parser.parse_args()

    # Use current working directory
    working_dir = Path.cwd()

    try:
        # Load eval results
        eval_results = load_eval_results(working_dir)

        # Load the output file to evaluate
        response_text = load_output_file(working_dir, args.output_file)

        # Load evaluation prompt
        if args.prompt_file:
            with open(args.prompt_file, 'r') as f:
                evaluation_prompt = f.read().strip()
        else:
            evaluation_prompt = load_evaluation_prompt(working_dir)

        # Evaluate with OpenAI
        score = evaluate_with_openai(evaluation_prompt, response_text, args.rubric_max_score)

        # Update eval results with the score
        eval_results["metrics"].append([
            "llm_judge_score",
            {"Float": score}
        ])

        # Save updated results
        eval_results_path = working_dir / "eval-results.json"
        with open(eval_results_path, 'w') as f:
            json.dump(eval_results, f, indent=2)

        print(f"Successfully updated eval-results.json with LLM judge score: {score}")

    except Exception as e:
        print(f"Error: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,15 @@
You are evaluating an AI assistant's response to a restaurant research task. The instructions were:

'Search the internet for and provide a current, detailed list of the best Sichuanese restaurants specifically in the East Village neighborhood of NYC. Format your response in Markdown using bullet points (either - or *) for each restaurant. For each restaurant include:
- Restaurant name and what they're known for
- Signature dishes
- Atmosphere/setting
- Any relevant details about reservations or dining experience
- What distinguishes them from others

Present the information in order of significance or quality. Focus specifically on Sichuanese establishments, not general Chinese restaurants. If you encounter a page you cannot access, try another one. Do not ask me for confirmation just conduct the searches yourself until you find the needed information. Remember to use your tools if applicable.'

Give a score of 0, 1, or 2:
0 = does not provide any restaurants at all
1 = provides some restaurants, but not all are Sichuanese or in the East Village NYC
2 = provides all Sichuanese restaurants in the East Village, probably including Mala project and Szechuan Mountain House, or Uluh. Use your memory/knowledge of the East Village NYC restaurants to double check non-East Village restaurants.
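
The judge prompts score on a 0 to 2 scale, and llm_judge.py records the result as a `["llm_judge_score", {"Float": ...}]` pair in eval-results.json. A small sketch for inspecting that value after a run (the file name and metric layout are taken from the scripts above; everything else is illustrative):

```bash
# Print the recorded LLM judge score, if any, from the current eval directory.
python3 - <<'PY'
import json

results = json.load(open("eval-results.json"))
for item in results.get("metrics", []):
    # Metrics are stored as [name, value] pairs, e.g. ["llm_judge_score", {"Float": 2.0}]
    if isinstance(item, list) and len(item) == 2 and item[0] == "llm_judge_score":
        print(item[1])
PY
```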
50
scripts/bench-postprocess-scripts/llm-judges/run_vibes_judge.sh
Executable file
@@ -0,0 +1,50 @@
#!/bin/bash
# Wrapper script for LLM judge post-processing and final score calculation
# This script is called by the benchmark runner with the eval results file as an argument

# Get the directory where this script is located
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Get the eval results file path from the first argument
EVAL_RESULTS_FILE="$1"

# Extract the working directory from the eval results file path
WORKING_DIR="$(dirname "$EVAL_RESULTS_FILE")"

# Change to the working directory
cd "$WORKING_DIR"

# Determine the evaluation name from the eval-results.json
EVAL_NAME=$(python3 -c "import json; print(json.load(open('eval-results.json'))['name'])")

# Set the output file name and prompt file based on the evaluation
if [ "$EVAL_NAME" = "blog_summary" ]; then
    OUTPUT_FILE="blog_summary_output.txt"
    PROMPT_FILE="$SCRIPT_DIR/blog_summary_prompt.txt"
elif [ "$EVAL_NAME" = "restaurant_research" ]; then
    OUTPUT_FILE="restaurant_research_output.txt"
    PROMPT_FILE="$SCRIPT_DIR/restaurant_research_prompt.txt"
else
    echo "Error: Unknown evaluation name: $EVAL_NAME"
    exit 1
fi

# Run the LLM judge script with the appropriate arguments
python3 "$SCRIPT_DIR/llm_judge.py" "$OUTPUT_FILE" --prompt-file "$PROMPT_FILE"

# Check if LLM judge succeeded
if [ $? -ne 0 ]; then
    echo "Error: LLM judge failed"
    exit 1
fi

# Calculate the final score
python3 "$SCRIPT_DIR/calculate_final_scores_vibes.py" "$EVAL_NAME"

# Check if score calculation succeeded
if [ $? -ne 0 ]; then
    echo "Error: Final score calculation failed"
    exit 1
fi

echo "Successfully completed post-processing for $EVAL_NAME"
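
A minimal sketch of calling this wrapper directly, assuming the convention stated in its header comment that the benchmark runner passes the eval-results.json path as the first argument (the concrete path below is a placeholder):

```bash
# The script cd's into the directory containing eval-results.json, reads the
# evaluation name from it, runs the LLM judge, then computes the final score.
./scripts/bench-postprocess-scripts/llm-judges/run_vibes_judge.sh \
    /path/to/benchmark-dir/provider-model/run-0/suite/blog_summary/eval-results.json
```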
300
scripts/bench-postprocess-scripts/prepare_aggregate_metrics.py
Executable file
@@ -0,0 +1,300 @@
#!/usr/bin/env python3
# Compatible with Python 3.6+
"""
Prepare aggregate_metrics.csv files from individual eval-results.json files with error detection.

This script:
1. Looks for model folders in the benchmark directory
2. For each model folder, finds all eval-results.json files in subfolders
3. Checks session files for server errors
4. Extracts metrics from these files and combines them
5. Creates an eval-results directory in each model folder
6. Saves an aggregate_metrics.csv file with aggregated metrics

Usage:
    python prepare_aggregate_metrics.py --benchmark-dir /path/to/benchmark-dir
"""

import argparse
import json
import pandas as pd
from pathlib import Path
import sys

def extract_provider_model(model_dir):
    """Extract provider and model name from directory name."""
    dir_name = model_dir.name
    parts = dir_name.split('-')

    if len(parts) > 1:
        model_name = parts[-1]  # Last part is the model name
        provider = '-'.join(parts[:-1])  # Everything else is the provider
    else:
        model_name = dir_name
        provider = "unknown"

    return provider, model_name

def find_eval_results_files(model_dir):
    """Find all eval-results.json files in a model directory."""
    return list(model_dir.glob("**/eval-results.json"))

def find_session_files(model_dir):
    """Find all session jsonl files in a model directory."""
    return list(model_dir.glob("**/*.jsonl"))

def check_for_errors_in_session(session_file):
    """Check if a session file contains server errors."""
    try:
        error_found = False
        error_messages = []

        with open(session_file, 'r') as f:
            for line in f:
                try:
                    message_obj = json.loads(line.strip())
                    # Check for error messages in the content
                    if 'content' in message_obj and isinstance(message_obj['content'], list):
                        for content_item in message_obj['content']:
                            if isinstance(content_item, dict) and 'text' in content_item:
                                text = content_item['text']
                                if 'Server error' in text or 'error_code' in text or 'TEMPORARILY_UNAVAILABLE' in text:
                                    error_found = True
                                    error_messages.append(text)
                except json.JSONDecodeError:
                    continue

        return error_found, error_messages
    except Exception as e:
        print(f"Error checking session file {session_file}: {str(e)}")
        return False, []

def extract_metrics_from_eval_file(eval_file, provider, model_name, session_files):
    """Extract metrics from an eval-results.json file."""
    try:
        with open(eval_file, 'r') as f:
            data = json.load(f)

        # Extract directory structure to determine eval suite and name
        path_parts = eval_file.parts
        run_index = -1
        for i, part in enumerate(path_parts):
            if part.startswith("run-"):
                run_index = i
                break

        if run_index == -1 or run_index + 2 >= len(path_parts):
            print(f"Warning: Could not determine eval suite and name from {eval_file}")
            return None

        run_number = path_parts[run_index].split('-')[1]  # Extract "0" from "run-0"
        eval_suite = path_parts[run_index + 1]  # Directory after run-N
        eval_name = path_parts[run_index + 2]  # Directory after eval_suite

        # Create a row with basic identification
        row = {
            'provider': provider,
            'model_name': model_name,
            'eval_suite': eval_suite,
            'eval_name': eval_name,
            'run': run_number
        }

        # Check for server errors in session files for this evaluation
        eval_dir = eval_file.parent
        related_session_files = [sf for sf in session_files if eval_dir in sf.parents]

        server_error_found = False
        for session_file in related_session_files:
            error_found, _ = check_for_errors_in_session(session_file)
            if error_found:
                server_error_found = True
                break

        # Add server error flag
        row['server_error'] = 1 if server_error_found else 0

        # Extract all metrics (flatten the JSON structure)
        if isinstance(data, dict):
            metrics = {}

            # Extract top-level metrics
            for key, value in data.items():
                if isinstance(value, (int, float)) and not isinstance(value, bool):
                    metrics[key] = value

            # Look for nested metrics structure (list of [name, value] pairs)
            if 'metrics' in data and isinstance(data['metrics'], list):
                for metric_item in data['metrics']:
                    if isinstance(metric_item, list) and len(metric_item) == 2:
                        metric_name = metric_item[0]
                        metric_value = metric_item[1]

                        # Handle different value formats
                        if isinstance(metric_value, dict):
                            if 'Integer' in metric_value:
                                metrics[metric_name] = int(metric_value['Integer'])
                            elif 'Float' in metric_value:
                                metrics[metric_name] = float(metric_value['Float'])
                            elif 'Bool' in metric_value:
                                metrics[metric_name] = 1 if metric_value['Bool'] else 0
                            # Skip string values for aggregation
                        elif isinstance(metric_value, (int, float)) and not isinstance(metric_value, bool):
                            metrics[metric_name] = metric_value
                        elif isinstance(metric_value, bool):
                            metrics[metric_name] = 1 if metric_value else 0

            # Look for metrics in other common locations
            for metric_location in ['metrics', 'result', 'evaluation']:
                if metric_location in data and isinstance(data[metric_location], dict):
                    for key, value in data[metric_location].items():
                        if isinstance(value, (int, float)) and not isinstance(value, bool):
                            metrics[key] = value
                        elif isinstance(value, bool):
                            metrics[key] = 1 if value else 0

            # Add all metrics to the row
            row.update(metrics)

            # Ensure a score is present (if not, add a placeholder)
            if 'score' not in row:
                # Try to use existing fields to calculate a score
                if server_error_found:
                    row['score'] = 0  # Failed runs get a zero score
                else:
                    # Set a default based on presence of "success" fields
                    for key in row:
                        if 'success' in key.lower() and isinstance(row[key], (int, float)):
                            row['score'] = row[key]
                            break
                    else:
                        # No success field found, mark as NaN
                        row['score'] = float('nan')

            return row
        else:
            print(f"Warning: Unexpected format in {eval_file}")
            return None

    except Exception as e:
        print(f"Error processing {eval_file}: {str(e)}")
        return None

def process_model_directory(model_dir):
    """Process a model directory to create aggregate_metrics.csv."""
    provider, model_name = extract_provider_model(model_dir)

    # Find all eval results files
    eval_files = find_eval_results_files(model_dir)
    if not eval_files:
        print(f"No eval-results.json files found in {model_dir}")
        return False

    # Find all session files for error checking
    session_files = find_session_files(model_dir)

    # Extract metrics from each eval file
    rows = []
    for eval_file in eval_files:
        row = extract_metrics_from_eval_file(eval_file, provider, model_name, session_files)
        if row is not None:
            rows.append(row)

    if not rows:
        print(f"No valid metrics extracted from {model_dir}")
        return False

    # Create a dataframe from all rows
    combined_df = pd.DataFrame(rows)

    # Calculate aggregates for numeric columns, grouped by eval_suite, eval_name
    numeric_cols = combined_df.select_dtypes(include=['number']).columns.tolist()
    # Exclude the run column from aggregation
    if 'run' in numeric_cols:
        numeric_cols.remove('run')

    # Group by provider, model_name, eval_suite, eval_name and calculate mean for numeric columns
    group_by_cols = ['provider', 'model_name', 'eval_suite', 'eval_name']
    agg_dict = {col: 'mean' for col in numeric_cols}

    # Only perform aggregation if we have numeric columns
    if numeric_cols:
        aggregate_df = combined_df.groupby(group_by_cols).agg(agg_dict).reset_index()

        # Rename columns to add _mean suffix for the averaged metrics
        for col in numeric_cols:
            aggregate_df = aggregate_df.rename(columns={col: f"{col}_mean"})
    else:
        print(f"Warning: No numeric metrics found in {model_dir}")
        # Create a minimal dataframe with just the grouping columns
        aggregate_df = combined_df[group_by_cols].drop_duplicates()

    # Make sure we have prompt_execution_time_mean and prompt_error_mean columns
    # These are expected by the generate_leaderboard.py script
    if 'prompt_execution_time_mean' not in aggregate_df.columns:
        aggregate_df['prompt_execution_time_mean'] = float('nan')

    if 'prompt_error_mean' not in aggregate_df.columns:
        aggregate_df['prompt_error_mean'] = float('nan')

    # Add server_error_mean column if not present
    if 'server_error_mean' not in aggregate_df.columns:
        aggregate_df['server_error_mean'] = 0.0

    # Create eval-results directory
    eval_results_dir = model_dir / "eval-results"
    eval_results_dir.mkdir(exist_ok=True)

    # Save to CSV
    csv_path = eval_results_dir / "aggregate_metrics.csv"
    aggregate_df.to_csv(csv_path, index=False)

    # Count number of evaluations that had server errors
    if 'server_error_mean' in aggregate_df.columns:
        error_count = len(aggregate_df[aggregate_df['server_error_mean'] > 0])
        total_count = len(aggregate_df)
        print(f"Saved aggregate metrics to {csv_path} with {len(aggregate_df)} rows " +
              f"({error_count}/{total_count} evals had server errors)")
    else:
        print(f"Saved aggregate metrics to {csv_path} with {len(aggregate_df)} rows")

    return True

def main():
    parser = argparse.ArgumentParser(
        description="Prepare aggregate_metrics.csv files from eval-results.json files with error detection"
    )
    parser.add_argument(
        "--benchmark-dir",
        type=str,
        required=True,
        help="Path to the benchmark directory containing model subdirectories"
    )

    args = parser.parse_args()

    # Convert path to Path object and validate it exists
    benchmark_dir = Path(args.benchmark_dir)
    if not benchmark_dir.exists() or not benchmark_dir.is_dir():
        print(f"Error: Benchmark directory {benchmark_dir} does not exist or is not a directory")
        sys.exit(1)

    success_count = 0

    # Process each model directory
    for model_dir in benchmark_dir.iterdir():
        if model_dir.is_dir() and not model_dir.name.startswith('.'):
            if process_model_directory(model_dir):
                success_count += 1

    if success_count == 0:
        print("No aggregate_metrics.csv files were created")
        sys.exit(1)

    print(f"Successfully created aggregate_metrics.csv files for {success_count} model directories")
    print("You can now run generate_leaderboard.py to create the final leaderboard.")
    print("Note: The server_error_mean column indicates the average rate of server errors across evaluations.")

if __name__ == "__main__":
    main()
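
To close the loop for one benchmark run, a hedged sketch of invoking this aggregation step (the benchmark directory name below is a placeholder; the --benchmark-dir flag and the output location come from the script itself):

```bash
# Writes eval-results/aggregate_metrics.csv into each model folder under the benchmark dir.
python3 scripts/bench-postprocess-scripts/prepare_aggregate_metrics.py \
    --benchmark-dir ./benchmark-2025-01-01-12:00:00

# Each model folder should now contain eval-results/aggregate_metrics.csv,
# ready for the generate_leaderboard.py step the script points to.
```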