feat: add additional goosebench evals (#1571)

Co-authored-by: Alice Hau <alice.a.hau@gmail.com>
Author: Alice Hau
Date: 2025-03-10 15:11:44 -04:00
Committed by: GitHub
Parent: 8689d24407
Commit: bb4feacf03
14 changed files with 859 additions and 3 deletions

File diff suppressed because one or more lines are too long


@@ -35,6 +35,9 @@ pub trait BenchAgent: Send + Sync {
// Make get_errors async
async fn get_errors(&self) -> Vec<BenchAgentError>;
// Get token usage information
async fn get_token_usage(&self) -> Option<i32>;
}
#[async_trait]
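As a rough illustration only (not part of this commit), a caller of the extended trait might combine the new method with the existing error query; the report_run helper below is hypothetical:

// Hypothetical helper, not from this diff: summarize a finished agent run.
async fn report_run(agent: &dyn BenchAgent) {
    let errors = agent.get_errors().await;
    // get_token_usage returns None when the agent cannot report usage.
    match agent.get_token_usage().await {
        Some(tokens) => println!("run used {} tokens ({} errors)", tokens, errors.len()),
        None => println!("token usage unavailable ({} errors)", errors.len()),
    }
}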


@@ -0,0 +1,105 @@
use crate::eval_suites::{BenchAgent, EvaluationMetric};
use goose::message::{Message, MessageContent};
use std::collections::HashMap;
use std::time::Instant;
/// Collect baseline metrics including execution time, tool usage, and token count
pub async fn collect_baseline_metrics(
agent: &mut Box<dyn BenchAgent>,
prompt: String,
) -> (Vec<Message>, HashMap<String, EvaluationMetric>) {
// Initialize metrics map
let mut metrics = HashMap::new();
// Start timer
let start_time = Instant::now();
// Execute prompt
let messages = match agent.prompt(prompt).await {
Ok(msgs) => msgs,
Err(e) => {
metrics.insert(
"prompt_error".to_string(),
EvaluationMetric::String(format!("Error: {}", e)),
);
Vec::new()
}
};
// Calculate execution time
let execution_time = start_time.elapsed();
metrics.insert(
"prompt_execution_time_seconds".to_string(),
EvaluationMetric::Float(execution_time.as_secs_f64()),
);
// Count tool calls
let (total_tool_calls, tool_calls_by_name) = count_tool_calls(&messages);
metrics.insert(
"total_tool_calls".to_string(),
EvaluationMetric::Integer(total_tool_calls),
);
// Add tool calls by name metrics
for (tool_name, count) in tool_calls_by_name {
metrics.insert(
format!("tool_calls_{}", tool_name),
EvaluationMetric::Integer(count),
);
}
// Get token usage information if available
if let Some(token_count) = agent.get_token_usage().await {
metrics.insert(
"total_tokens".to_string(),
EvaluationMetric::Integer(token_count as i64),
);
}
(messages, metrics)
}
/// Count all tool calls in messages and categorize by tool name
fn count_tool_calls(messages: &[Message]) -> (i64, HashMap<String, i64>) {
let mut total_count = 0;
let mut counts_by_name = HashMap::new();
for message in messages {
for content in &message.content {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
total_count += 1;
// Count by name
*counts_by_name.entry(tool_call.name.clone()).or_insert(0) += 1;
}
}
}
}
(total_count, counts_by_name)
}
/// Convert HashMap of metrics to Vec
pub fn metrics_hashmap_to_vec(
metrics: HashMap<String, EvaluationMetric>,
) -> Vec<(String, EvaluationMetric)> {
metrics.into_iter().collect()
}
/// Check if a specific tool was used in any of the messages
pub fn used_tool(messages: &[Message], tool_name: &str) -> bool {
messages.iter().any(|msg| {
msg.content.iter().any(|content| {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
tool_call.name.contains(tool_name)
} else {
false
}
} else {
false
}
})
})
}
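As a rough sketch (not part of this commit) of how the evaluations below are expected to combine these helpers; the prompt text and metric key are placeholders:

// Illustrative only: typical use of the helpers above inside an Evaluation::run body.
async fn example_usage(mut agent: Box<dyn BenchAgent>) -> Vec<(String, EvaluationMetric)> {
    let (messages, perf) = collect_baseline_metrics(&mut agent, "placeholder prompt".to_string()).await;
    let mut metrics = metrics_hashmap_to_vec(perf);
    // used_tool matches on a substring, so "text_editor" also matches "developer__text_editor".
    metrics.push((
        "used_text_editor".to_string(),
        EvaluationMetric::Boolean(used_tool(&messages, "text_editor")),
    ));
    metrics
}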


@@ -1,6 +1,11 @@
mod core;
mod evaluation;
mod factory;
mod metrics;
mod utils;
mod vibes;
pub use evaluation::*;
pub use factory::{register_evaluation, EvaluationSuiteFactory};
pub use metrics::*;
pub use utils::*;


@@ -0,0 +1,69 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use anyhow::{Context, Result};
use goose::message::Message;
use goose::session::storage;
use std::fs::{self, File};
use std::io::Write;
use std::path::PathBuf;
/// Write the last agent message to a file
/// Returns the text content of the message, or an error if writing failed
pub fn write_response_to_file(
messages: &[Message],
_work_dir: &mut BenchmarkWorkDir, // Kept for API compatibility
filename: &str,
) -> Result<String> {
let last_msg = messages
.last()
.ok_or_else(|| anyhow::anyhow!("No messages to write to file"))?;
let text_content = last_msg.as_concat_text();
// Create a file in the current directory
let output_path = PathBuf::from(filename);
// Create and write to the file
let mut file = File::create(&output_path)
.with_context(|| format!("Failed to create file at {}", output_path.display()))?;
file.write_all(text_content.as_bytes())
.with_context(|| format!("Failed to write content to {}", output_path.display()))?;
Ok(text_content)
}
/// Copy the most recent session file to the current working directory
///
/// This function finds the most recent Goose session file (.jsonl) and copies it
/// to the current working directory. Session files are stored by the Goose framework
/// in a platform-specific data directory.
///
/// # Returns
/// - Ok(session_path) if successfully copied, where session_path is the path to the copied file
/// - Err if any errors occurred during the process
pub fn copy_session_to_cwd() -> Result<PathBuf> {
// Try to get the most recent session file
let src_path = storage::get_most_recent_session()
.with_context(|| "Failed to find any recent session files")?;
// Extract the filename from the path
let filename = src_path
.file_name()
.ok_or_else(|| anyhow::anyhow!("Invalid session filename"))?;
// Create the destination path in the current directory
let dest_path = PathBuf::from(".").join(filename);
// Copy the file
fs::copy(&src_path, &dest_path).with_context(|| {
format!(
"Failed to copy from '{}' to '{}'",
src_path.display(),
dest_path.display()
)
})?;
println!("Session file copied to: {}", dest_path.display());
Ok(dest_path)
}
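As a condensed, hypothetical sketch (not part of this commit) of how the vibes evaluations below use these two helpers defensively; the function name and output filename are placeholders:

// Illustrative only: best-effort bookkeeping at the end of an evaluation run.
fn finish_eval(messages: &[Message], work_dir: &mut BenchmarkWorkDir) -> String {
    let text = match write_response_to_file(messages, work_dir, "example_output.txt") {
        Ok(text) => text,
        Err(e) => {
            // Fall back to the last message so scoring can continue.
            println!("Warning: failed to write output: {}", e);
            messages.last().map_or_else(String::new, |m| m.as_concat_text())
        }
    };
    if let Err(e) = copy_session_to_cwd() {
        println!("Warning: failed to copy session file: {}", e);
    }
    text
}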


@@ -0,0 +1,89 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, write_response_to_file,
BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
pub struct BlogSummary {}
impl BlogSummary {
pub fn new() -> Self {
BlogSummary {}
}
fn check_markdown_numbered_list(&self, text: &str) -> bool {
// Check if all numbers 1-5 exist in markdown numbered list format
(1..=5).all(|n| text.contains(&format!("{}.", n)))
}
}
#[async_trait]
impl Evaluation for BlogSummary {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
println!("BlogSummary - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (response, perf_metrics) = collect_baseline_metrics(
&mut agent,
"What are the top 5 most counterintuitive insights from this blog post? Format your response in Markdown with 5 numbered points (1. 2. 3. 4. 5.) https://huyenchip.com/2025/01/07/agents.html".to_string()
).await;
// Write response to file and get the text content
let response_text =
match write_response_to_file(&response, work_dir, "blog_summary_output.txt") {
Ok(text) => text,
Err(e) => {
println!("Warning: Failed to write blog summary output: {}", e);
// If file write fails, still continue with the evaluation
response
.last()
.map_or_else(String::new, |msg| msg.as_concat_text())
}
};
// Convert HashMap to Vec for our metrics
let mut metrics = metrics_hashmap_to_vec(perf_metrics);
// Check if the content follows the markdown numbered list format
let has_markdown_list = self.check_markdown_numbered_list(&response_text);
metrics.push((
"valid_markdown_format".to_string(),
EvaluationMetric::Boolean(has_markdown_list),
));
// Check if the fetch tool was used
let used_fetch_tool = crate::eval_suites::used_tool(&response, "fetch");
metrics.push((
"used_fetch_tool".to_string(),
EvaluationMetric::Boolean(used_fetch_tool),
));
// Copy the session file to the current working directory
if let Err(e) = copy_session_to_cwd() {
println!("Warning: Failed to copy session file: {}", e);
} else {
println!("Successfully copied session file to current directory");
}
Ok(metrics)
}
fn name(&self) -> &str {
"blog_summary"
}
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: vec!["uvx mcp-server-fetch".to_string()],
}
}
}
register_evaluation!("vibes", BlogSummary);


@@ -0,0 +1,121 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, BenchAgent, Evaluation,
EvaluationMetric, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
use goose::message::MessageContent;
use mcp_core::role::Role;
use serde_json::{self, Value};
use std::fs;
pub struct FlappyBird {}
impl FlappyBird {
pub fn new() -> Self {
FlappyBird {}
}
fn check_python_implementation(&self, content: &str) -> bool {
content.contains("import pygame") &&
content.contains("pygame.init()") &&
content.contains("while") && // Game loop
content.contains("pygame.event.get()") && // Event handling
content.contains("def main") && // Main function
content.contains("if __name__ == '__main__'") // Main guard
}
}
#[async_trait]
impl Evaluation for FlappyBird {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
println!("FlappyBird - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
"Create a Flappy Bird game in Python. Structure the code with a main function and use the if __name__ == '__main__': idiom. You must use pygame. The background color should be a light blue color. Pressing SPACE multiple times will accelerate the bird. The bird's shape should be a red circle. Place on the bottom some land colored as dark yellow chosen. Make a score shown on the top right side. Increment if you pass pipes and don't hit them. Make randomly spaced dark green pipes with enough space. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again. When trying to run the game, make sure to use pyenv and create the environment in the current working directory. The final game should be written to a file named flappy_bird.py. Remember to use your tools if applicable.".to_string()
).await;
// Convert HashMap to Vec for our metrics
let mut metrics = metrics_hashmap_to_vec(perf_metrics);
// Check if the agent used the text editor tool correctly
let valid_tool_call = messages.iter().any(|msg| {
msg.role == Role::Assistant
&& msg.content.iter().any(|content| {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
// Check tool name and basic parameters
if tool_call.name != "developer__text_editor" {
return false;
}
// Parse the arguments as JSON
if let Ok(args) =
serde_json::from_value::<Value>(tool_call.arguments.clone())
{
// Only check command is write and correct filename
args.get("command").and_then(Value::as_str) == Some("write")
&& args
.get("path")
.and_then(Value::as_str)
.is_some_and(|s| s.contains("flappy_bird.py"))
} else {
false
}
} else {
false
}
} else {
false
}
})
});
metrics.push((
"used_write_tool".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
));
// If tool was used correctly, check the actual file content
if valid_tool_call {
if let Ok(file_path) = work_dir.fs_get("flappy_bird.py".to_string()) {
if let Ok(content) = fs::read_to_string(file_path) {
let valid_implementation = self.check_python_implementation(&content);
metrics.push((
"valid_implementation".to_string(),
EvaluationMetric::Boolean(valid_implementation),
));
}
}
}
// Copy the session file to the current working directory
if let Err(e) = copy_session_to_cwd() {
println!("Warning: Failed to copy session file: {}", e);
} else {
println!("Successfully copied session file to current directory");
}
Ok(metrics)
}
fn name(&self) -> &str {
"flappy_bird"
}
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: Vec::new(),
}
}
}
register_evaluation!("vibes", FlappyBird);


@@ -0,0 +1,99 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, BenchAgent, Evaluation,
EvaluationMetric, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
use goose::message::MessageContent;
use mcp_core::role::Role;
use serde_json::{self, Value};
pub struct GooseWiki {}
impl GooseWiki {
pub fn new() -> Self {
GooseWiki {}
}
}
#[async_trait]
impl Evaluation for GooseWiki {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
println!("GooseWiki - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
"Create a Wikipedia-style web page about Goose (Block's AI agent) in a new index.html file. The page should be a complete, well-structured HTML document with proper head and body sections. Use heading tags (h1, h2, h3) to organize the content into clear sections. Include comprehensive information about Goose organized in a way similar to how Wikipedia presents technical topics. Remember to use your tools if applicable.".to_string()
).await;
// Convert HashMap to Vec for our metrics
let mut metrics = metrics_hashmap_to_vec(perf_metrics);
// Check if the agent used the text editor tool to create index.html
let valid_tool_call = messages.iter().any(|msg| {
msg.role == Role::Assistant &&
msg.content.iter().any(|content| {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
// Check tool name is correct
if tool_call.name != "developer__text_editor" {
return false;
}
// Parse the arguments as JSON
if let Ok(args) = serde_json::from_value::<Value>(tool_call.arguments.clone()) {
// Check command is write and path contains index.html
args.get("command").and_then(Value::as_str) == Some("write") &&
args.get("path").and_then(Value::as_str).is_some_and(|s| s.contains("index.html")) &&
// Verify file_text contains basic HTML structure
args.get("file_text").and_then(Value::as_str).is_some_and(|s| {
s.contains("<html") && s.contains("</html>") &&
s.contains("<head") && s.contains("</head>") &&
s.contains("<body") && s.contains("</body>")
})
} else {
false
}
} else {
false
}
} else {
false
}
})
});
metrics.push((
"created_valid_html".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
));
// Copy the session file to the current working directory
if let Err(e) = copy_session_to_cwd() {
println!("Warning: Failed to copy session file: {}", e);
} else {
println!("Successfully copied session file to current directory");
}
Ok(metrics)
}
fn name(&self) -> &str {
"goose_wiki"
}
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: Vec::new(),
}
}
}
register_evaluation!("vibes", GooseWiki);


@@ -0,0 +1,5 @@
mod blog_summary;
mod flappy_bird;
mod goose_wiki;
mod restaurant_research;
mod squirrel_census;


@@ -0,0 +1,109 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, write_response_to_file,
BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
pub struct RestaurantResearch {}
impl RestaurantResearch {
pub fn new() -> Self {
RestaurantResearch {}
}
fn check_markdown_bullets(&self, text: &str) -> bool {
// Check if there's at least one bullet point and proper markdown formatting
text.contains("- ") || text.contains("* ")
}
fn count_bullet_points(&self, text: &str) -> i64 {
// Count total bullet points (either - or * style)
let dash_bullets = text.matches("- ").count();
let star_bullets = text.matches("* ").count();
(dash_bullets + star_bullets) as i64
}
}
#[async_trait]
impl Evaluation for RestaurantResearch {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
println!("RestaurantResearch - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (response, perf_metrics) = collect_baseline_metrics(
&mut agent,
"Search the internet for and provide a current, detailed list of the best Sichuanese restaurants specifically in the East Village neighborhood of NYC. Format your response in Markdown using bullet points (either - or *) for each restaurant. For each restaurant include:
- Restaurant name and what they're known for
- Signature dishes
- Atmosphere/setting
- Any relevant details about reservations or dining experience
- What distinguishes them from others
Present the information in order of significance or quality. Focus specifically on Sichuanese establishments, not general Chinese restaurants. If you encounter a page you cannot access, try another one. Do not ask me for confirmation just conduct the searches yourself until you find the needed information. Remember to use your tools if applicable.".to_string()
).await;
// Write response to file and get the text content
let response_text =
match write_response_to_file(&response, work_dir, "restaurant_research_output.txt") {
Ok(text) => text,
Err(e) => {
println!("Warning: Failed to write restaurant research output: {}", e);
// If file write fails, still continue with the evaluation
response
.last()
.map_or_else(String::new, |msg| msg.as_concat_text())
}
};
// Convert HashMap to Vec for our metrics
let mut metrics = metrics_hashmap_to_vec(perf_metrics);
// Check markdown formatting
let has_markdown_bullets = self.check_markdown_bullets(&response_text);
let bullet_count = self.count_bullet_points(&response_text);
metrics.push((
"valid_markdown_format".to_string(),
EvaluationMetric::Boolean(has_markdown_bullets),
));
metrics.push((
"bullet_point_count".to_string(),
EvaluationMetric::Integer(bullet_count),
));
// Check if the fetch tool was used
let used_fetch_tool = crate::eval_suites::used_tool(&response, "fetch");
metrics.push((
"used_fetch_tool".to_string(),
EvaluationMetric::Boolean(used_fetch_tool),
));
// Copy the session file to the current working directory
if let Err(e) = copy_session_to_cwd() {
println!("Warning: Failed to copy session file: {}", e);
} else {
println!("Successfully copied session file to current directory");
}
Ok(metrics)
}
fn name(&self) -> &str {
"restaurant_research"
}
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: vec!["uvx mcp-server-fetch".to_string()],
}
}
}
register_evaluation!("vibes", RestaurantResearch);


@@ -0,0 +1,177 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, BenchAgent, Evaluation,
EvaluationMetric, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
use goose::message::MessageContent;
use mcp_core::role::Role;
use serde_json::{self, Value};
pub struct SquirrelCensus {}
impl SquirrelCensus {
pub fn new() -> Self {
SquirrelCensus {}
}
fn check_analysis_results(&self, text: &str) -> (bool, bool, bool) {
let text_lower = text.to_lowercase();
let has_central_manhattan =
text_lower.contains("central manhattan") && text.contains("174");
let has_tompkins = text_lower.contains("tompkins square park") && text.contains("59");
let has_gray = text_lower.contains("gray") || text_lower.contains("grey");
(has_central_manhattan, has_tompkins, has_gray)
}
}
#[async_trait]
impl Evaluation for SquirrelCensus {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
println!("SquirrelCensus - run");
// Get the path to the squirrel data file
let squirrel_data_path = match work_dir.fs_get("./assets/squirrel-data.csv".to_string()) {
Ok(file) => file,
Err(_) => return Err(anyhow::anyhow!("Could not find squirrel-data.csv file")),
};
println!("squirrel_data_path: {:?}", squirrel_data_path);
// Collect baseline metrics (execution time, token usage, tool calls)
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
format!(
"Create a Python script called analyze_squirrels.py that analyzes the CSV file at {}. Do not ask for any clarification or further instructions - proceed with the implementation as specified below.
The script should use pandas to answer these specific questions:
1. Which area (Area column) has the most squirrels spotted? For this area, what is the most common Primary Fur Color of squirrels?
2. Which specific park location (Park Name column) has the most squirrels spotted? For this location, what is the most common Primary Fur Color of squirrels?
The script should:
- Use pandas to read and analyze the data
- Print results in EXACTLY this format (including the markers):
[AREA_RESULT] <area_name> - <count> squirrels spotted
[AREA_COLOR] Most common fur color: <color> (<color_count> squirrels)
[PARK_RESULT] <park_name> - <count> squirrels spotted
[PARK_COLOR] Most common fur color: <color> (<color_count> squirrels)
After writing the script, run it using python3 and show the results. Do not ask for confirmation or further instructions. Remember to use your tools if applicable.",
squirrel_data_path.display()
)
).await;
// Convert HashMap to Vec for our metrics
let mut metrics = metrics_hashmap_to_vec(perf_metrics);
// Check if agent wrote the Python script
let wrote_script = messages.iter().any(|msg| {
msg.role == Role::Assistant
&& msg.content.iter().any(|content| {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
if tool_call.name != "developer__text_editor" {
return false;
}
if let Ok(args) =
serde_json::from_value::<Value>(tool_call.arguments.clone())
{
args.get("command").and_then(Value::as_str) == Some("write")
&& args
.get("path")
.and_then(Value::as_str)
.is_some_and(|s| s.contains("analyze_squirrels.py"))
} else {
false
}
} else {
false
}
} else {
false
}
})
});
// Check if agent ran the script
let ran_script = messages.iter().any(|msg| {
msg.role == Role::Assistant
&& msg.content.iter().any(|content| {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
if tool_call.name != "developer__shell" {
return false;
}
if let Ok(args) =
serde_json::from_value::<Value>(tool_call.arguments.clone())
{
args.get("command")
.and_then(Value::as_str)
.is_some_and(|s| {
s.contains("python") && s.contains("analyze_squirrels.py")
})
} else {
false
}
} else {
false
}
} else {
false
}
})
});
// Check the last message for correct results
let correct_results = if let Some(last_msg) = messages.last() {
let text_content = last_msg.as_concat_text();
let (has_central_manhattan, has_tompkins, has_gray) =
self.check_analysis_results(&text_content);
has_central_manhattan && has_tompkins && has_gray
} else {
false
};
metrics.push((
"wrote_script".to_string(),
EvaluationMetric::Boolean(wrote_script),
));
metrics.push((
"ran_script".to_string(),
EvaluationMetric::Boolean(ran_script),
));
metrics.push((
"correct_results".to_string(),
EvaluationMetric::Boolean(correct_results),
));
// Copy the session file to the current working directory
if let Err(e) = copy_session_to_cwd() {
println!("Warning: Failed to copy session file: {}", e);
} else {
println!("Successfully copied session file to current directory");
}
Ok(metrics)
}
fn name(&self) -> &str {
"squirrel_census"
}
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: Vec::new(),
}
}
}
register_evaluation!("vibes", SquirrelCensus);


@@ -56,6 +56,20 @@ impl BenchAgent for BenchSession {
let errors = self.errors.lock().await;
errors.clone()
}
async fn get_token_usage(&self) -> Option<i32> {
// Get token usage from the provider
if let Ok(usage) = self.session.get_usage().await {
// Sum up total tokens across all usage entries
let total_tokens = usage
.iter()
.map(|u| u.usage.total_tokens.unwrap_or(0))
.sum();
Some(total_tokens)
} else {
None
}
}
}
// Wrapper struct to implement BenchAgent for Arc<Mutex<BenchSession>>
@@ -72,6 +86,11 @@ impl BenchAgent for BenchAgentWrapper
let session = self.0.lock().await;
session.get_errors().await
}
async fn get_token_usage(&self) -> Option<i32> {
let session = self.0.lock().await;
session.get_token_usage().await
}
}
async fn run_eval(


@@ -16,6 +16,7 @@ use goose::agents::extension::{Envs, ExtensionConfig};
use goose::agents::{Agent, SessionConfig};
use goose::config::Config;
use goose::message::{Message, MessageContent};
use goose::providers::base::ProviderUsage;
use goose::session;
use mcp_core::handler::ToolError;
use mcp_core::prompt::PromptMessage;
@@ -643,4 +644,9 @@ impl Session {
pub fn message_history(&self) -> Vec<Message> {
self.messages.clone()
}
/// Get the token usage from the agent
pub async fn get_usage(&self) -> Result<Vec<ProviderUsage>> {
Ok(self.agent.usage().await)
}
}


@@ -12,6 +12,8 @@ function show_usage() {
echo " -s, --suites Comma-separated list of benchmark suites to run (e.g., 'core,small_models')" echo " -s, --suites Comma-separated list of benchmark suites to run (e.g., 'core,small_models')"
echo " -o, --output-dir Directory to store benchmark results (default: './benchmark-results')" echo " -o, --output-dir Directory to store benchmark results (default: './benchmark-results')"
echo " -d, --debug Use debug build instead of release build" echo " -d, --debug Use debug build instead of release build"
echo " -t, --toolshim Enable toolshim mode by setting GOOSE_TOOLSHIM=1"
echo " -m, --toolshim-model Set the toolshim model (sets GOOSE_TOOLSHIM_MODEL)"
echo " -h, --help Show this help message" echo " -h, --help Show this help message"
echo "" echo ""
echo "Example:" echo "Example:"
@@ -23,6 +25,8 @@ PROVIDER_MODELS=""
SUITES="" SUITES=""
OUTPUT_DIR="./benchmark-results" OUTPUT_DIR="./benchmark-results"
DEBUG_MODE=false DEBUG_MODE=false
TOOLSHIM=false
TOOLSHIM_MODEL=""
while [[ $# -gt 0 ]]; do
case "$1" in
@@ -42,6 +46,14 @@ while [[ $# -gt 0 ]]; do
DEBUG_MODE=true
shift
;;
-t|--toolshim)
TOOLSHIM=true
shift
;;
-m|--toolshim-model)
TOOLSHIM_MODEL="$2"
shift 2
;;
-h|--help)
show_usage
exit 0
@@ -80,6 +92,12 @@ if [ "$DEBUG_MODE" = true ]; then
else
echo "Mode: Release" >> "$SUMMARY_FILE"
fi
if [ "$TOOLSHIM" = true ]; then
echo "Toolshim: Enabled" >> "$SUMMARY_FILE"
if [[ -n "$TOOLSHIM_MODEL" ]]; then
echo "Toolshim Model: $TOOLSHIM_MODEL" >> "$SUMMARY_FILE"
fi
fi
echo "" >> "$SUMMARY_FILE" echo "" >> "$SUMMARY_FILE"
# Determine which binary to use # Determine which binary to use
@@ -140,6 +158,14 @@ for ((i=0; i<$COUNT; i++)); do
export GOOSE_PROVIDER="$provider"
export GOOSE_MODEL="$model"
# Set toolshim environment variables if enabled
if [ "$TOOLSHIM" = true ]; then
export GOOSE_TOOLSHIM=1
if [[ -n "$TOOLSHIM_MODEL" ]]; then
export GOOSE_TOOLSHIM_MODEL="$TOOLSHIM_MODEL"
fi
fi
# Run the benchmark and save results to JSON
echo "Running benchmark for $provider/$model with suites: $SUITES"
OUTPUT_FILE="$OUTPUT_DIR/${provider}-${model}.json"
@@ -174,6 +200,7 @@ for ((i=0; i<$COUNT; i++)); do
TOTAL_METRICS=0
FAILED_METRICS=0
PASSED_METRICS=0
OTHER_METRICS=0
TOTAL_ERRORS=0
# Process each suite
@@ -194,14 +221,28 @@ for ((i=0; i<$COUNT; i++)); do
ERROR_COUNT=$(jq ".suites[$j].evaluations[$k].errors | length" "$OUTPUT_FILE")
TOTAL_ERRORS=$((TOTAL_ERRORS + ERROR_COUNT))
# Count boolean metrics (passed and failed)
BOOLEAN_COUNT=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
select(.[1].Boolean != null) | .[0]" "$OUTPUT_FILE" | wc -l | tr -d ' ')
# Count failed boolean metrics
FAILURES=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
select(
.[1].Boolean == false or .[1].Boolean == \"false\" or .[1].Boolean == 0 or .[1].Boolean == \"0\" .[1].Boolean == false or .[1].Boolean == \"false\" or .[1].Boolean == 0 or .[1].Boolean == \"0\"
) | .[0]" "$OUTPUT_FILE" | wc -l | tr -d ' ')
# Count passed boolean metrics
PASSES=$((BOOLEAN_COUNT - FAILURES))
# Count non-boolean metrics
NON_BOOLEAN=$((METRIC_COUNT - BOOLEAN_COUNT))
# Update global counters
FAILED_METRICS=$((FAILED_METRICS + FAILURES))
PASSED_METRICS=$((PASSED_METRICS + PASSES))
OTHER_METRICS=$((OTHER_METRICS + NON_BOOLEAN))
if [ "$FAILURES" -gt 0 ] || [ "$ERROR_COUNT" -gt 0 ]; then if [ "$FAILURES" -gt 0 ] || [ "$ERROR_COUNT" -gt 0 ]; then
FAILED_METRICS=$((FAILED_METRICS + FAILURES))
echo "$EVAL_NAME:" >> "$ANALYSIS_FILE" echo "$EVAL_NAME:" >> "$ANALYSIS_FILE"
if [ "$FAILURES" -gt 0 ]; then if [ "$FAILURES" -gt 0 ]; then
@@ -221,7 +262,7 @@ for ((i=0; i<$COUNT; i++)); do
jq -r ".suites[$j].evaluations[$k].errors[] | \" [\(.level)] \(.message)\"" "$OUTPUT_FILE" >> "$ANALYSIS_FILE" jq -r ".suites[$j].evaluations[$k].errors[] | \" [\(.level)] \(.message)\"" "$OUTPUT_FILE" >> "$ANALYSIS_FILE"
fi fi
else else
# This line is no longer needed since we count passes/fails/others individually
echo "$EVAL_NAME: All metrics passed, no errors" >> "$ANALYSIS_FILE" echo "$EVAL_NAME: All metrics passed, no errors" >> "$ANALYSIS_FILE"
fi
done
@@ -235,8 +276,15 @@ for ((i=0; i<$COUNT; i++)); do
echo "Total Metrics: $TOTAL_METRICS" >> "$ANALYSIS_FILE" echo "Total Metrics: $TOTAL_METRICS" >> "$ANALYSIS_FILE"
echo "Passed Metrics: $PASSED_METRICS" >> "$ANALYSIS_FILE" echo "Passed Metrics: $PASSED_METRICS" >> "$ANALYSIS_FILE"
echo "Failed Metrics: $FAILED_METRICS" >> "$ANALYSIS_FILE" echo "Failed Metrics: $FAILED_METRICS" >> "$ANALYSIS_FILE"
echo "Other Metrics: $OTHER_METRICS" >> "$ANALYSIS_FILE"
echo "Total Errors: $TOTAL_ERRORS" >> "$ANALYSIS_FILE" echo "Total Errors: $TOTAL_ERRORS" >> "$ANALYSIS_FILE"
# Verification of metrics counting
COUNTED_METRICS=$((PASSED_METRICS + FAILED_METRICS + OTHER_METRICS))
if [ "$COUNTED_METRICS" -ne "$TOTAL_METRICS" ]; then
echo "⚠️ Metrics counting discrepancy: $COUNTED_METRICS counted vs $TOTAL_METRICS total" >> "$ANALYSIS_FILE"
fi
# Determine success/failure
if [ "$FAILED_METRICS" -gt 0 ] || [ "$TOTAL_ERRORS" -gt 0 ]; then
if [ "$FAILED_METRICS" -gt 0 ]; then