feat: add additional goosebench evals (#1571)

Co-authored-by: Alice Hau <alice.a.hau@gmail.com>
Author: Alice Hau
Date: 2025-03-10 15:11:44 -04:00
Committed by: GitHub
Parent: 8689d24407
Commit: bb4feacf03
14 changed files with 859 additions and 3 deletions

File diff suppressed because one or more lines are too long


@@ -35,6 +35,9 @@ pub trait BenchAgent: Send + Sync {
// Make get_errors async
async fn get_errors(&self) -> Vec<BenchAgentError>;
// Get token usage information
async fn get_token_usage(&self) -> Option<i32>;
}
#[async_trait]
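As a rough illustration only (not part of this commit), a caller of the extended trait might combine the new method with the existing error query; the report_run helper below is hypothetical:

// Hypothetical helper, not from this diff: summarize a finished agent run.
async fn report_run(agent: &dyn BenchAgent) {
    let errors = agent.get_errors().await;
    // get_token_usage returns None when the agent cannot report usage.
    match agent.get_token_usage().await {
        Some(tokens) => println!("run used {} tokens ({} errors)", tokens, errors.len()),
        None => println!("token usage unavailable ({} errors)", errors.len()),
    }
}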


@@ -0,0 +1,105 @@
use crate::eval_suites::{BenchAgent, EvaluationMetric};
use goose::message::{Message, MessageContent};
use std::collections::HashMap;
use std::time::Instant;
/// Collect baseline metrics including execution time, tool usage, and token count
pub async fn collect_baseline_metrics(
agent: &mut Box<dyn BenchAgent>,
prompt: String,
) -> (Vec<Message>, HashMap<String, EvaluationMetric>) {
// Initialize metrics map
let mut metrics = HashMap::new();
// Start timer
let start_time = Instant::now();
// Execute prompt
let messages = match agent.prompt(prompt).await {
Ok(msgs) => msgs,
Err(e) => {
metrics.insert(
"prompt_error".to_string(),
EvaluationMetric::String(format!("Error: {}", e)),
);
Vec::new()
}
};
// Calculate execution time
let execution_time = start_time.elapsed();
metrics.insert(
"prompt_execution_time_seconds".to_string(),
EvaluationMetric::Float(execution_time.as_secs_f64()),
);
// Count tool calls
let (total_tool_calls, tool_calls_by_name) = count_tool_calls(&messages);
metrics.insert(
"total_tool_calls".to_string(),
EvaluationMetric::Integer(total_tool_calls),
);
// Add tool calls by name metrics
for (tool_name, count) in tool_calls_by_name {
metrics.insert(
format!("tool_calls_{}", tool_name),
EvaluationMetric::Integer(count),
);
}
// Get token usage information if available
if let Some(token_count) = agent.get_token_usage().await {
metrics.insert(
"total_tokens".to_string(),
EvaluationMetric::Integer(token_count as i64),
);
}
(messages, metrics)
}
/// Count all tool calls in messages and categorize by tool name
fn count_tool_calls(messages: &[Message]) -> (i64, HashMap<String, i64>) {
let mut total_count = 0;
let mut counts_by_name = HashMap::new();
for message in messages {
for content in &message.content {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
total_count += 1;
// Count by name
*counts_by_name.entry(tool_call.name.clone()).or_insert(0) += 1;
}
}
}
}
(total_count, counts_by_name)
}
/// Convert HashMap of metrics to Vec
pub fn metrics_hashmap_to_vec(
metrics: HashMap<String, EvaluationMetric>,
) -> Vec<(String, EvaluationMetric)> {
metrics.into_iter().collect()
}
/// Check if a specific tool was used in any of the messages
pub fn used_tool(messages: &[Message], tool_name: &str) -> bool {
messages.iter().any(|msg| {
msg.content.iter().any(|content| {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
tool_call.name.contains(tool_name)
} else {
false
}
} else {
false
}
})
})
}
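As a rough sketch (not part of this commit) of how the evaluations below are expected to combine these helpers; the prompt text and metric key are placeholders:

// Illustrative only: typical use of the helpers above inside an Evaluation::run body.
async fn example_usage(mut agent: Box<dyn BenchAgent>) -> Vec<(String, EvaluationMetric)> {
    let (messages, perf) = collect_baseline_metrics(&mut agent, "placeholder prompt".to_string()).await;
    let mut metrics = metrics_hashmap_to_vec(perf);
    // used_tool matches on a substring, so "text_editor" also matches "developer__text_editor".
    metrics.push((
        "used_text_editor".to_string(),
        EvaluationMetric::Boolean(used_tool(&messages, "text_editor")),
    ));
    metrics
}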


@@ -1,6 +1,11 @@
mod core;
mod evaluation;
mod factory;
mod metrics;
mod utils;
mod vibes;
pub use evaluation::*;
pub use factory::{register_evaluation, EvaluationSuiteFactory};
pub use metrics::*;
pub use utils::*;


@@ -0,0 +1,69 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use anyhow::{Context, Result};
use goose::message::Message;
use goose::session::storage;
use std::fs::{self, File};
use std::io::Write;
use std::path::PathBuf;
/// Write the last agent message to a file
/// Returns the text content of the message, or an error if writing failed
pub fn write_response_to_file(
messages: &[Message],
_work_dir: &mut BenchmarkWorkDir, // Kept for API compatibility
filename: &str,
) -> Result<String> {
let last_msg = messages
.last()
.ok_or_else(|| anyhow::anyhow!("No messages to write to file"))?;
let text_content = last_msg.as_concat_text();
// Create a file in the current directory
let output_path = PathBuf::from(filename);
// Create and write to the file
let mut file = File::create(&output_path)
.with_context(|| format!("Failed to create file at {}", output_path.display()))?;
file.write_all(text_content.as_bytes())
.with_context(|| format!("Failed to write content to {}", output_path.display()))?;
Ok(text_content)
}
/// Copy the most recent session file to the current working directory
///
/// This function finds the most recent Goose session file (.jsonl) and copies it
/// to the current working directory. Session files are stored by the Goose framework
/// in a platform-specific data directory.
///
/// # Returns
/// - Ok(session_path) if successfully copied, where session_path is the path to the copied file
/// - Err if any errors occurred during the process
pub fn copy_session_to_cwd() -> Result<PathBuf> {
// Try to get the most recent session file
let src_path = storage::get_most_recent_session()
.with_context(|| "Failed to find any recent session files")?;
// Extract the filename from the path
let filename = src_path
.file_name()
.ok_or_else(|| anyhow::anyhow!("Invalid session filename"))?;
// Create the destination path in the current directory
let dest_path = PathBuf::from(".").join(filename);
// Copy the file
fs::copy(&src_path, &dest_path).with_context(|| {
format!(
"Failed to copy from '{}' to '{}'",
src_path.display(),
dest_path.display()
)
})?;
println!("Session file copied to: {}", dest_path.display());
Ok(dest_path)
}
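As a condensed, hypothetical sketch (not part of this commit) of how the vibes evaluations below use these two helpers defensively; the function name and output filename are placeholders:

// Illustrative only: best-effort bookkeeping at the end of an evaluation run.
fn finish_eval(messages: &[Message], work_dir: &mut BenchmarkWorkDir) -> String {
    let text = match write_response_to_file(messages, work_dir, "example_output.txt") {
        Ok(text) => text,
        Err(e) => {
            // Fall back to the last message so scoring can continue.
            println!("Warning: failed to write output: {}", e);
            messages.last().map_or_else(String::new, |m| m.as_concat_text())
        }
    };
    if let Err(e) = copy_session_to_cwd() {
        println!("Warning: failed to copy session file: {}", e);
    }
    text
}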


@@ -0,0 +1,89 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, write_response_to_file,
BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
pub struct BlogSummary {}
impl BlogSummary {
pub fn new() -> Self {
BlogSummary {}
}
fn check_markdown_numbered_list(&self, text: &str) -> bool {
// Check if all numbers 1-5 exist in markdown numbered list format
(1..=5).all(|n| text.contains(&format!("{}.", n)))
}
}
#[async_trait]
impl Evaluation for BlogSummary {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
println!("BlogSummary - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (response, perf_metrics) = collect_baseline_metrics(
&mut agent,
"What are the top 5 most counterintuitive insights from this blog post? Format your response in Markdown with 5 numbered points (1. 2. 3. 4. 5.) https://huyenchip.com/2025/01/07/agents.html".to_string()
).await;
// Write response to file and get the text content
let response_text =
match write_response_to_file(&response, work_dir, "blog_summary_output.txt") {
Ok(text) => text,
Err(e) => {
println!("Warning: Failed to write blog summary output: {}", e);
// If file write fails, still continue with the evaluation
response
.last()
.map_or_else(String::new, |msg| msg.as_concat_text())
}
};
// Convert HashMap to Vec for our metrics
let mut metrics = metrics_hashmap_to_vec(perf_metrics);
// Check if the content follows the markdown numbered list format
let has_markdown_list = self.check_markdown_numbered_list(&response_text);
metrics.push((
"valid_markdown_format".to_string(),
EvaluationMetric::Boolean(has_markdown_list),
));
// Check if the fetch tool was used
let used_fetch_tool = crate::eval_suites::used_tool(&response, "fetch");
metrics.push((
"used_fetch_tool".to_string(),
EvaluationMetric::Boolean(used_fetch_tool),
));
// Copy the session file to the current working directory
if let Err(e) = copy_session_to_cwd() {
println!("Warning: Failed to copy session file: {}", e);
} else {
println!("Successfully copied session file to current directory");
}
Ok(metrics)
}
fn name(&self) -> &str {
"blog_summary"
}
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: vec!["uvx mcp-server-fetch".to_string()],
}
}
}
register_evaluation!("vibes", BlogSummary);


@@ -0,0 +1,121 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, BenchAgent, Evaluation,
EvaluationMetric, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
use goose::message::MessageContent;
use mcp_core::role::Role;
use serde_json::{self, Value};
use std::fs;
pub struct FlappyBird {}
impl FlappyBird {
pub fn new() -> Self {
FlappyBird {}
}
fn check_python_implementation(&self, content: &str) -> bool {
content.contains("import pygame") &&
content.contains("pygame.init()") &&
content.contains("while") && // Game loop
content.contains("pygame.event.get()") && // Event handling
content.contains("def main") && // Main function
content.contains("if __name__ == '__main__'") // Main guard
}
}
#[async_trait]
impl Evaluation for FlappyBird {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
println!("FlappyBird - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
"Create a Flappy Bird game in Python. Structure the code with a main function and use the if __name__ == '__main__': idiom. You must use pygame. The background color should be a light blue color. Pressing SPACE multiple times will accelerate the bird. The bird's shape should be a red circle. Place on the bottom some land colored as dark yellow chosen. Make a score shown on the top right side. Increment if you pass pipes and don't hit them. Make randomly spaced dark green pipes with enough space. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again. When trying to run the game, make sure to use pyenv and create the environment in the current working directory. The final game should be written to a file named flappy_bird.py. Remember to use your tools if applicable.".to_string()
).await;
// Convert HashMap to Vec for our metrics
let mut metrics = metrics_hashmap_to_vec(perf_metrics);
// Check if the agent used the text editor tool correctly
let valid_tool_call = messages.iter().any(|msg| {
msg.role == Role::Assistant
&& msg.content.iter().any(|content| {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
// Check tool name and basic parameters
if tool_call.name != "developer__text_editor" {
return false;
}
// Parse the arguments as JSON
if let Ok(args) =
serde_json::from_value::<Value>(tool_call.arguments.clone())
{
// Only check command is write and correct filename
args.get("command").and_then(Value::as_str) == Some("write")
&& args
.get("path")
.and_then(Value::as_str)
.is_some_and(|s| s.contains("flappy_bird.py"))
} else {
false
}
} else {
false
}
} else {
false
}
})
});
metrics.push((
"used_write_tool".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
));
// If tool was used correctly, check the actual file content
if valid_tool_call {
if let Ok(file_path) = work_dir.fs_get("flappy_bird.py".to_string()) {
if let Ok(content) = fs::read_to_string(file_path) {
let valid_implementation = self.check_python_implementation(&content);
metrics.push((
"valid_implementation".to_string(),
EvaluationMetric::Boolean(valid_implementation),
));
}
}
}
// Copy the session file to the current working directory
if let Err(e) = copy_session_to_cwd() {
println!("Warning: Failed to copy session file: {}", e);
} else {
println!("Successfully copied session file to current directory");
}
Ok(metrics)
}
fn name(&self) -> &str {
"flappy_bird"
}
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: Vec::new(),
}
}
}
register_evaluation!("vibes", FlappyBird);


@@ -0,0 +1,99 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, BenchAgent, Evaluation,
EvaluationMetric, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
use goose::message::MessageContent;
use mcp_core::role::Role;
use serde_json::{self, Value};
pub struct GooseWiki {}
impl GooseWiki {
pub fn new() -> Self {
GooseWiki {}
}
}
#[async_trait]
impl Evaluation for GooseWiki {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
println!("GooseWiki - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
"Create a Wikipedia-style web page about Goose (Block's AI agent) in a new index.html file. The page should be a complete, well-structured HTML document with proper head and body sections. Use heading tags (h1, h2, h3) to organize the content into clear sections. Include comprehensive information about Goose organized in a way similar to how Wikipedia presents technical topics. Remember to use your tools if applicable.".to_string()
).await;
// Convert HashMap to Vec for our metrics
let mut metrics = metrics_hashmap_to_vec(perf_metrics);
// Check if the agent used the text editor tool to create index.html
let valid_tool_call = messages.iter().any(|msg| {
msg.role == Role::Assistant &&
msg.content.iter().any(|content| {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
// Check tool name is correct
if tool_call.name != "developer__text_editor" {
return false;
}
// Parse the arguments as JSON
if let Ok(args) = serde_json::from_value::<Value>(tool_call.arguments.clone()) {
// Check command is write and path contains index.html
args.get("command").and_then(Value::as_str) == Some("write") &&
args.get("path").and_then(Value::as_str).is_some_and(|s| s.contains("index.html")) &&
// Verify file_text contains basic HTML structure
args.get("file_text").and_then(Value::as_str).is_some_and(|s| {
s.contains("<html") && s.contains("</html>") &&
s.contains("<head") && s.contains("</head>") &&
s.contains("<body") && s.contains("</body>")
})
} else {
false
}
} else {
false
}
} else {
false
}
})
});
metrics.push((
"created_valid_html".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
));
// Copy the session file to the current working directory
if let Err(e) = copy_session_to_cwd() {
println!("Warning: Failed to copy session file: {}", e);
} else {
println!("Successfully copied session file to current directory");
}
Ok(metrics)
}
fn name(&self) -> &str {
"goose_wiki"
}
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: Vec::new(),
}
}
}
register_evaluation!("vibes", GooseWiki);


@@ -0,0 +1,5 @@
mod blog_summary;
mod flappy_bird;
mod goose_wiki;
mod restaurant_research;
mod squirrel_census;


@@ -0,0 +1,109 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, write_response_to_file,
BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
pub struct RestaurantResearch {}
impl RestaurantResearch {
pub fn new() -> Self {
RestaurantResearch {}
}
fn check_markdown_bullets(&self, text: &str) -> bool {
// Check if there's at least one bullet point and proper markdown formatting
text.contains("- ") || text.contains("* ")
}
fn count_bullet_points(&self, text: &str) -> i64 {
// Count total bullet points (either - or * style)
let dash_bullets = text.matches("- ").count();
let star_bullets = text.matches("* ").count();
(dash_bullets + star_bullets) as i64
}
}
#[async_trait]
impl Evaluation for RestaurantResearch {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
println!("RestaurantResearch - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (response, perf_metrics) = collect_baseline_metrics(
&mut agent,
"Search the internet for and provide a current, detailed list of the best Sichuanese restaurants specifically in the East Village neighborhood of NYC. Format your response in Markdown using bullet points (either - or *) for each restaurant. For each restaurant include:
- Restaurant name and what they're known for
- Signature dishes
- Atmosphere/setting
- Any relevant details about reservations or dining experience
- What distinguishes them from others
Present the information in order of significance or quality. Focus specifically on Sichuanese establishments, not general Chinese restaurants. If you encounter a page you cannot access, try another one. Do not ask me for confirmation just conduct the searches yourself until you find the needed information. Remember to use your tools if applicable.".to_string()
).await;
// Write response to file and get the text content
let response_text =
match write_response_to_file(&response, work_dir, "restaurant_research_output.txt") {
Ok(text) => text,
Err(e) => {
println!("Warning: Failed to write restaurant research output: {}", e);
// If file write fails, still continue with the evaluation
response
.last()
.map_or_else(String::new, |msg| msg.as_concat_text())
}
};
// Convert HashMap to Vec for our metrics
let mut metrics = metrics_hashmap_to_vec(perf_metrics);
// Check markdown formatting
let has_markdown_bullets = self.check_markdown_bullets(&response_text);
let bullet_count = self.count_bullet_points(&response_text);
metrics.push((
"valid_markdown_format".to_string(),
EvaluationMetric::Boolean(has_markdown_bullets),
));
metrics.push((
"bullet_point_count".to_string(),
EvaluationMetric::Integer(bullet_count),
));
// Check if the fetch tool was used
let used_fetch_tool = crate::eval_suites::used_tool(&response, "fetch");
metrics.push((
"used_fetch_tool".to_string(),
EvaluationMetric::Boolean(used_fetch_tool),
));
// Copy the session file to the current working directory
if let Err(e) = copy_session_to_cwd() {
println!("Warning: Failed to copy session file: {}", e);
} else {
println!("Successfully copied session file to current directory");
}
Ok(metrics)
}
fn name(&self) -> &str {
"restaurant_research"
}
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: vec!["uvx mcp-server-fetch".to_string()],
}
}
}
register_evaluation!("vibes", RestaurantResearch);


@@ -0,0 +1,177 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, BenchAgent, Evaluation,
EvaluationMetric, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
use goose::message::MessageContent;
use mcp_core::role::Role;
use serde_json::{self, Value};
pub struct SquirrelCensus {}
impl SquirrelCensus {
pub fn new() -> Self {
SquirrelCensus {}
}
fn check_analysis_results(&self, text: &str) -> (bool, bool, bool) {
let text_lower = text.to_lowercase();
let has_central_manhattan =
text_lower.contains("central manhattan") && text.contains("174");
let has_tompkins = text_lower.contains("tompkins square park") && text.contains("59");
let has_gray = text_lower.contains("gray") || text_lower.contains("grey");
(has_central_manhattan, has_tompkins, has_gray)
}
}
#[async_trait]
impl Evaluation for SquirrelCensus {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
println!("SquirrelCensus - run");
// Get the path to the squirrel data file
let squirrel_data_path = match work_dir.fs_get("./assets/squirrel-data.csv".to_string()) {
Ok(file) => file,
Err(_) => return Err(anyhow::anyhow!("Could not find squirrel-data.csv file")),
};
println!("squirrel_data_path: {:?}", squirrel_data_path);
// Collect baseline metrics (execution time, token usage, tool calls)
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
format!(
"Create a Python script called analyze_squirrels.py that analyzes the CSV file at {}. Do not ask for any clarification or further instructions - proceed with the implementation as specified below.
The script should use pandas to answer these specific questions:
1. Which area (Area column) has the most squirrels spotted? For this area, what is the most common Primary Fur Color of squirrels?
2. Which specific park location (Park Name column) has the most squirrels spotted? For this location, what is the most common Primary Fur Color of squirrels?
The script should:
- Use pandas to read and analyze the data
- Print results in EXACTLY this format (including the markers):
[AREA_RESULT] <area_name> - <count> squirrels spotted
[AREA_COLOR] Most common fur color: <color> (<color_count> squirrels)
[PARK_RESULT] <park_name> - <count> squirrels spotted
[PARK_COLOR] Most common fur color: <color> (<color_count> squirrels)
After writing the script, run it using python3 and show the results. Do not ask for confirmation or further instructions. Remember to use your tools if applicable.",
squirrel_data_path.display()
)
).await;
// Convert HashMap to Vec for our metrics
let mut metrics = metrics_hashmap_to_vec(perf_metrics);
// Check if agent wrote the Python script
let wrote_script = messages.iter().any(|msg| {
msg.role == Role::Assistant
&& msg.content.iter().any(|content| {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
if tool_call.name != "developer__text_editor" {
return false;
}
if let Ok(args) =
serde_json::from_value::<Value>(tool_call.arguments.clone())
{
args.get("command").and_then(Value::as_str) == Some("write")
&& args
.get("path")
.and_then(Value::as_str)
.is_some_and(|s| s.contains("analyze_squirrels.py"))
} else {
false
}
} else {
false
}
} else {
false
}
})
});
// Check if agent ran the script
let ran_script = messages.iter().any(|msg| {
msg.role == Role::Assistant
&& msg.content.iter().any(|content| {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
if tool_call.name != "developer__shell" {
return false;
}
if let Ok(args) =
serde_json::from_value::<Value>(tool_call.arguments.clone())
{
args.get("command")
.and_then(Value::as_str)
.is_some_and(|s| {
s.contains("python") && s.contains("analyze_squirrels.py")
})
} else {
false
}
} else {
false
}
} else {
false
}
})
});
// Check the last message for correct results
let correct_results = if let Some(last_msg) = messages.last() {
let text_content = last_msg.as_concat_text();
let (has_central_manhattan, has_tompkins, has_gray) =
self.check_analysis_results(&text_content);
has_central_manhattan && has_tompkins && has_gray
} else {
false
};
metrics.push((
"wrote_script".to_string(),
EvaluationMetric::Boolean(wrote_script),
));
metrics.push((
"ran_script".to_string(),
EvaluationMetric::Boolean(ran_script),
));
metrics.push((
"correct_results".to_string(),
EvaluationMetric::Boolean(correct_results),
));
// Copy the session file to the current working directory
if let Err(e) = copy_session_to_cwd() {
println!("Warning: Failed to copy session file: {}", e);
} else {
println!("Successfully copied session file to current directory");
}
Ok(metrics)
}
fn name(&self) -> &str {
"squirrel_census"
}
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: Vec::new(),
}
}
}
register_evaluation!("vibes", SquirrelCensus);


@@ -56,6 +56,20 @@ impl BenchAgent for BenchSession {
let errors = self.errors.lock().await;
errors.clone()
}
async fn get_token_usage(&self) -> Option<i32> {
// Get token usage from the provider
if let Ok(usage) = self.session.get_usage().await {
// Sum up total tokens across all usage entries
let total_tokens = usage
.iter()
.map(|u| u.usage.total_tokens.unwrap_or(0))
.sum();
Some(total_tokens)
} else {
None
}
}
}
// Wrapper struct to implement BenchAgent for Arc<Mutex<BenchSession>>
@@ -72,6 +86,11 @@ impl BenchAgent for BenchAgentWrapper
let session = self.0.lock().await;
session.get_errors().await
}
async fn get_token_usage(&self) -> Option<i32> {
let session = self.0.lock().await;
session.get_token_usage().await
}
}
async fn run_eval(


@@ -16,6 +16,7 @@ use goose::agents::extension::{Envs, ExtensionConfig};
use goose::agents::{Agent, SessionConfig};
use goose::config::Config;
use goose::message::{Message, MessageContent};
use goose::providers::base::ProviderUsage;
use goose::session;
use mcp_core::handler::ToolError;
use mcp_core::prompt::PromptMessage;
@@ -643,4 +644,9 @@ impl Session {
pub fn message_history(&self) -> Vec<Message> {
self.messages.clone()
}
/// Get the token usage from the agent
pub async fn get_usage(&self) -> Result<Vec<ProviderUsage>> {
Ok(self.agent.usage().await)
}
}


@@ -12,6 +12,8 @@ function show_usage() {
echo " -s, --suites Comma-separated list of benchmark suites to run (e.g., 'core,small_models')" echo " -s, --suites Comma-separated list of benchmark suites to run (e.g., 'core,small_models')"
echo " -o, --output-dir Directory to store benchmark results (default: './benchmark-results')" echo " -o, --output-dir Directory to store benchmark results (default: './benchmark-results')"
echo " -d, --debug Use debug build instead of release build" echo " -d, --debug Use debug build instead of release build"
echo " -t, --toolshim Enable toolshim mode by setting GOOSE_TOOLSHIM=1"
echo " -m, --toolshim-model Set the toolshim model (sets GOOSE_TOOLSHIM_MODEL)"
echo " -h, --help Show this help message" echo " -h, --help Show this help message"
echo "" echo ""
echo "Example:" echo "Example:"
@@ -23,6 +25,8 @@ PROVIDER_MODELS=""
SUITES="" SUITES=""
OUTPUT_DIR="./benchmark-results" OUTPUT_DIR="./benchmark-results"
DEBUG_MODE=false DEBUG_MODE=false
TOOLSHIM=false
TOOLSHIM_MODEL=""
while [[ $# -gt 0 ]]; do
case "$1" in
@@ -42,6 +46,14 @@ while [[ $# -gt 0 ]]; do
DEBUG_MODE=true
shift
;;
-t|--toolshim)
TOOLSHIM=true
shift
;;
-m|--toolshim-model)
TOOLSHIM_MODEL="$2"
shift 2
;;
-h|--help)
show_usage
exit 0
@@ -80,6 +92,12 @@ if [ "$DEBUG_MODE" = true ]; then
else
echo "Mode: Release" >> "$SUMMARY_FILE"
fi
if [ "$TOOLSHIM" = true ]; then
echo "Toolshim: Enabled" >> "$SUMMARY_FILE"
if [[ -n "$TOOLSHIM_MODEL" ]]; then
echo "Toolshim Model: $TOOLSHIM_MODEL" >> "$SUMMARY_FILE"
fi
fi
echo "" >> "$SUMMARY_FILE" echo "" >> "$SUMMARY_FILE"
# Determine which binary to use # Determine which binary to use
@@ -140,6 +158,14 @@ for ((i=0; i<$COUNT; i++)); do
export GOOSE_PROVIDER="$provider"
export GOOSE_MODEL="$model"
# Set toolshim environment variables if enabled
if [ "$TOOLSHIM" = true ]; then
export GOOSE_TOOLSHIM=1
if [[ -n "$TOOLSHIM_MODEL" ]]; then
export GOOSE_TOOLSHIM_MODEL="$TOOLSHIM_MODEL"
fi
fi
# Run the benchmark and save results to JSON
echo "Running benchmark for $provider/$model with suites: $SUITES"
OUTPUT_FILE="$OUTPUT_DIR/${provider}-${model}.json"
@@ -174,6 +200,7 @@ for ((i=0; i<$COUNT; i++)); do
TOTAL_METRICS=0
FAILED_METRICS=0
PASSED_METRICS=0
OTHER_METRICS=0
TOTAL_ERRORS=0
# Process each suite
@@ -194,14 +221,28 @@ for ((i=0; i<$COUNT; i++)); do
ERROR_COUNT=$(jq ".suites[$j].evaluations[$k].errors | length" "$OUTPUT_FILE")
TOTAL_ERRORS=$((TOTAL_ERRORS + ERROR_COUNT))
# Count boolean metrics (passed and failed)
BOOLEAN_COUNT=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
select(.[1].Boolean != null) | .[0]" "$OUTPUT_FILE" | wc -l | tr -d ' ')
# Count failed boolean metrics
FAILURES=$(jq -r ".suites[$j].evaluations[$k].metrics[] |
select(
.[1].Boolean == false or .[1].Boolean == \"false\" or .[1].Boolean == 0 or .[1].Boolean == \"0\" .[1].Boolean == false or .[1].Boolean == \"false\" or .[1].Boolean == 0 or .[1].Boolean == \"0\"
) | .[0]" "$OUTPUT_FILE" | wc -l | tr -d ' ')
# Count passed boolean metrics
PASSES=$((BOOLEAN_COUNT - FAILURES))
# Count non-boolean metrics
NON_BOOLEAN=$((METRIC_COUNT - BOOLEAN_COUNT))
# Update global counters
FAILED_METRICS=$((FAILED_METRICS + FAILURES))
PASSED_METRICS=$((PASSED_METRICS + PASSES))
OTHER_METRICS=$((OTHER_METRICS + NON_BOOLEAN))
if [ "$FAILURES" -gt 0 ] || [ "$ERROR_COUNT" -gt 0 ]; then if [ "$FAILURES" -gt 0 ] || [ "$ERROR_COUNT" -gt 0 ]; then
FAILED_METRICS=$((FAILED_METRICS + FAILURES))
echo "$EVAL_NAME:" >> "$ANALYSIS_FILE" echo "$EVAL_NAME:" >> "$ANALYSIS_FILE"
if [ "$FAILURES" -gt 0 ]; then if [ "$FAILURES" -gt 0 ]; then
@@ -221,7 +262,7 @@ for ((i=0; i<$COUNT; i++)); do
jq -r ".suites[$j].evaluations[$k].errors[] | \" [\(.level)] \(.message)\"" "$OUTPUT_FILE" >> "$ANALYSIS_FILE" jq -r ".suites[$j].evaluations[$k].errors[] | \" [\(.level)] \(.message)\"" "$OUTPUT_FILE" >> "$ANALYSIS_FILE"
fi fi
else else
# This line is no longer needed since we count passes/fails/others individually
echo "$EVAL_NAME: All metrics passed, no errors" >> "$ANALYSIS_FILE" echo "$EVAL_NAME: All metrics passed, no errors" >> "$ANALYSIS_FILE"
fi
done
@@ -235,8 +276,15 @@ for ((i=0; i<$COUNT; i++)); do
echo "Total Metrics: $TOTAL_METRICS" >> "$ANALYSIS_FILE" echo "Total Metrics: $TOTAL_METRICS" >> "$ANALYSIS_FILE"
echo "Passed Metrics: $PASSED_METRICS" >> "$ANALYSIS_FILE" echo "Passed Metrics: $PASSED_METRICS" >> "$ANALYSIS_FILE"
echo "Failed Metrics: $FAILED_METRICS" >> "$ANALYSIS_FILE" echo "Failed Metrics: $FAILED_METRICS" >> "$ANALYSIS_FILE"
echo "Other Metrics: $OTHER_METRICS" >> "$ANALYSIS_FILE"
echo "Total Errors: $TOTAL_ERRORS" >> "$ANALYSIS_FILE" echo "Total Errors: $TOTAL_ERRORS" >> "$ANALYSIS_FILE"
# Verification of metrics counting
COUNTED_METRICS=$((PASSED_METRICS + FAILED_METRICS + OTHER_METRICS))
if [ "$COUNTED_METRICS" -ne "$TOTAL_METRICS" ]; then
echo "⚠️ Metrics counting discrepancy: $COUNTED_METRICS counted vs $TOTAL_METRICS total" >> "$ANALYSIS_FILE"
fi
# Determine success/failure
if [ "$FAILED_METRICS" -gt 0 ] || [ "$TOTAL_ERRORS" -gt 0 ]; then
if [ "$FAILED_METRICS" -gt 0 ]; then