diff --git a/crates/goose-bench/src/eval_suites/core/developer/create_file.rs b/crates/goose-bench/src/eval_suites/core/developer/create_file.rs index b2784e2a..5cfe895d 100644 --- a/crates/goose-bench/src/eval_suites/core/developer/create_file.rs +++ b/crates/goose-bench/src/eval_suites/core/developer/create_file.rs @@ -110,6 +110,12 @@ impl Evaluation for DeveloperCreateFile { "Complete create and read".to_string(), EvalMetricValue::Boolean(write_tool_call && read_tool_call), )); + + metrics.push(( + "score".to_string(), + EvalMetricValue::Float(((write_tool_call as u8) + (read_tool_call as u8)) as f64 / 2.0), + )); + Ok(metrics) } diff --git a/crates/goose-bench/src/eval_suites/core/developer/list_files.rs b/crates/goose-bench/src/eval_suites/core/developer/list_files.rs index 4148738b..8881b690 100644 --- a/crates/goose-bench/src/eval_suites/core/developer/list_files.rs +++ b/crates/goose-bench/src/eval_suites/core/developer/list_files.rs @@ -69,6 +69,12 @@ impl Evaluation for DeveloperListFiles { "Using the shell command tool".to_string(), EvalMetricValue::Boolean(valid_tool_call), )); + + metrics.push(( + "score".to_string(), + EvalMetricValue::Float((valid_tool_call as u8) as f64 / 1.0), + )); + Ok(metrics) } diff --git a/crates/goose-bench/src/eval_suites/core/developer/simple_repo_clone_test.rs b/crates/goose-bench/src/eval_suites/core/developer/simple_repo_clone_test.rs index 7412c966..dd280fbf 100644 --- a/crates/goose-bench/src/eval_suites/core/developer/simple_repo_clone_test.rs +++ b/crates/goose-bench/src/eval_suites/core/developer/simple_repo_clone_test.rs @@ -197,6 +197,14 @@ impl Evaluation for SimpleRepoCloneTest { EvalMetricValue::Boolean(git_clone_executed && test_added), )); + metrics.push(( + "score".to_string(), + EvalMetricValue::Float( + ((git_clone_executed as u8) + (test_added as u8) + (test_executed as u8)) as f64 + / 3.0, + ), + )); + Ok(metrics) } diff --git a/crates/goose-bench/src/eval_suites/core/developer_search_replace/search_replace.rs b/crates/goose-bench/src/eval_suites/core/developer_search_replace/search_replace.rs index b725fcbb..8a3deb3e 100644 --- a/crates/goose-bench/src/eval_suites/core/developer_search_replace/search_replace.rs +++ b/crates/goose-bench/src/eval_suites/core/developer_search_replace/search_replace.rs @@ -92,6 +92,11 @@ impl Evaluation for DeveloperSearchReplace { EvalMetricValue::Boolean(changes_match), )); + metrics.push(( + "score".to_string(), + EvalMetricValue::Float((changes_match as u8) as f64 / 1.0), + )); + Ok(metrics) } diff --git a/crates/goose-bench/src/eval_suites/vibes/flappy_bird.rs b/crates/goose-bench/src/eval_suites/vibes/flappy_bird.rs index cbde37e0..9cbca88a 100644 --- a/crates/goose-bench/src/eval_suites/vibes/flappy_bird.rs +++ b/crates/goose-bench/src/eval_suites/vibes/flappy_bird.rs @@ -85,10 +85,11 @@ impl Evaluation for FlappyBird { )); // If tool was used correctly, check the actual file content + let mut valid_implementation = false; if valid_tool_call { if let Ok(file_path) = run_loc.fs_get("flappy_bird.py".to_string()) { if let Ok(content) = fs::read_to_string(file_path) { - let valid_implementation = self.check_python_implementation(&content); + valid_implementation = self.check_python_implementation(&content); metrics.push(( "valid_implementation".to_string(), EvalMetricValue::Boolean(valid_implementation), @@ -97,6 +98,13 @@ impl Evaluation for FlappyBird { } } + metrics.push(( + "score".to_string(), + EvalMetricValue::Float( + ((valid_implementation as u8) + (valid_tool_call as u8)) as f64 / 2.0, + ), + )); + Ok(metrics) } diff --git a/crates/goose-bench/src/eval_suites/vibes/goose_wiki.rs b/crates/goose-bench/src/eval_suites/vibes/goose_wiki.rs index 13dd2779..480989e9 100644 --- a/crates/goose-bench/src/eval_suites/vibes/goose_wiki.rs +++ b/crates/goose-bench/src/eval_suites/vibes/goose_wiki.rs @@ -9,6 +9,7 @@ use async_trait::async_trait; use goose::message::MessageContent; use mcp_core::role::Role; use serde_json::{self, Value}; +use std::fs; pub struct GooseWiki {} @@ -16,6 +17,25 @@ impl GooseWiki { pub fn new() -> Self { GooseWiki {} } + + fn check_html_implementation(&self, content: &str) -> bool { + // Check for basic structure + let has_basic_structure = content.contains("") + && content.contains("") + && content.contains(""); + + // Check for Wikipedia-style content + let has_wiki_elements = content.contains("") || content.contains("(tool_call.arguments.clone()) { - // Check command is write and path contains index.html - args.get("command").and_then(Value::as_str) == Some("write") && - args.get("path").and_then(Value::as_str).is_some_and(|s| s.contains("index.html")) && - // Verify file_text contains basic HTML structure - args.get("file_text").and_then(Value::as_str).is_some_and(|s| { - s.contains("") && - s.contains("") && - s.contains("") - }) + // Parse the arguments as JSON + if let Ok(args) = + serde_json::from_value::(tool_call.arguments.clone()) + { + // Only check command is write and correct filename + args.get("command").and_then(Value::as_str) == Some("write") + && args + .get("path") + .and_then(Value::as_str) + .is_some_and(|s| s.contains("index.html")) + } else { + false + } } else { false } } else { false } - } else { - false - } - }) + }) }); metrics.push(( - "created_valid_html".to_string(), + "used_write_tool".to_string(), EvalMetricValue::Boolean(valid_tool_call), )); + let mut valid_implementation = false; + // If tool was used correctly, check the actual file content + if valid_tool_call { + if let Ok(file_path) = _run_loc.fs_get("index.html".to_string()) { + if let Ok(content) = fs::read_to_string(file_path) { + valid_implementation = self.check_html_implementation(&content); + metrics.push(( + "valid_implementation".to_string(), + EvalMetricValue::Boolean(valid_implementation), + )); + } + } + } + + metrics.push(( + "score".to_string(), + EvalMetricValue::Float( + ((valid_implementation as u8) + (valid_tool_call as u8)) as f64 / 2.0, + ), + )); + Ok(metrics) } diff --git a/crates/goose-bench/src/eval_suites/vibes/squirrel_census.rs b/crates/goose-bench/src/eval_suites/vibes/squirrel_census.rs index 4bfb1ed2..84062b72 100644 --- a/crates/goose-bench/src/eval_suites/vibes/squirrel_census.rs +++ b/crates/goose-bench/src/eval_suites/vibes/squirrel_census.rs @@ -148,9 +148,10 @@ After writing the script, run it using python3 and show the results. Do not ask "ran_script".to_string(), EvalMetricValue::Boolean(ran_script), )); + metrics.push(( - "correct_results".to_string(), - EvalMetricValue::Boolean(correct_results), + "score".to_string(), + EvalMetricValue::Float((correct_results as u8) as f64 / 1.0), )); Ok(metrics)