diff --git a/crates/goose-bench/src/eval_suites/core/developer/create_file.rs b/crates/goose-bench/src/eval_suites/core/developer/create_file.rs index b2784e2a..5cfe895d 100644 --- a/crates/goose-bench/src/eval_suites/core/developer/create_file.rs +++ b/crates/goose-bench/src/eval_suites/core/developer/create_file.rs @@ -110,6 +110,12 @@ impl Evaluation for DeveloperCreateFile { "Complete create and read".to_string(), EvalMetricValue::Boolean(write_tool_call && read_tool_call), )); + + metrics.push(( + "score".to_string(), + EvalMetricValue::Float(((write_tool_call as u8) + (read_tool_call as u8)) as f64 / 2.0), + )); + Ok(metrics) } diff --git a/crates/goose-bench/src/eval_suites/core/developer/list_files.rs b/crates/goose-bench/src/eval_suites/core/developer/list_files.rs index 4148738b..8881b690 100644 --- a/crates/goose-bench/src/eval_suites/core/developer/list_files.rs +++ b/crates/goose-bench/src/eval_suites/core/developer/list_files.rs @@ -69,6 +69,12 @@ impl Evaluation for DeveloperListFiles { "Using the shell command tool".to_string(), EvalMetricValue::Boolean(valid_tool_call), )); + + metrics.push(( + "score".to_string(), + EvalMetricValue::Float((valid_tool_call as u8) as f64 / 1.0), + )); + Ok(metrics) } diff --git a/crates/goose-bench/src/eval_suites/core/developer/simple_repo_clone_test.rs b/crates/goose-bench/src/eval_suites/core/developer/simple_repo_clone_test.rs index 7412c966..dd280fbf 100644 --- a/crates/goose-bench/src/eval_suites/core/developer/simple_repo_clone_test.rs +++ b/crates/goose-bench/src/eval_suites/core/developer/simple_repo_clone_test.rs @@ -197,6 +197,14 @@ impl Evaluation for SimpleRepoCloneTest { EvalMetricValue::Boolean(git_clone_executed && test_added), )); + metrics.push(( + "score".to_string(), + EvalMetricValue::Float( + ((git_clone_executed as u8) + (test_added as u8) + (test_executed as u8)) as f64 + / 3.0, + ), + )); + Ok(metrics) } diff --git a/crates/goose-bench/src/eval_suites/core/developer_search_replace/search_replace.rs b/crates/goose-bench/src/eval_suites/core/developer_search_replace/search_replace.rs index b725fcbb..8a3deb3e 100644 --- a/crates/goose-bench/src/eval_suites/core/developer_search_replace/search_replace.rs +++ b/crates/goose-bench/src/eval_suites/core/developer_search_replace/search_replace.rs @@ -92,6 +92,11 @@ impl Evaluation for DeveloperSearchReplace { EvalMetricValue::Boolean(changes_match), )); + metrics.push(( + "score".to_string(), + EvalMetricValue::Float((changes_match as u8) as f64 / 1.0), + )); + Ok(metrics) } diff --git a/crates/goose-bench/src/eval_suites/vibes/flappy_bird.rs b/crates/goose-bench/src/eval_suites/vibes/flappy_bird.rs index cbde37e0..9cbca88a 100644 --- a/crates/goose-bench/src/eval_suites/vibes/flappy_bird.rs +++ b/crates/goose-bench/src/eval_suites/vibes/flappy_bird.rs @@ -85,10 +85,11 @@ impl Evaluation for FlappyBird { )); // If tool was used correctly, check the actual file content + let mut valid_implementation = false; if valid_tool_call { if let Ok(file_path) = run_loc.fs_get("flappy_bird.py".to_string()) { if let Ok(content) = fs::read_to_string(file_path) { - let valid_implementation = self.check_python_implementation(&content); + valid_implementation = self.check_python_implementation(&content); metrics.push(( "valid_implementation".to_string(), EvalMetricValue::Boolean(valid_implementation), @@ -97,6 +98,13 @@ impl Evaluation for FlappyBird { } } + metrics.push(( + "score".to_string(), + EvalMetricValue::Float( + ((valid_implementation as u8) + (valid_tool_call as u8)) as f64 / 2.0, + ), + )); + Ok(metrics) } diff --git a/crates/goose-bench/src/eval_suites/vibes/goose_wiki.rs b/crates/goose-bench/src/eval_suites/vibes/goose_wiki.rs index 13dd2779..480989e9 100644 --- a/crates/goose-bench/src/eval_suites/vibes/goose_wiki.rs +++ b/crates/goose-bench/src/eval_suites/vibes/goose_wiki.rs @@ -9,6 +9,7 @@ use async_trait::async_trait; use goose::message::MessageContent; use mcp_core::role::Role; use serde_json::{self, Value}; +use std::fs; pub struct GooseWiki {} @@ -16,6 +17,25 @@ impl GooseWiki { pub fn new() -> Self { GooseWiki {} } + + fn check_html_implementation(&self, content: &str) -> bool { + // Check for basic structure + let has_basic_structure = content.contains("") + && content.contains("
") + && content.contains(""); + + // Check for Wikipedia-style content + let has_wiki_elements = content.contains("