feat: update goosebench vibes suite metrics (#2135)

This commit is contained in:
Alice Hau
2025-04-10 16:06:22 -04:00
committed by GitHub
parent df84c47c73
commit 4f590175cb
7 changed files with 101 additions and 27 deletions

View File

@@ -110,6 +110,12 @@ impl Evaluation for DeveloperCreateFile {
"Complete create and read".to_string(),
EvalMetricValue::Boolean(write_tool_call && read_tool_call),
));
metrics.push((
"score".to_string(),
EvalMetricValue::Float(((write_tool_call as u8) + (read_tool_call as u8)) as f64 / 2.0),
));
Ok(metrics)
}

View File

@@ -69,6 +69,12 @@ impl Evaluation for DeveloperListFiles {
"Using the shell command tool".to_string(),
EvalMetricValue::Boolean(valid_tool_call),
));
metrics.push((
"score".to_string(),
EvalMetricValue::Float((valid_tool_call as u8) as f64 / 1.0),
));
Ok(metrics)
}

View File

@@ -197,6 +197,14 @@ impl Evaluation for SimpleRepoCloneTest {
EvalMetricValue::Boolean(git_clone_executed && test_added),
));
metrics.push((
"score".to_string(),
EvalMetricValue::Float(
((git_clone_executed as u8) + (test_added as u8) + (test_executed as u8)) as f64
/ 3.0,
),
));
Ok(metrics)
}

View File

@@ -92,6 +92,11 @@ impl Evaluation for DeveloperSearchReplace {
EvalMetricValue::Boolean(changes_match),
));
metrics.push((
"score".to_string(),
EvalMetricValue::Float((changes_match as u8) as f64 / 1.0),
));
Ok(metrics)
}

View File

@@ -85,10 +85,11 @@ impl Evaluation for FlappyBird {
));
// If tool was used correctly, check the actual file content
let mut valid_implementation = false;
if valid_tool_call {
if let Ok(file_path) = run_loc.fs_get("flappy_bird.py".to_string()) {
if let Ok(content) = fs::read_to_string(file_path) {
let valid_implementation = self.check_python_implementation(&content);
valid_implementation = self.check_python_implementation(&content);
metrics.push((
"valid_implementation".to_string(),
EvalMetricValue::Boolean(valid_implementation),
@@ -97,6 +98,13 @@ impl Evaluation for FlappyBird {
}
}
metrics.push((
"score".to_string(),
EvalMetricValue::Float(
((valid_implementation as u8) + (valid_tool_call as u8)) as f64 / 2.0,
),
));
Ok(metrics)
}

View File

@@ -9,6 +9,7 @@ use async_trait::async_trait;
use goose::message::MessageContent;
use mcp_core::role::Role;
use serde_json::{self, Value};
use std::fs;
pub struct GooseWiki {}
@@ -16,6 +17,25 @@ impl GooseWiki {
pub fn new() -> Self {
GooseWiki {}
}
/// Heuristically decides whether `content` is a complete, Wikipedia-style HTML page about Goose.
///
/// This is a substring scan, not an HTML parse. Two groups of checks must both pass:
///
/// * **Basic structure** - opening and closing `html`, `head` and `body` tags.
/// * **Wiki-style content** - an `<h1` heading, an `<h2` or `<h3` subheading, the words
///   `Goose` and `AI`, and at least one paragraph or `<div` element.
///
/// Tag checks are ASCII case-insensitive because HTML tag names are not case-sensitive, so
/// `<HTML>` and `<html>` are treated alike. The keyword checks (`Goose`, `AI`) stay
/// case-sensitive and run against the original text.
///
/// Returns `true` only when every check succeeds.
fn check_html_implementation(&self, content: &str) -> bool {
    // ASCII lowercasing keeps byte offsets and char boundaries identical to `content`.
    let markup = content.to_ascii_lowercase();

    // Check for basic structure
    let has_basic_structure = ["<html", "</html>", "<head", "</head>", "<body", "</body>"]
        .iter()
        .all(|tag| markup.contains(tag));

    // A paragraph tag may carry attributes (`<p class="lead">`), so accept `<p` followed by
    // `>` or whitespace. Requiring that delimiter avoids matching `<pre`, `<path`, etc.
    let has_paragraph = markup.match_indices("<p").any(|(idx, _)| {
        // "<p" is two ASCII bytes, so `idx + 2` is always a valid char boundary.
        matches!(
            markup[idx + 2..].chars().next(),
            Some(c) if c == '>' || c.is_ascii_whitespace()
        )
    });

    // Check for Wikipedia-style content
    let has_wiki_elements = markup.contains("<h1") // Has headings
        && (markup.contains("<h2") || markup.contains("<h3")) // Has subheadings
        && content.contains("Goose") // Mentions Goose
        && content.contains("AI") // Mentions AI
        && (has_paragraph || markup.contains("<div")); // Has paragraphs

    has_basic_structure && has_wiki_elements
}
}
#[async_trait]
@@ -38,43 +58,63 @@ impl Evaluation for GooseWiki {
// Check if the agent used the text editor tool to create index.html
let valid_tool_call = messages.iter().any(|msg| {
msg.role == Role::Assistant &&
msg.content.iter().any(|content| {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
// Check tool name is correct
if tool_call.name != "developer__text_editor" {
return false;
}
msg.role == Role::Assistant
&& msg.content.iter().any(|content| {
if let MessageContent::ToolRequest(tool_req) = content {
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
// Check tool name is correct
if tool_call.name != "developer__text_editor" {
return false;
}
// Parse the arguments as JSON
if let Ok(args) = serde_json::from_value::<Value>(tool_call.arguments.clone()) {
// Check command is write and path contains index.html
args.get("command").and_then(Value::as_str) == Some("write") &&
args.get("path").and_then(Value::as_str).is_some_and(|s| s.contains("index.html")) &&
// Verify file_text contains basic HTML structure
args.get("file_text").and_then(Value::as_str).is_some_and(|s| {
s.contains("<html") && s.contains("</html>") &&
s.contains("<head") && s.contains("</head>") &&
s.contains("<body") && s.contains("</body>")
})
// Parse the arguments as JSON
if let Ok(args) =
serde_json::from_value::<Value>(tool_call.arguments.clone())
{
// Only check command is write and correct filename
args.get("command").and_then(Value::as_str) == Some("write")
&& args
.get("path")
.and_then(Value::as_str)
.is_some_and(|s| s.contains("index.html"))
} else {
false
}
} else {
false
}
} else {
false
}
} else {
false
}
})
})
});
metrics.push((
"created_valid_html".to_string(),
"used_write_tool".to_string(),
EvalMetricValue::Boolean(valid_tool_call),
));
let mut valid_implementation = false;
// If tool was used correctly, check the actual file content
if valid_tool_call {
if let Ok(file_path) = _run_loc.fs_get("index.html".to_string()) {
if let Ok(content) = fs::read_to_string(file_path) {
valid_implementation = self.check_html_implementation(&content);
metrics.push((
"valid_implementation".to_string(),
EvalMetricValue::Boolean(valid_implementation),
));
}
}
}
metrics.push((
"score".to_string(),
EvalMetricValue::Float(
((valid_implementation as u8) + (valid_tool_call as u8)) as f64 / 2.0,
),
));
Ok(metrics)
}

View File

@@ -148,9 +148,10 @@ After writing the script, run it using python3 and show the results. Do not ask
"ran_script".to_string(),
EvalMetricValue::Boolean(ran_script),
));
metrics.push((
"correct_results".to_string(),
EvalMetricValue::Boolean(correct_results),
"score".to_string(),
EvalMetricValue::Float((correct_results as u8) as f64 / 1.0),
));
Ok(metrics)