mirror of
https://github.com/aljazceru/goose.git
synced 2026-02-23 15:34:27 +01:00
feat: update goosebench vibes suite metrics (#2135)
This commit is contained in:
@@ -110,6 +110,12 @@ impl Evaluation for DeveloperCreateFile {
|
||||
"Complete create and read".to_string(),
|
||||
EvalMetricValue::Boolean(write_tool_call && read_tool_call),
|
||||
));
|
||||
|
||||
metrics.push((
|
||||
"score".to_string(),
|
||||
EvalMetricValue::Float(((write_tool_call as u8) + (read_tool_call as u8)) as f64 / 2.0),
|
||||
));
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
|
||||
@@ -69,6 +69,12 @@ impl Evaluation for DeveloperListFiles {
|
||||
"Using the shell command tool".to_string(),
|
||||
EvalMetricValue::Boolean(valid_tool_call),
|
||||
));
|
||||
|
||||
metrics.push((
|
||||
"score".to_string(),
|
||||
EvalMetricValue::Float((valid_tool_call as u8) as f64 / 1.0),
|
||||
));
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
|
||||
@@ -197,6 +197,14 @@ impl Evaluation for SimpleRepoCloneTest {
|
||||
EvalMetricValue::Boolean(git_clone_executed && test_added),
|
||||
));
|
||||
|
||||
metrics.push((
|
||||
"score".to_string(),
|
||||
EvalMetricValue::Float(
|
||||
((git_clone_executed as u8) + (test_added as u8) + (test_executed as u8)) as f64
|
||||
/ 3.0,
|
||||
),
|
||||
));
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
|
||||
@@ -92,6 +92,11 @@ impl Evaluation for DeveloperSearchReplace {
|
||||
EvalMetricValue::Boolean(changes_match),
|
||||
));
|
||||
|
||||
metrics.push((
|
||||
"score".to_string(),
|
||||
EvalMetricValue::Float((changes_match as u8) as f64 / 1.0),
|
||||
));
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
|
||||
@@ -85,10 +85,11 @@ impl Evaluation for FlappyBird {
|
||||
));
|
||||
|
||||
// If tool was used correctly, check the actual file content
|
||||
let mut valid_implementation = false;
|
||||
if valid_tool_call {
|
||||
if let Ok(file_path) = run_loc.fs_get("flappy_bird.py".to_string()) {
|
||||
if let Ok(content) = fs::read_to_string(file_path) {
|
||||
let valid_implementation = self.check_python_implementation(&content);
|
||||
valid_implementation = self.check_python_implementation(&content);
|
||||
metrics.push((
|
||||
"valid_implementation".to_string(),
|
||||
EvalMetricValue::Boolean(valid_implementation),
|
||||
@@ -97,6 +98,13 @@ impl Evaluation for FlappyBird {
|
||||
}
|
||||
}
|
||||
|
||||
metrics.push((
|
||||
"score".to_string(),
|
||||
EvalMetricValue::Float(
|
||||
((valid_implementation as u8) + (valid_tool_call as u8)) as f64 / 2.0,
|
||||
),
|
||||
));
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ use async_trait::async_trait;
|
||||
use goose::message::MessageContent;
|
||||
use mcp_core::role::Role;
|
||||
use serde_json::{self, Value};
|
||||
use std::fs;
|
||||
|
||||
pub struct GooseWiki {}
|
||||
|
||||
@@ -16,6 +17,25 @@ impl GooseWiki {
|
||||
pub fn new() -> Self {
|
||||
GooseWiki {}
|
||||
}
|
||||
|
||||
fn check_html_implementation(&self, content: &str) -> bool {
|
||||
// Check for basic structure
|
||||
let has_basic_structure = content.contains("<html")
|
||||
&& content.contains("</html>")
|
||||
&& content.contains("<head")
|
||||
&& content.contains("</head>")
|
||||
&& content.contains("<body")
|
||||
&& content.contains("</body>");
|
||||
|
||||
// Check for Wikipedia-style content
|
||||
let has_wiki_elements = content.contains("<h1") && // Has headings
|
||||
(content.contains("<h2") || content.contains("<h3")) && // Has subheadings
|
||||
content.contains("Goose") && // Mentions Goose
|
||||
content.contains("AI") && // Mentions AI
|
||||
(content.contains("<p>") || content.contains("<div")); // Has paragraphs
|
||||
|
||||
has_basic_structure && has_wiki_elements
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -38,43 +58,63 @@ impl Evaluation for GooseWiki {
|
||||
|
||||
// Check if the agent used the text editor tool to create index.html
|
||||
let valid_tool_call = messages.iter().any(|msg| {
|
||||
msg.role == Role::Assistant &&
|
||||
msg.content.iter().any(|content| {
|
||||
if let MessageContent::ToolRequest(tool_req) = content {
|
||||
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
|
||||
// Check tool name is correct
|
||||
if tool_call.name != "developer__text_editor" {
|
||||
return false;
|
||||
}
|
||||
msg.role == Role::Assistant
|
||||
&& msg.content.iter().any(|content| {
|
||||
if let MessageContent::ToolRequest(tool_req) = content {
|
||||
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
|
||||
// Check tool name is correct
|
||||
if tool_call.name != "developer__text_editor" {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Parse the arguments as JSON
|
||||
if let Ok(args) = serde_json::from_value::<Value>(tool_call.arguments.clone()) {
|
||||
// Check command is write and path contains index.html
|
||||
args.get("command").and_then(Value::as_str) == Some("write") &&
|
||||
args.get("path").and_then(Value::as_str).is_some_and(|s| s.contains("index.html")) &&
|
||||
// Verify file_text contains basic HTML structure
|
||||
args.get("file_text").and_then(Value::as_str).is_some_and(|s| {
|
||||
s.contains("<html") && s.contains("</html>") &&
|
||||
s.contains("<head") && s.contains("</head>") &&
|
||||
s.contains("<body") && s.contains("</body>")
|
||||
})
|
||||
// Parse the arguments as JSON
|
||||
if let Ok(args) =
|
||||
serde_json::from_value::<Value>(tool_call.arguments.clone())
|
||||
{
|
||||
// Only check command is write and correct filename
|
||||
args.get("command").and_then(Value::as_str) == Some("write")
|
||||
&& args
|
||||
.get("path")
|
||||
.and_then(Value::as_str)
|
||||
.is_some_and(|s| s.contains("index.html"))
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
})
|
||||
})
|
||||
});
|
||||
|
||||
metrics.push((
|
||||
"created_valid_html".to_string(),
|
||||
"used_write_tool".to_string(),
|
||||
EvalMetricValue::Boolean(valid_tool_call),
|
||||
));
|
||||
|
||||
let mut valid_implementation = false;
|
||||
// If tool was used correctly, check the actual file content
|
||||
if valid_tool_call {
|
||||
if let Ok(file_path) = _run_loc.fs_get("index.html".to_string()) {
|
||||
if let Ok(content) = fs::read_to_string(file_path) {
|
||||
valid_implementation = self.check_html_implementation(&content);
|
||||
metrics.push((
|
||||
"valid_implementation".to_string(),
|
||||
EvalMetricValue::Boolean(valid_implementation),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
metrics.push((
|
||||
"score".to_string(),
|
||||
EvalMetricValue::Float(
|
||||
((valid_implementation as u8) + (valid_tool_call as u8)) as f64 / 2.0,
|
||||
),
|
||||
));
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
|
||||
@@ -148,9 +148,10 @@ After writing the script, run it using python3 and show the results. Do not ask
|
||||
"ran_script".to_string(),
|
||||
EvalMetricValue::Boolean(ran_script),
|
||||
));
|
||||
|
||||
metrics.push((
|
||||
"correct_results".to_string(),
|
||||
EvalMetricValue::Boolean(correct_results),
|
||||
"score".to_string(),
|
||||
EvalMetricValue::Float((correct_results as u8) as f64 / 1.0),
|
||||
));
|
||||
|
||||
Ok(metrics)
|
||||
|
||||
Reference in New Issue
Block a user