mirror of
https://github.com/aljazceru/goose.git
synced 2025-12-18 14:44:21 +01:00
feat: split required_extensions in bench to builtin/external (#1547)
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
// Create a new file called test.txt with the content 'Hello, World!'
|
||||
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
@@ -26,11 +26,11 @@ impl Evaluation for DeveloperCreateFile {
|
||||
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
// Send the prompt to list files
|
||||
// Send the prompt to create and read file
|
||||
let messages = agent.prompt("Create a new file called test.txt in the current directory with the content 'Hello, World!'. Then read the contents of the new file to confirm.".to_string()).await?;
|
||||
// println!("asdhflkahjsdflkasdfl");
|
||||
|
||||
let valid_tool_call = messages.iter().any(|msg| {
|
||||
// Check for write operation
|
||||
let write_tool_call = messages.iter().any(|msg| {
|
||||
// Check if it's an assistant message
|
||||
msg.role == Role::Assistant &&
|
||||
// Check if any content item is a tool request for creating a file
|
||||
@@ -60,9 +60,47 @@ impl Evaluation for DeveloperCreateFile {
|
||||
})
|
||||
});
|
||||
|
||||
// Check for read operation
|
||||
let read_tool_call = messages.iter().any(|msg| {
|
||||
// Check if it's an assistant message
|
||||
msg.role == Role::Assistant &&
|
||||
// Check if any content item is a tool request for reading a file
|
||||
msg.content.iter().any(|content| {
|
||||
if let MessageContent::ToolRequest(tool_req) = content {
|
||||
if let Ok(tool_call) = tool_req.tool_call.as_ref() {
|
||||
// Check tool name is correct
|
||||
if tool_call.name != "developer__text_editor" {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Parse the arguments as JSON
|
||||
if let Ok(args) = serde_json::from_value::<Value>(tool_call.arguments.clone()) {
|
||||
// Check all required parameters match exactly
|
||||
args.get("command").and_then(Value::as_str) == Some("view") &&
|
||||
args.get("path").and_then(Value::as_str).is_some_and(|s| s.contains("test.txt"))
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
})
|
||||
});
|
||||
|
||||
metrics.push((
|
||||
"Create files".to_string(),
|
||||
EvaluationMetric::Boolean(valid_tool_call),
|
||||
"Create file".to_string(),
|
||||
EvaluationMetric::Boolean(write_tool_call),
|
||||
));
|
||||
metrics.push((
|
||||
"Read file".to_string(),
|
||||
EvaluationMetric::Boolean(read_tool_call),
|
||||
));
|
||||
metrics.push((
|
||||
"Complete create and read".to_string(),
|
||||
EvaluationMetric::Boolean(write_tool_call && read_tool_call),
|
||||
));
|
||||
Ok(metrics)
|
||||
}
|
||||
@@ -71,8 +109,11 @@ impl Evaluation for DeveloperCreateFile {
|
||||
"developer_create_read_file"
|
||||
}
|
||||
|
||||
fn required_extensions(&self) -> Vec<String> {
|
||||
vec!["developer".to_string()]
|
||||
fn required_extensions(&self) -> ExtensionRequirements {
|
||||
ExtensionRequirements {
|
||||
builtin: vec!["developer".to_string()],
|
||||
external: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
@@ -36,8 +36,8 @@ impl Evaluation for ExampleEval {
|
||||
"example_eval"
|
||||
}
|
||||
|
||||
fn required_extensions(&self) -> Vec<String> {
|
||||
Vec::new() // Example eval doesn't require any extensions
|
||||
fn required_extensions(&self) -> ExtensionRequirements {
|
||||
ExtensionRequirements::default() // Example eval doesn't require any extensions
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
@@ -88,8 +88,11 @@ impl Evaluation for DeveloperImage {
|
||||
"developer_image"
|
||||
}
|
||||
|
||||
fn required_extensions(&self) -> Vec<String> {
|
||||
vec!["developer".to_string()]
|
||||
fn required_extensions(&self) -> ExtensionRequirements {
|
||||
ExtensionRequirements {
|
||||
builtin: vec!["developer".to_string()],
|
||||
external: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
@@ -72,8 +72,11 @@ impl Evaluation for DeveloperListFiles {
|
||||
"developer_list_files"
|
||||
}
|
||||
|
||||
fn required_extensions(&self) -> Vec<String> {
|
||||
vec!["developer".to_string()]
|
||||
fn required_extensions(&self) -> ExtensionRequirements {
|
||||
ExtensionRequirements {
|
||||
builtin: vec!["developer".to_string()],
|
||||
external: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// Create a new file called test.txt with the content 'Hello, World!'
|
||||
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
@@ -71,8 +71,11 @@ impl Evaluation for MemoryRememberMemory {
|
||||
"memory_remember_memory"
|
||||
}
|
||||
|
||||
fn required_extensions(&self) -> Vec<String> {
|
||||
vec!["memory".to_string()]
|
||||
fn required_extensions(&self) -> ExtensionRequirements {
|
||||
ExtensionRequirements {
|
||||
builtin: vec!["memory".to_string()],
|
||||
external: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// Create a new file called test.txt with the content 'Hello, World!'
|
||||
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
@@ -69,8 +69,11 @@ impl Evaluation for ComputerControllerScript {
|
||||
"computercontroller_script"
|
||||
}
|
||||
|
||||
fn required_extensions(&self) -> Vec<String> {
|
||||
vec!["computercontroller".to_string()]
|
||||
fn required_extensions(&self) -> ExtensionRequirements {
|
||||
ExtensionRequirements {
|
||||
builtin: vec!["computercontroller".to_string()],
|
||||
external: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
@@ -102,8 +102,11 @@ impl Evaluation for DeveloperSearchReplace {
|
||||
"developer_search_replace"
|
||||
}
|
||||
|
||||
fn required_extensions(&self) -> Vec<String> {
|
||||
vec!["developer".to_string()]
|
||||
fn required_extensions(&self) -> ExtensionRequirements {
|
||||
ExtensionRequirements {
|
||||
builtin: vec!["developer".to_string()],
|
||||
external: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// Create a new file called test.txt with the content 'Hello, World!'
|
||||
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
@@ -71,8 +71,11 @@ impl Evaluation for ComputerControllerWebScrape {
|
||||
"computercontroller_web_scrape"
|
||||
}
|
||||
|
||||
fn required_extensions(&self) -> Vec<String> {
|
||||
vec!["computercontroller".to_string()]
|
||||
fn required_extensions(&self) -> ExtensionRequirements {
|
||||
ExtensionRequirements {
|
||||
builtin: vec!["computercontroller".to_string()],
|
||||
external: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -23,6 +23,12 @@ pub enum EvaluationMetric {
|
||||
Boolean(bool),
|
||||
}
|
||||
|
||||
/// Extensions an evaluation requires, split into builtin and external groups.
///
/// Returned by `Evaluation::required_extensions`. The derived `Default`
/// produces two empty lists, i.e. no extensions required.
#[derive(Debug, Default)]
|
||||
pub struct ExtensionRequirements {
|
||||
/// Names of required builtin extensions (e.g. "developer", "memory",
/// "computercontroller").
pub builtin: Vec<String>,
|
||||
/// Required external extensions. No evaluation visible here populates this;
/// presumably each entry is in whatever form `build_session` accepts for
/// external extensions — TODO confirm against that function.
pub external: Vec<String>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait BenchAgent: Send + Sync {
|
||||
async fn prompt(&mut self, p: String) -> Result<Vec<Message>>;
|
||||
@@ -41,7 +47,10 @@ pub trait Evaluation: Send + Sync {
|
||||
|
||||
fn name(&self) -> &str;
|
||||
|
||||
fn required_extensions(&self) -> Vec<String> {
|
||||
Vec::new() // Default implementation returns empty vec
|
||||
fn required_extensions(&self) -> ExtensionRequirements {
|
||||
ExtensionRequirements {
|
||||
builtin: Vec::new(),
|
||||
external: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -82,10 +82,11 @@ async fn run_eval(
|
||||
let mut result = EvaluationResult::new(evaluation.name().to_string());
|
||||
|
||||
if let Ok(work_dir) = work_dir.move_to(format!("./{}", &evaluation.name())) {
|
||||
let required_extensions = evaluation.required_extensions();
|
||||
let requirements = evaluation.required_extensions();
|
||||
|
||||
// Create session with error capture
|
||||
let base_session = build_session(None, false, Vec::new(), required_extensions).await;
|
||||
let base_session =
|
||||
build_session(None, false, requirements.external, requirements.builtin).await;
|
||||
|
||||
let bench_session = Arc::new(Mutex::new(BenchSession::new(base_session)));
|
||||
let bench_session_clone = bench_session.clone();
|
||||
|
||||
Reference in New Issue
Block a user