feat: split required_extensions in bench to builtin/external (#1547)

This commit is contained in:
Zaki Ali
2025-03-06 17:12:21 -08:00
committed by GitHub
parent fb444728f0
commit ebf7cb1231
10 changed files with 102 additions and 33 deletions

View File

@@ -1,6 +1,6 @@
// Create a new file called test.txt with the content 'Hello, World!'
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
@@ -26,11 +26,11 @@ impl Evaluation for DeveloperCreateFile {
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
let mut metrics = Vec::new();
// Send the prompt to list files
// Send the prompt to create and read file
let messages = agent.prompt("Create a new file called test.txt in the current directory with the content 'Hello, World!'. Then read the contents of the new file to confirm.".to_string()).await?;
// println!("asdhflkahjsdflkasdfl");
let valid_tool_call = messages.iter().any(|msg| {
// Check for write operation
let write_tool_call = messages.iter().any(|msg| {
// Check if it's an assistant message
msg.role == Role::Assistant &&
// Check if any content item is a tool request for creating a file
@@ -60,9 +60,47 @@ impl Evaluation for DeveloperCreateFile {
})
});
// Check for read operation: the assistant must have asked the developer
// text editor to `view` a path that mentions test.txt.
let read_tool_call = messages.iter().any(|msg| {
    // Only assistant messages are inspected for tool requests
    msg.role == Role::Assistant
        && msg.content.iter().any(|content| {
            // Anything other than a tool request cannot be a read
            let MessageContent::ToolRequest(tool_req) = content else {
                return false;
            };
            // A tool call that is an `Err` does not count
            let Ok(tool_call) = tool_req.tool_call.as_ref() else {
                return false;
            };
            // Check tool name is correct
            if tool_call.name != "developer__text_editor" {
                return false;
            }
            // `arguments` is already a JSON value, so inspect it in place
            // instead of deep-cloning it and deserializing it into itself.
            let args = &tool_call.arguments;
            // `command` must equal "view"; `path` only needs to contain the
            // file name, so absolute and relative paths are both accepted.
            args.get("command").and_then(Value::as_str) == Some("view")
                && args
                    .get("path")
                    .and_then(Value::as_str)
                    .is_some_and(|s| s.contains("test.txt"))
        })
});
metrics.push((
"Create files".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
"Create file".to_string(),
EvaluationMetric::Boolean(write_tool_call),
));
metrics.push((
"Read file".to_string(),
EvaluationMetric::Boolean(read_tool_call),
));
metrics.push((
"Complete create and read".to_string(),
EvaluationMetric::Boolean(write_tool_call && read_tool_call),
));
Ok(metrics)
}
@@ -71,8 +109,11 @@ impl Evaluation for DeveloperCreateFile {
"developer_create_read_file"
}
fn required_extensions(&self) -> Vec<String> {
vec!["developer".to_string()]
/// Declares the extensions this evaluation needs: the built-in `developer`
/// extension and no external ones.
fn required_extensions(&self) -> ExtensionRequirements {
    ExtensionRequirements {
        builtin: vec![String::from("developer")],
        // Every remaining list (i.e. `external`) stays empty
        ..Default::default()
    }
}
}

View File

@@ -1,4 +1,4 @@
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
@@ -36,8 +36,8 @@ impl Evaluation for ExampleEval {
"example_eval"
}
fn required_extensions(&self) -> Vec<String> {
Vec::new() // Example eval doesn't require any extensions
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements::default() // Example eval doesn't require any extensions
}
}

View File

@@ -1,4 +1,4 @@
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
@@ -88,8 +88,11 @@ impl Evaluation for DeveloperImage {
"developer_image"
}
fn required_extensions(&self) -> Vec<String> {
vec!["developer".to_string()]
/// Declares the extensions this evaluation needs: the built-in `developer`
/// extension and no external ones.
fn required_extensions(&self) -> ExtensionRequirements {
    ExtensionRequirements {
        builtin: vec![String::from("developer")],
        // Every remaining list (i.e. `external`) stays empty
        ..Default::default()
    }
}
}

View File

@@ -1,4 +1,4 @@
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
@@ -72,8 +72,11 @@ impl Evaluation for DeveloperListFiles {
"developer_list_files"
}
fn required_extensions(&self) -> Vec<String> {
vec!["developer".to_string()]
/// Declares the extensions this evaluation needs: the built-in `developer`
/// extension and no external ones.
fn required_extensions(&self) -> ExtensionRequirements {
    ExtensionRequirements {
        builtin: vec![String::from("developer")],
        // Every remaining list (i.e. `external`) stays empty
        ..Default::default()
    }
}
}

View File

@@ -1,6 +1,6 @@
// Create a new file called test.txt with the content 'Hello, World!'
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
@@ -71,8 +71,11 @@ impl Evaluation for MemoryRememberMemory {
"memory_remember_memory"
}
fn required_extensions(&self) -> Vec<String> {
vec!["memory".to_string()]
/// Declares the extensions this evaluation needs: the built-in `memory`
/// extension and no external ones.
fn required_extensions(&self) -> ExtensionRequirements {
    ExtensionRequirements {
        builtin: vec![String::from("memory")],
        // Every remaining list (i.e. `external`) stays empty
        ..Default::default()
    }
}
}

View File

@@ -1,6 +1,6 @@
// Create a new file called test.txt with the content 'Hello, World!'
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
@@ -69,8 +69,11 @@ impl Evaluation for ComputerControllerScript {
"computercontroller_script"
}
fn required_extensions(&self) -> Vec<String> {
vec!["computercontroller".to_string()]
/// Declares the extensions this evaluation needs: the built-in
/// `computercontroller` extension and no external ones.
fn required_extensions(&self) -> ExtensionRequirements {
    ExtensionRequirements {
        builtin: vec![String::from("computercontroller")],
        // Every remaining list (i.e. `external`) stays empty
        ..Default::default()
    }
}
}

View File

@@ -1,4 +1,4 @@
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
@@ -102,8 +102,11 @@ impl Evaluation for DeveloperSearchReplace {
"developer_search_replace"
}
fn required_extensions(&self) -> Vec<String> {
vec!["developer".to_string()]
/// Declares the extensions this evaluation needs: the built-in `developer`
/// extension and no external ones.
fn required_extensions(&self) -> ExtensionRequirements {
    ExtensionRequirements {
        builtin: vec![String::from("developer")],
        // Every remaining list (i.e. `external`) stays empty
        ..Default::default()
    }
}
}

View File

@@ -1,6 +1,6 @@
// Create a new file called test.txt with the content 'Hello, World!'
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
@@ -71,8 +71,11 @@ impl Evaluation for ComputerControllerWebScrape {
"computercontroller_web_scrape"
}
fn required_extensions(&self) -> Vec<String> {
vec!["computercontroller".to_string()]
/// Declares the extensions this evaluation needs: the built-in
/// `computercontroller` extension and no external ones.
fn required_extensions(&self) -> ExtensionRequirements {
    ExtensionRequirements {
        builtin: vec![String::from("computercontroller")],
        // Every remaining list (i.e. `external`) stays empty
        ..Default::default()
    }
}
}

View File

@@ -23,6 +23,12 @@ pub enum EvaluationMetric {
Boolean(bool),
}
/// Extensions an evaluation needs before it can run, split by kind.
///
/// The `Default` value has both lists empty, meaning "no extensions required".
/// `Clone`, `PartialEq` and `Eq` are derived so requirements can be copied and
/// compared (for example in tests).
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct ExtensionRequirements {
    /// Names of the built-in extensions required, e.g. `"developer"`.
    pub builtin: Vec<String>,
    /// External extensions required.
    /// NOTE(review): the expected format of each entry is not visible here —
    /// confirm against the session builder that consumes this list.
    pub external: Vec<String>,
}
#[async_trait]
pub trait BenchAgent: Send + Sync {
async fn prompt(&mut self, p: String) -> Result<Vec<Message>>;
@@ -41,7 +47,10 @@ pub trait Evaluation: Send + Sync {
fn name(&self) -> &str;
fn required_extensions(&self) -> Vec<String> {
Vec::new() // Default implementation returns empty vec
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: Vec::new(),
external: Vec::new(),
}
}
}

View File

@@ -82,10 +82,11 @@ async fn run_eval(
let mut result = EvaluationResult::new(evaluation.name().to_string());
if let Ok(work_dir) = work_dir.move_to(format!("./{}", &evaluation.name())) {
let required_extensions = evaluation.required_extensions();
let requirements = evaluation.required_extensions();
// Create session with error capture
let base_session = build_session(None, false, Vec::new(), required_extensions).await;
let base_session =
build_session(None, false, requirements.external, requirements.builtin).await;
let bench_session = Arc::new(Mutex::new(BenchSession::new(base_session)));
let bench_session_clone = bench_session.clone();