feat: efficient benching (#1921)

Co-authored-by: Tyler Rockwood <rockwotj@gmail.com>
Co-authored-by: Kalvin C <kalvinnchau@users.noreply.github.com>
Co-authored-by: Alice Hau <110418948+ahau-square@users.noreply.github.com>
Authored by marcelle on 2025-04-08 14:43:43 -04:00, committed by GitHub.
Parent 319f2301f3, commit 8fbd9eb327.
37 changed files with 1162 additions and 444 deletions

.gitignore (vendored, 3 lines changed)
View File

@@ -39,3 +39,6 @@ debug_*.txt
# Benchmark paths
benchmark-*
benchconf.json
scripts/fake.sh
do_not_version/

Cargo.lock (generated, 1 line changed)
View File

@@ -2372,6 +2372,7 @@ dependencies = [
"serde",
"serde_json",
"tokio",
"toml",
"tracing",
"tracing-subscriber",
"winapi",

View File

@@ -24,6 +24,7 @@ tokio = { version = "1.43", features = ["full"] }
include_dir = "0.7.4"
once_cell = "1.19"
regex = "1.11.1"
toml = "0.8.20"
[target.'cfg(target_os = "windows")'.dependencies]
winapi = { version = "0.3", features = ["wincred"] }

View File

@@ -0,0 +1,107 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use serde::{Deserialize, Serialize};
use std::fs;
use std::fs::read_to_string;
use std::path::PathBuf;
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct BenchToolShimOpt {
pub use_tool_shim: bool,
pub tool_shim_model: Option<String>,
}
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct BenchModel {
pub provider: String,
pub name: String,
pub parallel_safe: bool,
pub tool_shim: Option<BenchToolShimOpt>,
}
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct BenchEval {
pub selector: String,
pub post_process_cmd: Option<PathBuf>,
pub parallel_safe: bool,
}
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct BenchRunConfig {
pub models: Vec<BenchModel>,
pub evals: Vec<BenchEval>,
pub include_dirs: Vec<PathBuf>,
pub repeat: Option<usize>,
pub run_id: Option<String>,
pub eval_result_filename: String,
pub run_summary_filename: String,
pub env_file: Option<PathBuf>,
}
impl Default for BenchRunConfig {
fn default() -> Self {
BenchRunConfig {
models: vec![
BenchModel {
provider: "databricks".to_string(),
name: "goose".to_string(),
parallel_safe: true,
tool_shim: Some(BenchToolShimOpt {
use_tool_shim: false,
tool_shim_model: None,
}),
},
BenchModel {
provider: "databricks".to_string(),
name: "goose-claude-3-5-sonnet".to_string(),
parallel_safe: true,
tool_shim: None,
},
],
evals: vec![BenchEval {
selector: "core".into(),
post_process_cmd: None,
parallel_safe: true, // Default to true
}],
include_dirs: vec![],
repeat: Some(2),
run_id: None,
eval_result_filename: "eval-results.json".to_string(),
run_summary_filename: "run-results-summary.json".to_string(),
env_file: None,
}
}
}
impl BenchRunConfig {
pub fn from_string(cfg: String) -> anyhow::Result<Self> {
let mut config: Self = serde_json::from_str(cfg.as_str())?;
// update include_dirs to contain full-paths only
config.include_dirs = BenchmarkWorkDir::canonical_dirs(config.include_dirs);
Self::canonicalize_eval_post_proc_cmd(&mut config);
Ok(config)
}
fn canonicalize_eval_post_proc_cmd(config: &mut BenchRunConfig) {
// update eval post-process script paths to all be full-paths
config.evals.iter_mut().for_each(|eval| {
if let Some(post_process_cmd) = &eval.post_process_cmd {
let canon = BenchmarkWorkDir::canonical_dirs(vec![post_process_cmd.clone()]);
let full_path_cmd = canon[0].clone();
if !full_path_cmd.exists() {
panic!("BenchConfigError: Eval post-process command not found. File {:?} does not exist", full_path_cmd);
}
eval.post_process_cmd = Some(full_path_cmd);
}
});
}
pub fn from(cfg: PathBuf) -> anyhow::Result<Self> {
let config = Self::from_string(read_to_string(cfg)?)?;
Ok(config)
}
pub fn to_string(&self) -> anyhow::Result<String> {
Ok(serde_json::to_string_pretty(self)?)
}
pub fn save(&self, name: String) {
let config = self.to_string().unwrap();
fs::write(name, config).expect("Unable to write bench config file");
}
}
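Since from_string parses plain JSON with serde_json and to_string pretty-prints it, a default config round-trips in memory without touching the filesystem. A minimal sketch (not part of this commit), assuming the types above are in scope:
fn roundtrip_default_config() -> anyhow::Result<()> {
    let cfg = BenchRunConfig::default();
    let json = cfg.to_string()?; // pretty-printed JSON
    let parsed = BenchRunConfig::from_string(json)?; // canonicalizes include_dirs (empty here)
    assert_eq!(parsed.models.len(), 2);
    assert_eq!(parsed.eval_result_filename, "eval-results.json");
    Ok(())
}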

View File

@@ -0,0 +1,58 @@
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use goose::message::Message;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::Mutex;
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct BenchAgentError {
pub message: String,
pub level: String, // ERROR, WARN, etc.
pub timestamp: DateTime<Utc>,
}
// avoid tying benchmarking to current session-impl.
#[async_trait]
pub trait BenchBaseSession: Send + Sync {
async fn headless(&mut self, message: String) -> anyhow::Result<()>;
fn session_file(&self) -> PathBuf;
fn message_history(&self) -> Vec<Message>;
fn get_total_token_usage(&self) -> anyhow::Result<Option<i32>>;
}
// Struct for managing agent session access; passed to evals for benchmarking.
pub struct BenchAgent {
session: Box<dyn BenchBaseSession>,
errors: Arc<Mutex<Vec<BenchAgentError>>>,
}
impl BenchAgent {
pub fn new(session: Box<dyn BenchBaseSession>) -> Self {
let errors = Arc::new(Mutex::new(Vec::new()));
Self { session, errors }
}
pub(crate) async fn prompt(&mut self, p: String) -> anyhow::Result<Vec<Message>> {
// Clear previous errors
{
let mut errors = self.errors.lock().await;
errors.clear();
}
self.session.headless(p).await?;
Ok(self.session.message_history())
}
pub async fn get_errors(&self) -> Vec<BenchAgentError> {
let errors = self.errors.lock().await;
errors.clone()
}
pub(crate) async fn get_token_usage(&self) -> Option<i32> {
self.session.get_total_token_usage().ok().flatten()
}
pub(crate) fn session_file(&self) -> PathBuf {
self.session.session_file()
}
}
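Because BenchAgent depends only on the BenchBaseSession trait, an eval can be exercised against any session-like object. A hypothetical test stub (not part of this commit; the name and file path are illustrative):
struct StubSession;

#[async_trait]
impl BenchBaseSession for StubSession {
    async fn headless(&mut self, _message: String) -> anyhow::Result<()> {
        Ok(()) // a real session would execute the prompt here
    }
    fn session_file(&self) -> PathBuf {
        PathBuf::from("stub-session.jsonl")
    }
    fn message_history(&self) -> Vec<Message> {
        Vec::new() // a real session returns its transcript
    }
    fn get_total_token_usage(&self) -> anyhow::Result<Option<i32>> {
        Ok(Some(0))
    }
}
// A BenchAgent can then be built with BenchAgent::new(Box::new(StubSession)).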

View File

@@ -1,5 +1,6 @@
use chrono::Local;
use include_dir::{include_dir, Dir};
use serde::{Deserialize, Serialize};
use std::fs;
use std::io;
use std::io::ErrorKind;
@@ -9,11 +10,12 @@ use std::process::Command;
pub static BUILTIN_EVAL_ASSETS: Dir = include_dir!("$CARGO_MANIFEST_DIR/src/assets");
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct BenchmarkWorkDir {
pub base_path: PathBuf,
run_dir: PathBuf,
cwd: PathBuf,
run_name: String,
pub run_dir: PathBuf,
pub cwd: PathBuf,
pub run_id: Option<String>,
}
impl Default for BenchmarkWorkDir {
@@ -21,26 +23,17 @@ impl Default for BenchmarkWorkDir {
Self::new("work_dir".to_string(), Vec::new())
}
}
impl BenchmarkWorkDir {
pub fn new(work_dir_name: String, include_dirs: Vec<PathBuf>) -> Self {
let run_dir = std::env::current_dir().unwrap().canonicalize().unwrap();
let base_path = PathBuf::from(format!("./benchmark-{}", work_dir_name));
let base_path = PathBuf::from(format!("./{}", work_dir_name));
fs::create_dir_all(&base_path).unwrap();
let current_time = Local::now().format("%H:%M:%S").to_string();
let current_date = Local::now().format("%Y-%m-%d").to_string();
let run_name = format!("{}-{}", &current_date, current_time);
let mut base_path = PathBuf::from(&base_path).canonicalize().unwrap();
base_path.push(run_name.clone());
fs::create_dir_all(&base_path).unwrap();
base_path.pop();
let base_path = PathBuf::from(&base_path).canonicalize().unwrap();
// abs paths from dir-strings
let dirs = include_dirs
.iter()
.map(|d| d.canonicalize().unwrap())
.collect::<Vec<_>>();
let dirs = Self::canonical_dirs(include_dirs);
// deep copy each dir
let _: Vec<_> = dirs
@@ -56,9 +49,32 @@ impl BenchmarkWorkDir {
base_path: base_path.clone(),
run_dir,
cwd: base_path.clone(),
run_name,
run_id: None,
}
}
pub fn init_experiment() {
// create experiment folder
let current_time = Local::now().format("%H:%M:%S").to_string();
let current_date = Local::now().format("%Y-%m-%d").to_string();
let exp_name = format!("{}-{}", &current_date, current_time);
let base_path = PathBuf::from(format!("./benchmark-{}", exp_name));
fs::create_dir_all(&base_path).unwrap();
std::env::set_current_dir(&base_path).unwrap();
}
pub fn canonical_dirs(include_dirs: Vec<PathBuf>) -> Vec<PathBuf> {
include_dirs
.iter()
.map(|d| {
let canon = d.canonicalize();
if canon.is_err() {
eprintln!("{:?} can't be canonicalized", d);
panic!();
}
canon.unwrap()
})
.collect::<Vec<_>>()
}
fn copy_auto_included_dirs(dest: &Path) {
let mut assets_dest = dest.to_path_buf();
assets_dest.push("assets");
@@ -73,10 +89,21 @@ impl BenchmarkWorkDir {
self.cwd = path;
Ok(self)
}
pub fn set_eval(&mut self, eval: &str) {
pub(crate) fn _run_dir(&mut self) -> Option<PathBuf> {
if let Some(run_id) = &self.run_id {
let mut eval_dir = self.base_path.clone();
eval_dir.push(run_id);
return Some(eval_dir);
}
None
}
pub fn set_eval(&mut self, eval: &str, run_id: String) {
self.run_id = Some(run_id.clone());
let eval = eval.replace(":", std::path::MAIN_SEPARATOR_STR);
let mut eval_dir = self.base_path.clone();
eval_dir.push(self.run_name.clone());
eval_dir.push(run_id);
eval_dir.push(eval);
self.cd(eval_dir.clone())
@@ -134,7 +161,7 @@ impl BenchmarkWorkDir {
Ok(PathBuf::from(path))
}
fn deep_copy<P, Q>(src: P, dst: Q, recursive: bool) -> io::Result<()>
pub(crate) fn deep_copy<P, Q>(src: P, dst: Q, recursive: bool) -> io::Result<()>
where
P: AsRef<Path>,
Q: AsRef<Path>,
@@ -162,6 +189,11 @@ impl BenchmarkWorkDir {
Err(io::Error::new(ErrorKind::Other, error_message))
}
}
pub fn save(&self) {
let work_dir = serde_json::to_string_pretty(&self).unwrap();
fs::write("work_dir.json", work_dir).expect("Unable to write work-dir as file");
}
}
impl Drop for BenchmarkWorkDir {

View File

@@ -1,4 +1,4 @@
use crate::eval_suites::BenchAgentError;
use crate::bench_session::BenchAgentError;
use chrono::Utc;
use once_cell::sync::Lazy;
use std::sync::Arc;

View File

@@ -1,8 +1,9 @@
// Create a new file called test.txt with the content 'Hello, World!
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -24,12 +25,12 @@ impl ComputerControllerScript {
impl Evaluation for ComputerControllerScript {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to list files
let (messages, perf_metrics) =
collect_baseline_metrics(&mut agent, "Make a beep sound".to_string()).await;
collect_baseline_metrics(agent, "Make a beep sound".to_string()).await;
// Convert HashMap to Vec for our metrics
let mut metrics = metrics_hashmap_to_vec(perf_metrics);
@@ -64,7 +65,7 @@ impl Evaluation for ComputerControllerScript {
metrics.push((
"Running os scripts".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
EvalMetricValue::Boolean(valid_tool_call),
));
Ok(metrics)
}

View File

@@ -1,8 +1,9 @@
// Create a new file called test.txt with the content 'Hello, World!
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -24,12 +25,12 @@ impl ComputerControllerWebScrape {
impl Evaluation for ComputerControllerWebScrape {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to list files
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"What are the headlines on hackernews? Organize the list into categories.".to_string(),
)
.await;
@@ -67,7 +68,7 @@ impl Evaluation for ComputerControllerWebScrape {
metrics.push((
"Retrieve and scrape web pages".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
EvalMetricValue::Boolean(valid_tool_call),
));
Ok(metrics)
}

View File

@@ -1,8 +1,9 @@
// Create a new file called test.txt with the content 'Hello, World!
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -24,12 +25,12 @@ impl DeveloperCreateFile {
impl Evaluation for DeveloperCreateFile {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to create and read
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Create a new file called test.txt in the current directory with the content 'Hello, World!'. Then read the contents of the new file to confirm.".to_string()
).await;
@@ -99,15 +100,15 @@ impl Evaluation for DeveloperCreateFile {
metrics.push((
"Create file".to_string(),
EvaluationMetric::Boolean(write_tool_call),
EvalMetricValue::Boolean(write_tool_call),
));
metrics.push((
"Read file".to_string(),
EvaluationMetric::Boolean(read_tool_call),
EvalMetricValue::Boolean(read_tool_call),
));
metrics.push((
"Complete create and read".to_string(),
EvaluationMetric::Boolean(write_tool_call && read_tool_call),
EvalMetricValue::Boolean(write_tool_call && read_tool_call),
));
Ok(metrics)
}

View File

@@ -1,6 +1,7 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -22,14 +23,12 @@ impl DeveloperListFiles {
impl Evaluation for DeveloperListFiles {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to list files
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
"list the files in the current directory".to_string(),
)
let (messages, perf_metrics) =
collect_baseline_metrics(agent, "list the files in the current directory".to_string())
.await;
// Convert HashMap to Vec for our metrics
@@ -68,7 +67,7 @@ impl Evaluation for DeveloperListFiles {
metrics.push((
"Using the shell command tool".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
EvalMetricValue::Boolean(valid_tool_call),
));
Ok(metrics)
}

View File

@@ -1,6 +1,7 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -22,15 +23,15 @@ impl SimpleRepoCloneTest {
impl Evaluation for SimpleRepoCloneTest {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
agent: &mut BenchAgent,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to clone the repo and add a test
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Clone the Git repository https://github.com/michaelneale/mcp-read-pdf to a temporary location. \
Then add a new test file that verifies the PDF reading functionality. The test should \
check if the PDF content can be read and processed correctly.".to_string()
check if the PDF content can be read and processed correctly.".to_string(),
).await;
// Convert HashMap to Vec for our metrics
@@ -177,23 +178,23 @@ impl Evaluation for SimpleRepoCloneTest {
// Add metrics
metrics.push((
"Git repo cloned".to_string(),
EvaluationMetric::Boolean(git_clone_executed),
EvalMetricValue::Boolean(git_clone_executed),
));
metrics.push((
"Repository explored".to_string(),
EvaluationMetric::Boolean(repo_explored),
EvalMetricValue::Boolean(repo_explored),
));
metrics.push((
"Test file added".to_string(),
EvaluationMetric::Boolean(test_added),
EvalMetricValue::Boolean(test_added),
));
metrics.push((
"Test executed".to_string(),
EvaluationMetric::Boolean(test_executed),
EvalMetricValue::Boolean(test_executed),
));
metrics.push((
"Complete task".to_string(),
EvaluationMetric::Boolean(git_clone_executed && test_added),
EvalMetricValue::Boolean(git_clone_executed && test_added),
));
Ok(metrics)

View File

@@ -1,6 +1,7 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -23,12 +24,12 @@ impl DeveloperImage {
impl Evaluation for DeveloperImage {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to list files
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Take a screenshot of the display 0 and describe what you see.".to_string(),
)
.await;
@@ -85,7 +86,7 @@ impl Evaluation for DeveloperImage {
// Both the tool call and response must be valid
metrics.push((
"Take a screenshot and upload images".to_string(),
EvaluationMetric::Boolean(valid_tool_call && valid_response),
EvalMetricValue::Boolean(valid_tool_call && valid_response),
));
Ok(metrics)
}

View File

@@ -1,6 +1,7 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -20,10 +21,10 @@ impl DeveloperSearchReplace {
impl Evaluation for DeveloperSearchReplace {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
let _target_file = match work_dir.fs_get("./assets/kubernetes_swagger.json".to_string()) {
agent: &mut BenchAgent,
run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
let _target_file = match run_loc.fs_get("./assets/kubernetes_swagger.json".to_string()) {
Ok(file) => file,
Err(_) => {
return Err(anyhow::anyhow!(
@@ -31,12 +32,12 @@ impl Evaluation for DeveloperSearchReplace {
))
}
};
let mut source_file = work_dir.base_path.clone();
let mut source_file = run_loc.base_path.clone();
source_file.push("assets/kubernetes_swagger.json");
// Send the prompt to modify the file
let (_messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Remove the io.k8s.api.admissionregistration.v1.ServiceReference definition block and replace with a new definition for io.k8s.api.admissionregistration.v1.FakeServiceReference. Update the fields in the definition as well to be consistent. Don't change the property names. Don't update any references to the old definition. Only modify the definition and it's description to 'FakeServiceReference simulates a reference to a fake service for testing purposes.'.The file to modify is kubernetes_swagger.json.".to_string()
).await;
@@ -49,7 +50,7 @@ impl Evaluation for DeveloperSearchReplace {
.join("kubernetes_swagger.json");
// Read the expected patch file from the assets directory
let patch_file_path = work_dir.base_path.join("assets").join("kubernetes.patch");
let patch_file_path = run_loc.base_path.join("assets").join("kubernetes.patch");
if !patch_file_path.exists() {
return Err(anyhow::anyhow!("Could not find patch file"));
}
@@ -88,7 +89,7 @@ impl Evaluation for DeveloperSearchReplace {
metrics.push((
"Changes match expected patch".to_string(),
EvaluationMetric::Boolean(changes_match),
EvalMetricValue::Boolean(changes_match),
));
Ok(metrics)

View File

@@ -1,5 +1,6 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::eval_suites::{EvalMetricValue, Evaluation, ExtensionRequirements};
use crate::register_evaluation;
use async_trait::async_trait;
// use std::fs;
@@ -16,20 +17,17 @@ impl ExampleEval {
impl Evaluation for ExampleEval {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
println!("ExampleEval - run");
let mut metrics = Vec::new();
let _ = agent.prompt("What can you do?".to_string()).await;
metrics.push((
"example_metric".to_string(),
EvaluationMetric::Boolean(true),
));
metrics.push(("example_metric".to_string(), EvalMetricValue::Boolean(true)));
metrics.push(("example_count".to_string(), EvaluationMetric::Integer(42)));
metrics.push(("example_count".to_string(), EvalMetricValue::Integer(42)));
Ok(metrics)
}

View File

@@ -1,8 +1,9 @@
// Create a new file called test.txt with the content 'Hello, World!
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -24,12 +25,12 @@ impl MemoryRememberMemory {
impl Evaluation for MemoryRememberMemory {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to list files
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Save this fact: The capital of France is Paris.".to_string(),
)
.await;
@@ -69,7 +70,7 @@ impl Evaluation for MemoryRememberMemory {
metrics.push((
"Saving facts".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
EvalMetricValue::Boolean(valid_tool_call),
));
Ok(metrics)
}

View File

@@ -1,27 +1,24 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use anyhow::Result;
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use goose::message::Message;
use serde::Serialize;
use serde::{Deserialize, Serialize};
pub type Model = (String, String);
pub type Extension = String;
#[derive(Debug, Serialize, Clone)]
pub struct BenchAgentError {
pub message: String,
pub level: String, // ERROR, WARN, etc.
pub timestamp: DateTime<Utc>,
}
#[derive(Debug, Serialize)]
pub enum EvaluationMetric {
#[derive(Debug, Deserialize, Serialize)]
pub enum EvalMetricValue {
Integer(i64),
Float(f64),
String(String),
Boolean(bool),
}
#[derive(Debug, Serialize)]
pub struct EvalMetric {
pub name: String,
pub value: EvalMetricValue,
}
#[derive(Debug, Default)]
pub struct ExtensionRequirements {
@@ -30,24 +27,13 @@ pub struct ExtensionRequirements {
pub remote: Vec<String>,
}
#[async_trait]
pub trait BenchAgent: Send + Sync {
async fn prompt(&mut self, p: String) -> Result<Vec<Message>>;
// Make get_errors async
async fn get_errors(&self) -> Vec<BenchAgentError>;
// Get token usage information
async fn get_token_usage(&self) -> Option<i32>;
}
#[async_trait]
pub trait Evaluation: Send + Sync {
async fn run(
&self,
agent: Box<dyn BenchAgent>,
agent: &mut BenchAgent,
run_loc: &mut BenchmarkWorkDir,
) -> Result<Vec<(String, EvaluationMetric)>>;
) -> Result<Vec<(String, EvalMetricValue)>>;
fn name(&self) -> &str;

View File

@@ -1,13 +1,14 @@
use crate::eval_suites::{BenchAgent, EvaluationMetric};
use crate::bench_session::BenchAgent;
use crate::eval_suites::EvalMetricValue;
use goose::message::{Message, MessageContent};
use std::collections::HashMap;
use std::time::Instant;
/// Collect baseline metrics including execution time, tool usage, and token count
pub async fn collect_baseline_metrics(
agent: &mut Box<dyn BenchAgent>,
agent: &mut BenchAgent,
prompt: String,
) -> (Vec<Message>, HashMap<String, EvaluationMetric>) {
) -> (Vec<Message>, HashMap<String, EvalMetricValue>) {
// Initialize metrics map
let mut metrics = HashMap::new();
@@ -20,7 +21,7 @@ pub async fn collect_baseline_metrics(
Err(e) => {
metrics.insert(
"prompt_error".to_string(),
EvaluationMetric::String(format!("Error: {}", e)),
EvalMetricValue::String(format!("Error: {}", e)),
);
Vec::new()
}
@@ -30,21 +31,21 @@ pub async fn collect_baseline_metrics(
let execution_time = start_time.elapsed();
metrics.insert(
"prompt_execution_time_seconds".to_string(),
EvaluationMetric::Float(execution_time.as_secs_f64()),
EvalMetricValue::Float(execution_time.as_secs_f64()),
);
// Count tool calls
let (total_tool_calls, tool_calls_by_name) = count_tool_calls(&messages);
metrics.insert(
"total_tool_calls".to_string(),
EvaluationMetric::Integer(total_tool_calls),
EvalMetricValue::Integer(total_tool_calls),
);
// Add tool calls by name metrics
for (tool_name, count) in tool_calls_by_name {
metrics.insert(
format!("tool_calls_{}", tool_name),
EvaluationMetric::Integer(count),
EvalMetricValue::Integer(count),
);
}
@@ -52,7 +53,7 @@ pub async fn collect_baseline_metrics(
if let Some(token_count) = agent.get_token_usage().await {
metrics.insert(
"total_tokens".to_string(),
EvaluationMetric::Integer(token_count as i64),
EvalMetricValue::Integer(token_count as i64),
);
}
@@ -82,8 +83,8 @@ fn count_tool_calls(messages: &[Message]) -> (i64, HashMap<String, i64>) {
/// Convert HashMap of metrics to Vec
pub fn metrics_hashmap_to_vec(
metrics: HashMap<String, EvaluationMetric>,
) -> Vec<(String, EvaluationMetric)> {
metrics: HashMap<String, EvalMetricValue>,
) -> Vec<(String, EvalMetricValue)> {
metrics.into_iter().collect()
}

View File

@@ -1,7 +1,8 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, write_response_to_file,
BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements,
EvalMetricValue, Evaluation, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
@@ -23,20 +24,20 @@ impl BlogSummary {
impl Evaluation for BlogSummary {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
println!("BlogSummary - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (response, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"What are the top 5 most counterintuitive insights from this blog post? Format your response in Markdown with 5 numbered points (1. 2. 3. 4. 5.) https://huyenchip.com/2025/01/07/agents.html".to_string()
).await;
// Write response to file and get the text content
let response_text =
match write_response_to_file(&response, work_dir, "blog_summary_output.txt") {
match write_response_to_file(&response, run_loc, "blog_summary_output.txt") {
Ok(text) => text,
Err(e) => {
println!("Warning: Failed to write blog summary output: {}", e);
@@ -54,14 +55,14 @@ impl Evaluation for BlogSummary {
let has_markdown_list = self.check_markdown_numbered_list(&response_text);
metrics.push((
"valid_markdown_format".to_string(),
EvaluationMetric::Boolean(has_markdown_list),
EvalMetricValue::Boolean(has_markdown_list),
));
// Check if the fetch tool was used
let used_fetch_tool = crate::eval_suites::used_tool(&response, "fetch");
metrics.push((
"used_fetch_tool".to_string(),
EvaluationMetric::Boolean(used_fetch_tool),
EvalMetricValue::Boolean(used_fetch_tool),
));
// Copy the session file to the current working directory

View File

@@ -1,7 +1,8 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, BenchAgent, Evaluation,
EvaluationMetric, ExtensionRequirements,
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, EvalMetricValue,
Evaluation, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
@@ -31,14 +32,14 @@ impl FlappyBird {
impl Evaluation for FlappyBird {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
println!("FlappyBird - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Create a Flappy Bird game in Python. Structure the code with a main function and use the if __name__ == '__main__': idiom. You must use pygame. The background color should be a light blue color. Pressing SPACE multiple times will accelerate the bird. The bird's shape should be a red circle. Place on the bottom some land colored as dark yellow chosen. Make a score shown on the top right side. Increment if you pass pipes and don't hit them. Make randomly spaced dark green pipes with enough space. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again. When trying to run the game, make sure to use pyenv and create the environment in the current working directory. The final game should be written to a file named flappy_bird.py. Remember to use your tools if applicable.".to_string()
).await;
@@ -80,17 +81,17 @@ impl Evaluation for FlappyBird {
metrics.push((
"used_write_tool".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
EvalMetricValue::Boolean(valid_tool_call),
));
// If tool was used correctly, check the actual file content
if valid_tool_call {
if let Ok(file_path) = work_dir.fs_get("flappy_bird.py".to_string()) {
if let Ok(file_path) = run_loc.fs_get("flappy_bird.py".to_string()) {
if let Ok(content) = fs::read_to_string(file_path) {
let valid_implementation = self.check_python_implementation(&content);
metrics.push((
"valid_implementation".to_string(),
EvaluationMetric::Boolean(valid_implementation),
EvalMetricValue::Boolean(valid_implementation),
));
}
}

View File

@@ -1,7 +1,8 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, BenchAgent, Evaluation,
EvaluationMetric, ExtensionRequirements,
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, EvalMetricValue,
Evaluation, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
@@ -21,14 +22,14 @@ impl GooseWiki {
impl Evaluation for GooseWiki {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
println!("GooseWiki - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Create a Wikipedia-style web page about Goose (Block's AI agent) in a new index.html file. The page should be a complete, well-structured HTML document with proper head and body sections. Use heading tags (h1, h2, h3) to organize the content into clear sections. Include comprehensive information about Goose organized in a way similar to how Wikipedia presents technical topics. Remember to use your tools if applicable.".to_string()
).await;
@@ -71,7 +72,7 @@ impl Evaluation for GooseWiki {
metrics.push((
"created_valid_html".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
EvalMetricValue::Boolean(valid_tool_call),
));
// Copy the session file to the current working directory

View File

@@ -1,7 +1,8 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, write_response_to_file,
BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements,
EvalMetricValue, Evaluation, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
@@ -30,14 +31,14 @@ impl RestaurantResearch {
impl Evaluation for RestaurantResearch {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
println!("RestaurantResearch - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (response, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Search the internet for and provide a current, detailed list of the best Sichuanese restaurants specifically in the East Village neighborhood of NYC. Format your response in Markdown using bullet points (either - or *) for each restaurant. For each restaurant include:
- Restaurant name and what they're known for
- Signature dishes
@@ -50,7 +51,7 @@ Present the information in order of significance or quality. Focus specifically
// Write response to file and get the text content
let response_text =
match write_response_to_file(&response, work_dir, "restaurant_research_output.txt") {
match write_response_to_file(&response, run_loc, "restaurant_research_output.txt") {
Ok(text) => text,
Err(e) => {
println!("Warning: Failed to write restaurant research output: {}", e);
@@ -70,18 +71,18 @@ Present the information in order of significance or quality. Focus specifically
metrics.push((
"valid_markdown_format".to_string(),
EvaluationMetric::Boolean(has_markdown_bullets),
EvalMetricValue::Boolean(has_markdown_bullets),
));
metrics.push((
"bullet_point_count".to_string(),
EvaluationMetric::Integer(bullet_count),
EvalMetricValue::Integer(bullet_count),
));
// Check if the fetch tool was used
let used_fetch_tool = crate::eval_suites::used_tool(&response, "fetch");
metrics.push((
"used_fetch_tool".to_string(),
EvaluationMetric::Boolean(used_fetch_tool),
EvalMetricValue::Boolean(used_fetch_tool),
));
// Copy the session file to the current working directory

View File

@@ -1,7 +1,8 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, BenchAgent, Evaluation,
EvaluationMetric, ExtensionRequirements,
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, EvalMetricValue,
Evaluation, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
@@ -30,13 +31,13 @@ impl SquirrelCensus {
impl Evaluation for SquirrelCensus {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
println!("SquirrelCensus - run");
// Get the path to the squirrel data file
let squirrel_data_path = match work_dir.fs_get("./assets/squirrel-data.csv".to_string()) {
let squirrel_data_path = match run_loc.fs_get("./assets/squirrel-data.csv".to_string()) {
Ok(file) => file,
Err(_) => return Err(anyhow::anyhow!("Could not find squirrel-data.csv file")),
};
@@ -45,7 +46,7 @@ impl Evaluation for SquirrelCensus {
// Collect baseline metrics (execution time, token usage, tool calls)
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
format!(
"Create a Python script called analyze_squirrels.py that analyzes the CSV file at {}. Do not ask for any clarification or further instructions - proceed with the implementation as specified below.
@@ -141,15 +142,15 @@ After writing the script, run it using python3 and show the results. Do not ask
metrics.push((
"wrote_script".to_string(),
EvaluationMetric::Boolean(wrote_script),
EvalMetricValue::Boolean(wrote_script),
));
metrics.push((
"ran_script".to_string(),
EvaluationMetric::Boolean(ran_script),
EvalMetricValue::Boolean(ran_script),
));
metrics.push((
"correct_results".to_string(),
EvaluationMetric::Boolean(correct_results),
EvalMetricValue::Boolean(correct_results),
));
// Copy the session file to the current working directory

View File

@@ -1,4 +1,8 @@
pub mod bench_config;
pub mod bench_session;
pub mod bench_work_dir;
pub mod error_capture;
pub mod eval_suites;
pub mod reporting;
pub mod runners;
pub mod utilities;

View File

@@ -1,25 +1,26 @@
use crate::eval_suites::{BenchAgentError, EvaluationMetric};
use crate::bench_session::BenchAgentError;
use crate::eval_suites::EvalMetricValue;
use chrono::Local;
use serde::Serialize;
use serde::{Deserialize, Serialize};
use std::fmt;
/// Represents a single evaluation result
#[derive(Default, Serialize)]
#[derive(Default, Deserialize, Serialize)]
pub struct EvaluationResult {
pub name: String,
pub metrics: Vec<(String, EvaluationMetric)>,
pub metrics: Vec<(String, EvalMetricValue)>,
pub errors: Vec<BenchAgentError>,
}
/// Represents results for an entire suite
#[derive(Default, Serialize)]
#[derive(Default, Deserialize, Serialize)]
pub struct SuiteResult {
pub name: String,
pub evaluations: Vec<EvaluationResult>,
}
/// Contains all benchmark results and metadata
#[derive(Default, Serialize)]
#[derive(Default, Deserialize, Serialize)]
pub struct BenchmarkResults {
pub provider: String,
pub start_time: String,
@@ -35,7 +36,7 @@ impl EvaluationResult {
}
}
pub fn add_metric(&mut self, name: String, metric: EvaluationMetric) {
pub fn add_metric(&mut self, name: String, metric: EvalMetricValue) {
self.metrics.push((name, metric));
}
@@ -97,13 +98,13 @@ impl BenchmarkResults {
}
}
impl fmt::Display for EvaluationMetric {
impl fmt::Display for EvalMetricValue {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
EvaluationMetric::Integer(i) => write!(f, "{}", i),
EvaluationMetric::Float(fl) => write!(f, "{:.2}", fl),
EvaluationMetric::String(s) => write!(f, "{}", s),
EvaluationMetric::Boolean(b) => write!(f, "{}", b),
EvalMetricValue::Integer(i) => write!(f, "{}", i),
EvalMetricValue::Float(fl) => write!(f, "{:.2}", fl),
EvalMetricValue::String(s) => write!(f, "{}", s),
EvalMetricValue::Boolean(b) => write!(f, "{}", b),
}
}
}

View File

@@ -0,0 +1,75 @@
use crate::bench_config::{BenchModel, BenchRunConfig};
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::EvaluationSuite;
use crate::runners::model_runner::ModelRunner;
use crate::utilities::{await_process_exits, parallel_bench_cmd};
use std::path::PathBuf;
#[derive(Clone)]
pub struct BenchRunner {
config: BenchRunConfig,
}
impl BenchRunner {
pub fn new(config: PathBuf) -> anyhow::Result<BenchRunner> {
let config = BenchRunConfig::from(config)?;
BenchmarkWorkDir::init_experiment();
config.save("config.cfg".to_string());
Ok(BenchRunner { config })
}
pub fn from(config: String) -> anyhow::Result<BenchRunner> {
let config = BenchRunConfig::from_string(config)?;
Ok(BenchRunner { config })
}
pub fn run(&mut self) -> anyhow::Result<()> {
// split models that must run serial from those that can be run in parallel
let (parallel_models, serial_models): &(Vec<BenchModel>, Vec<BenchModel>) = &self
.config
.models
.clone()
.into_iter()
.partition(|model| model.parallel_safe);
// exec parallel models
let mut parallel_models_handle = Vec::new();
for model in parallel_models {
self.config.models = vec![model.clone()];
let cfg = self.config.to_string()?;
let model_handle = parallel_bench_cmd("eval-model".to_string(), cfg, Vec::new());
parallel_models_handle.push(model_handle);
}
// exec serial models
for model in serial_models {
self.config.models = vec![model.clone()];
ModelRunner::from(self.config.to_string()?)?.run()?;
}
await_process_exits(&mut parallel_models_handle, Vec::new());
Ok(())
}
pub fn list_selectors(_config: Option<PathBuf>) -> anyhow::Result<()> {
let selector_eval_counts = EvaluationSuite::available_selectors();
let mut keys: Vec<_> = selector_eval_counts.keys().collect();
keys.sort();
let max_key_len = keys.iter().map(|k| k.len()).max().unwrap_or(0);
println!(
"selector {} => Eval Count",
" ".repeat(max_key_len - "selector".len())
);
println!("{}", "-".repeat(max_key_len + 6));
for selector in keys {
println!(
"{} {} => {}",
selector,
" ".repeat(max_key_len - selector.len()),
selector_eval_counts.get(selector).unwrap()
);
}
Ok(())
}
}
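Driving a full run from a saved config takes two calls. A sketch (not part of this commit; the filename is illustrative):
fn run_from_config() -> anyhow::Result<()> {
    // new() parses the config, calls BenchmarkWorkDir::init_experiment to create and
    // cd into a benchmark-<date>-<time> directory, and writes a config.cfg copy there.
    let mut runner = BenchRunner::new(PathBuf::from("benchconf.json"))?;
    // Parallel-safe models are launched as eval-model child processes; the rest run serially.
    runner.run()
}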

View File

@@ -0,0 +1,120 @@
use crate::bench_config::{BenchEval, BenchModel, BenchRunConfig};
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{EvaluationSuite, ExtensionRequirements};
use crate::reporting::EvaluationResult;
use crate::utilities::await_process_exits;
use std::env;
use std::fs;
use std::future::Future;
use std::path::PathBuf;
use std::process::Command;
#[derive(Clone)]
pub struct EvalRunner {
config: BenchRunConfig,
}
impl EvalRunner {
pub fn from(config: String) -> anyhow::Result<EvalRunner> {
let config = BenchRunConfig::from_string(config)?;
Ok(EvalRunner { config })
}
fn create_work_dir(&self, config: &BenchRunConfig) -> anyhow::Result<BenchmarkWorkDir> {
let goose_model = config.models.first().unwrap();
let model_name = goose_model.name.clone();
let provider_name = goose_model.provider.clone();
// include a shim component in the work-dir name only if the shim is configured to be used
let work_dir_name_shim = {
let mut shim_name = "".to_string();
if let Some(shim_opt) = &goose_model.tool_shim {
if shim_opt.use_tool_shim {
let shim_model = if let Some(shim_model) = &shim_opt.tool_shim_model {
shim_model.clone()
} else {
"default".to_string()
};
shim_name = format!("-{}-shim-model", shim_model);
}
}
shim_name
};
let include_dir = config.include_dirs.clone();
let work_dir_name = format!("{}-{}{}", provider_name, model_name, work_dir_name_shim);
let work_dir = BenchmarkWorkDir::new(work_dir_name, include_dir);
Ok(work_dir)
}
pub async fn run<F, Fut>(&mut self, agent_generator: F) -> anyhow::Result<()>
where
F: Fn(ExtensionRequirements, String) -> Fut,
Fut: Future<Output = BenchAgent> + Send,
{
let mut work_dir = self.create_work_dir(&self.config)?;
let bench_eval = self.config.evals.first().unwrap();
let run_id = &self
.config
.run_id
.clone()
.unwrap_or_else(|| "run-0".to_string());
let run_id = format!("run-{}", run_id.clone());
// create entire dir subtree for eval and cd into dir for running eval
work_dir.set_eval(&bench_eval.selector, run_id);
if let Some(eval) = EvaluationSuite::from(&bench_eval.selector) {
let session_id = bench_eval.selector.clone();
let mut agent = agent_generator(eval.required_extensions(), session_id).await;
let mut result = EvaluationResult::new(eval.name().to_string());
if let Ok(metrics) = eval.run(&mut agent, &mut work_dir).await {
for (name, metric) in metrics {
result.add_metric(name, metric);
}
// Add any errors that occurred
for error in agent.get_errors().await {
result.add_error(error);
}
}
let eval_results = serde_json::to_string_pretty(&result)?;
let eval_results_file = env::current_dir()?.join(&self.config.eval_result_filename);
fs::write(&eval_results_file, &eval_results)?;
self.config.save("config.cfg".to_string());
work_dir.save();
// handle running post-process cmd if configured
if let Some(cmd) = &bench_eval.post_process_cmd {
let handle = Command::new(cmd).arg(&eval_results_file).spawn()?;
await_process_exits(&mut [handle], Vec::new());
}
// copy session file into eval-dir
let here = env::current_dir()?.canonicalize()?;
BenchmarkWorkDir::deep_copy(agent.session_file().as_path(), here.as_path(), false)?;
}
Ok(())
}
pub fn path_for_eval(model: &BenchModel, eval: &BenchEval, run_id: String) -> PathBuf {
let provider = model.provider.clone();
let model = model.name.clone();
let eval_path = &eval.selector.replace(":", std::path::MAIN_SEPARATOR_STR);
let eval_results_location = format!(
"{}-{}/run-{}{}{}",
&provider,
model,
run_id,
std::path::MAIN_SEPARATOR_STR,
eval_path
);
PathBuf::from(eval_results_location.clone())
}
}
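path_for_eval mirrors the layout that create_work_dir and set_eval produce, so results written by exec-eval child processes can be located afterwards. A sketch (not part of this commit), using the default databricks/goose model and the core selector:
fn example_eval_path() -> PathBuf {
    let model = BenchModel {
        provider: "databricks".to_string(),
        name: "goose".to_string(),
        parallel_safe: true,
        tool_shim: None,
    };
    let eval = BenchEval {
        selector: "core".to_string(),
        post_process_cmd: None,
        parallel_safe: true,
    };
    // With Unix separators this evaluates to "databricks-goose/run-0/core".
    EvalRunner::path_for_eval(&model, &eval, "0".to_string())
}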

View File

@@ -0,0 +1,3 @@
pub mod bench_runner;
pub mod eval_runner;
pub mod model_runner;

View File

@@ -0,0 +1,236 @@
use crate::bench_config::{BenchEval, BenchModel, BenchRunConfig};
use crate::eval_suites::EvaluationSuite;
use crate::reporting::{BenchmarkResults, SuiteResult};
use crate::runners::eval_runner::EvalRunner;
use crate::utilities::{await_process_exits, parallel_bench_cmd, union_hashmaps};
use std::collections::HashMap;
use std::fs::read_to_string;
use std::io::{self, BufRead};
use std::path::PathBuf;
use std::process::Child;
use std::thread;
#[derive(Clone)]
pub struct ModelRunner {
config: BenchRunConfig,
}
impl ModelRunner {
pub fn from(config: String) -> anyhow::Result<ModelRunner> {
let config = BenchRunConfig::from_string(config)?;
Ok(ModelRunner { config })
}
pub fn run(&self) -> anyhow::Result<()> {
let model = self.config.models.first().unwrap();
let suites = self.collect_evals_for_run();
let mut handles = vec![];
for i in 0..self.config.repeat.unwrap_or(1) {
let mut self_copy = self.clone();
let model_clone = model.clone();
let suites_clone = suites.clone();
// create thread to handle launching parallel processes to run model's evals in parallel
let handle = thread::spawn(move || {
self_copy.run_benchmark(&model_clone, suites_clone, i.to_string())
});
handles.push(handle);
}
await_process_exits(&mut Vec::new(), handles);
let mut all_runs_results: Vec<BenchmarkResults> = Vec::new();
for i in 0..self.config.repeat.unwrap_or(1) {
let run_results =
self.collect_run_results(model.clone(), suites.clone(), i.to_string())?;
all_runs_results.push(run_results);
}
// write summary file
Ok(())
}
fn load_env_file(&self, path: &PathBuf) -> anyhow::Result<Vec<(String, String)>> {
let file = std::fs::File::open(path)?;
let reader = io::BufReader::new(file);
let mut env_vars = Vec::new();
for line in reader.lines() {
let line = line?;
// Skip empty lines and comments
if line.trim().is_empty() || line.trim_start().starts_with('#') {
continue;
}
// Split on first '=' only
if let Some((key, value)) = line.split_once('=') {
let key = key.trim().to_string();
// Remove quotes if present
let value = value
.trim()
.trim_matches('"')
.trim_matches('\'')
.to_string();
env_vars.push((key, value));
}
}
Ok(env_vars)
}
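// Illustrative env-file contents accepted by load_env_file (variable names are
// hypothetical, not part of this commit):
//
//   # credentials forwarded to the spawned exec-eval processes
//   MY_PROVIDER_API_KEY="secret-value"
//   MY_PROVIDER_HOST=https://example.invalid
//
// Blank lines and '#' comments are skipped; each remaining line is split on the
// first '=' and surrounding single or double quotes are trimmed from the value.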
fn run_benchmark(
&mut self,
model: &BenchModel,
suites: HashMap<String, Vec<BenchEval>>,
run_id: String,
) -> anyhow::Result<()> {
let mut results_handles = HashMap::<String, Vec<Child>>::new();
// Load environment variables from file if specified
let mut envs = self.toolshim_envs();
if let Some(env_file) = &self.config.env_file {
let env_vars = self.load_env_file(env_file)?;
envs.extend(env_vars);
}
envs.push(("GOOSE_MODEL".to_string(), model.clone().name));
envs.push(("GOOSE_PROVIDER".to_string(), model.clone().provider));
// Only run in parallel if the model is parallel_safe
let run_parallel = model.parallel_safe;
for (suite, evals) in suites.iter() {
results_handles.insert((*suite).clone(), Vec::new());
// Group evaluations by parallel_safe
let mut parallel_evals = Vec::new();
let mut sequential_evals = Vec::new();
for eval in evals {
if eval.parallel_safe && run_parallel {
parallel_evals.push(eval);
} else {
sequential_evals.push(eval);
}
}
// Run parallel-safe evaluations in parallel
if !parallel_evals.is_empty() {
for eval_selector in &parallel_evals {
self.config.run_id = Some(run_id.clone());
self.config.evals = vec![(*eval_selector).clone()];
let cfg = self.config.to_string()?;
let handle = parallel_bench_cmd("exec-eval".to_string(), cfg, envs.clone());
results_handles.get_mut(suite).unwrap().push(handle);
}
}
// Run non-parallel-safe evaluations sequentially
for eval_selector in &sequential_evals {
self.config.run_id = Some(run_id.clone());
self.config.evals = vec![(*eval_selector).clone()];
let cfg = self.config.to_string()?;
let handle = parallel_bench_cmd("exec-eval".to_string(), cfg, envs.clone());
// Wait for this process to complete before starting the next one
let mut child_procs = vec![handle];
await_process_exits(&mut child_procs, Vec::new());
}
}
// Wait for any remaining parallel processes to complete
for (_, child_procs) in results_handles.iter_mut() {
await_process_exits(child_procs, Vec::new());
}
Ok(())
}
fn collect_run_results(
&self,
model: BenchModel,
suites: HashMap<String, Vec<BenchEval>>,
run_id: String,
) -> anyhow::Result<BenchmarkResults> {
let mut results = BenchmarkResults::new(model.provider.clone());
let mut summary_path: Option<PathBuf> = None;
for (suite, evals) in suites.iter() {
let mut suite_result = SuiteResult::new(suite.clone());
for eval_selector in evals {
let mut eval_path =
EvalRunner::path_for_eval(&model, eval_selector, run_id.clone());
eval_path.push(self.config.eval_result_filename.clone());
let eval_result = serde_json::from_str(&read_to_string(&eval_path)?)?;
suite_result.add_evaluation(eval_result);
// use current eval to determine where the summary should be written
if summary_path.is_none() {
let mut result = PathBuf::new();
let mut iter = eval_path.components();
if let Some(first) = iter.next() {
result.push(first);
if let Some(second) = iter.next() {
result.push(second);
}
}
summary_path = Some(result);
}
}
results.add_suite(suite_result);
}
let mut run_summary = PathBuf::new();
run_summary.push(summary_path.clone().unwrap());
run_summary.push(&self.config.run_summary_filename);
let output_str = serde_json::to_string_pretty(&results)?;
std::fs::write(run_summary, &output_str)?;
Ok(results)
}
fn collect_evals_for_run(&self) -> HashMap<String, Vec<BenchEval>> {
// convert the suites map {suite_name => [eval_selector_str]} into {suite_name => [BenchEval]}
let suites = self
.config
.evals
.iter()
.map(|eval| {
EvaluationSuite::select(vec![eval.clone().selector])
.iter()
.map(|(suite, evals)| {
let bench_evals = evals
.iter()
.map(|suite_eval| {
let mut updated_eval = eval.clone();
updated_eval.selector = (*suite_eval).to_string();
updated_eval
})
.collect::<Vec<_>>();
(suite.clone(), bench_evals)
})
.collect()
})
.collect();
union_hashmaps(suites)
}
fn toolshim_envs(&self) -> Vec<(String, String)> {
// read tool-shim preference from config, set respective env vars accordingly
let model = self.config.models.first().unwrap();
let mut shim_envs: Vec<(String, String)> = Vec::new();
if let Some(shim_opt) = &model.tool_shim {
if shim_opt.use_tool_shim {
shim_envs.push(("GOOSE_TOOLSHIM".to_string(), "true".to_string()));
if let Some(shim_model) = &shim_opt.tool_shim_model {
shim_envs.push((
"GOOSE_TOOLSHIM_OLLAMA_MODEL".to_string(),
shim_model.clone(),
));
}
}
}
shim_envs
}
}

View File

@@ -0,0 +1,52 @@
use std::collections::HashMap;
use std::env;
use std::process::{Child, Command};
use std::thread::JoinHandle;
pub fn union_hashmaps<K, V>(maps: Vec<HashMap<K, V>>) -> HashMap<K, V>
where
K: Eq + std::hash::Hash,
V: Clone,
{
// We can use the fold method to accumulate all maps into one
maps.into_iter().fold(HashMap::new(), |mut result, map| {
// For each map in the vector, extend the result with its entries
result.extend(map);
result
})
}
pub fn await_process_exits(
child_processes: &mut [Child],
handles: Vec<JoinHandle<anyhow::Result<()>>>,
) {
for child in child_processes.iter_mut() {
match child.wait() {
Ok(status) => println!("Child exited with status: {}", status),
Err(e) => println!("Error waiting for child: {}", e),
}
}
for handle in handles {
match handle.join() {
Ok(_res) => (),
Err(e) => {
// Handle thread panic
println!("Thread panicked: {:?}", e);
}
}
}
}
pub fn parallel_bench_cmd(bench_cmd: String, config: String, envs: Vec<(String, String)>) -> Child {
let current_exe = env::current_exe().expect("Failed to get current executable path");
let mut cmd = Command::new(current_exe);
cmd.arg("bench").arg(bench_cmd).arg("--config").arg(config);
for (key, value) in envs.into_iter() {
cmd.env(key, value);
}
cmd.spawn().expect("Failed to spawn child process")
}
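A small sketch of union_hashmaps semantics (not part of this commit): because extend overwrites existing keys, entries from later maps win on duplicates.
fn union_example() {
    let a = HashMap::from([("core".to_string(), 1)]);
    let b = HashMap::from([("core".to_string(), 2), ("vibes".to_string(), 3)]);
    let merged = union_hashmaps(vec![a, b]);
    assert_eq!(merged.get("core"), Some(&2)); // later map wins
    assert_eq!(merged.len(), 2);
}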

View File

@@ -4,7 +4,7 @@ use clap::{Args, Parser, Subcommand};
use goose::config::Config;
use crate::commands::agent_version::AgentCommand;
use crate::commands::bench::{list_selectors, run_benchmark};
use crate::commands::bench::agent_generator;
use crate::commands::configure::handle_configure;
use crate::commands::info::handle_info;
use crate::commands::mcp::run_server;
@@ -12,6 +12,10 @@ use crate::commands::session::handle_session_list;
use crate::logging::setup_logging;
use crate::session;
use crate::session::build_session;
use goose_bench::bench_config::BenchRunConfig;
use goose_bench::runners::bench_runner::BenchRunner;
use goose_bench::runners::eval_runner::EvalRunner;
use goose_bench::runners::model_runner::ModelRunner;
use std::io::Read;
use std::path::PathBuf;
@@ -71,6 +75,47 @@ enum SessionCommand {
},
}
#[derive(Subcommand)]
pub enum BenchCommand {
#[command(name = "init-config", about = "Create a new starter-config")]
InitConfig {
#[arg(short, long, help = "filename with extension for generated config")]
name: String,
},
#[command(about = "Run all benchmarks from a config")]
Run {
#[arg(
short,
long,
help = "A config file generated by the config-init command"
)]
config: PathBuf,
},
#[command(about = "List all available selectors")]
Selectors {
#[arg(
short,
long,
help = "A config file generated by the config-init command"
)]
config: Option<PathBuf>,
},
#[command(name = "eval-model", about = "Run an eval of model")]
EvalModel {
#[arg(short, long, help = "A serialized config file for the model only.")]
config: String,
},
#[command(name = "exec-eval", about = "run a single eval")]
ExecEval {
#[arg(short, long, help = "A serialized config file for the eval only.")]
config: String,
},
}
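// Typical invocations of the new subcommands (not in this commit; illustrative):
//
//   goose bench init-config --name benchconf.json
//   goose bench selectors
//   goose bench run --config benchconf.json
//
// eval-model and exec-eval take a serialized config string and are normally
// spawned internally by the runners rather than invoked by hand.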
#[derive(Subcommand)]
enum Command {
/// Configure Goose settings
@@ -255,63 +300,8 @@ enum Command {
},
Bench {
#[arg(
short = 's',
long = "selectors",
value_name = "EVALUATIONS_SELECTOR",
help = "Run this list of bench-suites.",
long_help = "Specify a comma-separated list of evaluation-suite names to be run.",
value_delimiter = ','
)]
selectors: Vec<String>,
#[arg(
short = 'i',
long = "include-dir",
value_name = "DIR_NAME",
action = clap::ArgAction::Append,
long_help = "Make one or more dirs available to all bench suites. Specify either a single dir-name, a comma-separated list of dir-names, or use this multiple instances of this flag to specify multiple dirs.",
value_delimiter = ','
)]
include_dirs: Vec<PathBuf>,
#[arg(
long = "repeat",
value_name = "QUANTITY",
long_help = "Number of times to repeat the benchmark run.",
default_value = "1"
)]
repeat: usize,
#[arg(
long = "list",
value_name = "LIST",
help = "List all selectors and the number of evaluations they select."
)]
list: bool,
#[arg(
long = "output",
short = 'o',
value_name = "FILE",
help = "Save benchmark results to a file"
)]
output: Option<PathBuf>,
#[arg(
long = "format",
value_name = "FORMAT",
help = "Output format (text, json)",
default_value = "text"
)]
format: String,
#[arg(
long = "summary",
help = "Show only summary results",
action = clap::ArgAction::SetTrue
)]
summary: bool,
#[command(subcommand)]
cmd: BenchCommand,
},
}
@@ -346,10 +336,10 @@ pub async fn cli() -> Result<()> {
remote_extension,
builtin,
}) => {
match command {
return match command {
Some(SessionCommand::List { verbose, format }) => {
handle_session_list(verbose, format)?;
return Ok(());
Ok(())
}
None => {
// Run session command by default
@@ -367,9 +357,9 @@ pub async fn cli() -> Result<()> {
None,
)?;
let _ = session.interactive(None).await;
return Ok(());
}
Ok(())
}
};
}
Some(Command::Run {
instructions,
@@ -438,58 +428,22 @@ pub async fn cli() -> Result<()> {
crate::commands::update::update(canary, reconfigure)?;
return Ok(());
}
Some(Command::Bench {
selectors,
include_dirs,
repeat,
list,
output,
format,
summary,
}) => {
if list {
return list_selectors().await;
}
let selectors = if selectors.is_empty() {
vec!["core".to_string()]
} else {
selectors
};
let current_dir = std::env::current_dir()?;
for i in 0..repeat {
if repeat > 1 {
println!("\nRun {} of {}:", i + 1, repeat);
}
let results = run_benchmark(selectors.clone(), include_dirs.clone()).await?;
// Handle output based on format
let output_str = match format.as_str() {
"json" => serde_json::to_string_pretty(&results)?,
_ => results.to_string(), // Uses Display impl
};
// Save to file if specified
if let Some(path) = &output {
std::fs::write(current_dir.join(path), &output_str)?;
println!("Results saved to: {}", path.display());
} else {
// Print to console
if summary {
println!("{}", results.summary());
} else {
println!("{}", output_str);
}
Some(Command::Bench { cmd }) => {
match cmd {
BenchCommand::Selectors { config } => BenchRunner::list_selectors(config)?,
BenchCommand::InitConfig { name } => BenchRunConfig::default().save(name),
BenchCommand::Run { config } => BenchRunner::new(config)?.run()?,
BenchCommand::EvalModel { config } => ModelRunner::from(config)?.run()?,
BenchCommand::ExecEval { config } => {
EvalRunner::from(config)?.run(agent_generator).await?
}
}
return Ok(());
}
None => {
if !Config::global().exists() {
return if !Config::global().exists() {
let _ = handle_configure().await;
return Ok(());
Ok(())
} else {
// Run session command by default
let mut session = build_session(None, false, vec![], vec![], vec![], false).await;
@@ -498,8 +452,8 @@ pub async fn cli() -> Result<()> {
None,
)?;
let _ = session.interactive(None).await;
return Ok(());
}
Ok(())
};
}
}
Ok(())

View File

@@ -1,88 +1,37 @@
use crate::logging;
use crate::session::build_session;
use crate::Session;
use crate::{logging, session, Session};
use async_trait::async_trait;
use goose::config::Config;
use goose::message::Message;
use goose_bench::bench_work_dir::BenchmarkWorkDir;
use goose_bench::eval_suites::{BenchAgent, BenchAgentError, Evaluation, EvaluationSuite};
use goose_bench::reporting::{BenchmarkResults, EvaluationResult, SuiteResult};
use goose_bench::bench_session::{BenchAgent, BenchBaseSession};
use goose_bench::eval_suites::ExtensionRequirements;
use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::Mutex;
pub struct BenchSession {
session: Session,
errors: Arc<Mutex<Vec<BenchAgentError>>>,
}
impl BenchSession {
pub fn new(session: Session) -> Self {
let errors = Arc::new(Mutex::new(Vec::new()));
// Initialize logging with error capture
logging::setup_logging(Some("bench"), Some(errors.clone()))
.expect("Failed to initialize logging");
Self { session, errors }
}
}
// allow session obj to be used in benchmarking
#[async_trait]
impl BenchAgent for BenchSession {
async fn prompt(&mut self, p: String) -> anyhow::Result<Vec<Message>> {
// Clear previous errors
{
let mut errors = self.errors.lock().await;
errors.clear();
impl BenchBaseSession for Session {
async fn headless(&mut self, message: String) -> anyhow::Result<()> {
self.headless(message).await
}
self.session.headless(p).await?;
Ok(self.session.message_history())
fn session_file(&self) -> PathBuf {
self.session_file()
}
async fn get_errors(&self) -> Vec<BenchAgentError> {
let errors = self.errors.lock().await;
errors.clone()
fn message_history(&self) -> Vec<Message> {
self.message_history()
}
async fn get_token_usage(&self) -> Option<i32> {
self.session.get_total_token_usage().ok().flatten()
fn get_total_token_usage(&self) -> anyhow::Result<Option<i32>> {
self.get_total_token_usage()
}
}
pub async fn agent_generator(
requirements: ExtensionRequirements,
session_id: String,
) -> BenchAgent {
let identifier = Some(session::Identifier::Name(session_id));
// Wrapper struct to implement BenchAgent for Arc<Mutex<BenchSession>>
struct BenchAgentWrapper(Arc<Mutex<BenchSession>>);
#[async_trait]
impl BenchAgent for BenchAgentWrapper {
async fn prompt(&mut self, p: String) -> anyhow::Result<Vec<Message>> {
let mut session = self.0.lock().await;
session.prompt(p).await
}
async fn get_errors(&self) -> Vec<BenchAgentError> {
let session = self.0.lock().await;
session.get_errors().await
}
async fn get_token_usage(&self) -> Option<i32> {
let session = self.0.lock().await;
session.get_token_usage().await
}
}
async fn run_eval(
evaluation: Box<dyn Evaluation>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<EvaluationResult> {
let mut result = EvaluationResult::new(evaluation.name().to_string());
let requirements = evaluation.required_extensions();
// Create session with error capture
let base_session = build_session(
None,
identifier,
false,
requirements.external,
requirements.remote,
@@ -91,84 +40,12 @@ async fn run_eval(
)
.await;
let bench_session = Arc::new(Mutex::new(BenchSession::new(base_session)));
let bench_session_clone = bench_session.clone();
// package session obj into benchmark-compatible struct
let bench_agent = BenchAgent::new(Box::new(base_session));
if let Ok(metrics) = evaluation
.run(Box::new(BenchAgentWrapper(bench_session)), work_dir)
.await
{
for (name, metric) in metrics {
result.add_metric(name, metric);
}
// Add any errors that occurred
let agent = BenchAgentWrapper(bench_session_clone);
for error in agent.get_errors().await {
result.add_error(error);
}
}
let current_dir = std::env::current_dir()?;
let output_str = serde_json::to_string_pretty(&result)?;
std::fs::write(current_dir.join("eval_result.json"), &output_str)?;
Ok(result)
}
pub async fn run_benchmark(
selectors: Vec<String>,
include_dirs: Vec<PathBuf>,
) -> anyhow::Result<BenchmarkResults> {
let config = Config::global();
let goose_model: String = config
.get_param("GOOSE_MODEL")
.expect("No model configured. Run 'goose configure' first");
let provider_name: String = config
.get_param("GOOSE_PROVIDER")
.expect("No provider configured. Run 'goose configure' first");
let mut results = BenchmarkResults::new(provider_name.clone());
let work_dir = Mutex::new(BenchmarkWorkDir::new(
format!("{}-{}", provider_name, goose_model),
include_dirs.clone(),
));
for (suite, evals) in EvaluationSuite::select(selectors).iter() {
let mut suite_result = SuiteResult::new(suite.clone());
for eval_selector in evals {
if let Some(eval) = EvaluationSuite::from(eval_selector) {
let mut work_dir = work_dir.lock().await;
work_dir.set_eval(eval_selector);
let eval_result = run_eval(eval, &mut work_dir).await?;
suite_result.add_evaluation(eval_result);
}
}
results.add_suite(suite_result);
}
Ok(results)
}
pub async fn list_selectors() -> anyhow::Result<()> {
let selector_eval_counts = EvaluationSuite::available_selectors();
let mut keys: Vec<_> = selector_eval_counts.keys().collect();
keys.sort();
let max_key_len = keys.iter().map(|k| k.len()).max().unwrap_or(0);
println!(
"selector {} => Eval Count",
" ".repeat(max_key_len - "selector".len())
);
println!("{}", "-".repeat(max_key_len + 6));
for selector in keys {
println!(
"{} {} => {}",
selector,
" ".repeat(max_key_len - selector.len()),
selector_eval_counts.get(selector).unwrap()
);
}
Ok(())
// Initialize logging with error capture
let errors = Some(Arc::new(Mutex::new(bench_agent.get_errors().await)));
logging::setup_logging(Some("bench"), errors).expect("Failed to initialize logging");
bench_agent
}

View File

@@ -12,8 +12,8 @@ use tracing_subscriber::{
};
use goose::tracing::langfuse_layer;
use goose_bench::bench_session::BenchAgentError;
use goose_bench::error_capture::ErrorCaptureLayer;
use goose_bench::eval_suites::BenchAgentError;
// Used to ensure we only set up tracing once
static INIT: Once = Once::new();

View File

@@ -0,0 +1,189 @@
---
sidebar_position: 7
---
# Benchmarking with Goose
The Goose benchmarking system allows you to evaluate Goose's performance on complex tasks with one or more system configurations.<br></br>
This guide covers how to use the `goose bench` command to run benchmarks and analyze results.
## Configuration File
The benchmark configuration is specified in a JSON file with the following structure:
```json
{
"models": [
{
"provider": "databricks",
"name": "goose",
"parallel_safe": true,
"tool_shim": {
"use_tool_shim": false,
"tool_shim_model": null
}
}
],
"evals": [
{
"selector": "core",
"post_process_cmd": null,
"parallel_safe": true
}
],
"include_dirs": [],
"repeat": 2,
"run_id": null,
"eval_result_filename": "eval-results.json",
"run_summary_filename": "run-results-summary.json",
"env_file": null
}
```
### Configuration Options
#### Models Section
Each model entry in the `models` array specifies:
- `provider`: The model provider (e.g., "databricks")
- `name`: Model identifier
- `parallel_safe`: Whether the model can be run in parallel
- `tool_shim`: Optional configuration for tool shimming
- `use_tool_shim`: Enable/disable tool shimming
- `tool_shim_model`: Optional model to use for tool shimming
#### Evals Section
Each evaluation entry in the `evals` array specifies:
- `selector`: The evaluation suite to run (e.g., "core")
- `post_process_cmd`: Optional path to a post-processing script
- `parallel_safe`: Whether the evaluation can run in parallel
#### General Options
- `include_dirs`: Additional directories to include in the evaluation
- `repeat`: Number of times to repeat each evaluation
- `run_id`: Optional identifier for the benchmark run
- `eval_result_filename`: Name of the evaluation results file
- `run_summary_filename`: Name of the summary results file
- `env_file`: Optional path to an environment file
##### Mechanics of include_dirs option
The `include_dirs` config parameter makes the items at every listed path available to all evaluations.<br></br>
It accomplishes this by:
* copying each included asset into the top-level directory created for each model/provider pair
* at evaluation run-time:
  * whichever asset is explicitly required by an evaluation is copied into the eval-specific directory
  * only if the evaluation code specifically pulls it in
  * and only if the evaluation is actually covered by one of the configured selectors and therefore runs
## Running Benchmarks
### Quick Start
1. The benchmarking system includes several evaluation suites.<br></br>
Run the following to see a listing of every valid selector:
```bash
goose bench selectors
```
2. Create a basic configuration file:
```bash
goose bench init-config -n bench-config.json
cat bench-config.json
{
"models": [
{
"provider": "databricks",
"name": "goose",
"parallel_safe": true
}
],
"evals": [
{
"selector": "core",
"parallel_safe": true
}
],
"repeat": 1
}
...etc.
```
3. Run the benchmark:
```bash
goose bench run -c bench-config.json
```
### Customizing Evaluations
You can customize runs in several ways:
1. Using Post-Processing Commands after evaluation (see the script sketch after this list):
```json
{
"evals": [
{
"selector": "core",
"post_process_cmd": "/path/to/process-script.sh",
"parallel_safe": true
}
]
}
```
2. Including Additional Data:
```json
{
"include_dirs": [
"/path/to/custom/eval/data"
]
}
```
3. Setting Environment Variables:
```json
{
"env_file": "/path/to/env-file"
}
```
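To make the post-processing hook concrete, here is a minimal sketch of a script that `post_process_cmd` could point to. It is illustrative only: it assumes the command runs in the directory where the eval's `eval-results.json` was written and does not rely on any particular arguments being passed.
```bash
#!/usr/bin/env bash
# Illustrative post-processing sketch (not shipped with Goose).
# Assumes it is invoked in the directory containing eval-results.json.
set -euo pipefail

RESULTS="eval-results.json"

if [[ -f "$RESULTS" ]]; then
  # Keep a copy of the raw results and report how large they are.
  cp "$RESULTS" "raw-$RESULTS"
  echo "post-processed $(wc -c < "$RESULTS") bytes from $RESULTS"
else
  echo "no $RESULTS found in $(pwd)" >&2
  exit 1
fi
```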
## Output and Results
The benchmark generates two main output files within a file hierarchy similar to the following.<br></br>
Results from running each model/provider pair are stored in their own directory:
```bash
benchmark-${datetime}/
    ${model}-${provider}[-tool-shim[-${shim-model}]]/
        run-${i}/
            ${an-include_dir-asset}
            run-results-summary.json
            core/developer/list_files/
                ${an-include_dir-asset}
                eval-results.json
```
1. `eval-results.json`: Contains detailed results from each evaluation, including:
- Individual test case results
- Model responses
- Scoring metrics
- Error logs
2. `run-results-summary.json`: A collection of all eval results across all suites (see below for a quick way to inspect it).
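Both files are plain JSON, so ordinary JSON tooling works for a quick look. For example, assuming `jq` is installed and the default filenames from the config are kept:
```bash
# Pretty-print the run summary and skim the beginning of it.
jq '.' run-results-summary.json | head -n 40
```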
### Debug Mode
For detailed logging, you can enable debug mode:
```bash
RUST_LOG=debug goose bench run -c bench-config.json
```
## Advanced Usage
### Tool Shimming
Tool shimming allows you to use non-tool-capable models with Goose, provided Ollama is installed on the system.<br></br>
See the guide on [tool shimming](experimental-features) for important details.
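To enable shimming from a benchmark config, fill in the model's `tool_shim` block. The sketch below is illustrative only: the provider, model name, and shim model are placeholders, not recommendations.
```json
{
  "models": [
    {
      "provider": "ollama",
      "name": "some-non-tool-calling-model",
      "parallel_safe": true,
      "tool_shim": {
        "use_tool_shim": true,
        "tool_shim_model": "some-local-interpreter-model"
      }
    }
  ]
}
```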

View File

@@ -232,6 +232,16 @@ Used to show the available implementations of the agent loop itself
goose agents
```
### bench
Used to evaluate a system configuration across a range of practical tasks. See the [detailed guide](/docs/guides/benchmarking) for more information.
**Usage:**
```bash
goose bench ...etc.
```
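The subcommands mirror the benchmarking workflow covered in the detailed guide, for example:
```bash
goose bench init-config -n bench-config.json   # generate a starter config
goose bench selectors                          # list available selectors and eval counts
goose bench run -c bench-config.json           # run every benchmark in the config
```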
---
## Prompt Completion

View File

@@ -162,7 +162,7 @@ for ((i=0; i<$COUNT; i++)); do
if [ "$TOOLSHIM" = true ]; then
export GOOSE_TOOLSHIM=1
if [[ -n "$TOOLSHIM_MODEL" ]]; then
export GOOSE_TOOLSHIM_MODEL="$TOOLSHIM_MODEL"
export GOOSE_TOOLSHIM_OLLAMA_MODEL="$TOOLSHIM_MODEL"
fi
fi

View File

@@ -18,7 +18,6 @@ import { ModeSelection } from './basic/ModeSelection';
import SessionSharingSection from './session/SessionSharingSection';
import { toastSuccess } from '../../toasts';
const EXTENSIONS_DESCRIPTION =
'The Model Context Protocol (MCP) is a system that allows AI models to securely connect with local or remote resources using standard server setups. It works like a client-server setup and expands AI capabilities using three main components: Prompts, Resources, and Tools.';