feat: goose bench framework for functional and regression testing

Co-authored-by: Zaki Ali <zaki@squareup.com>
This commit is contained in:
marcelle
2025-03-05 21:23:00 -05:00
committed by GitHub
parent 24210d7b7b
commit 49dee048e4
31 changed files with 89381 additions and 2 deletions

View File

@@ -2,12 +2,15 @@ use anyhow::Result;
use clap::{Args, Parser, Subcommand};
use goose::config::Config;
use goose_cli::commands::agent_version::AgentCommand;
use goose_cli::commands::bench::{list_suites, run_benchmark};
use goose_cli::commands::configure::handle_configure;
use goose_cli::commands::info::handle_info;
use goose_cli::commands::mcp::run_server;
use goose_cli::logging::setup_logging;
use goose_cli::session;
use goose_cli::session::build_session;
use goose_cli::{commands::agent_version::AgentCommand, session};
use std::io::{self, Read};
use std::path::PathBuf;
@@ -194,6 +197,66 @@ enum Command {
#[arg(short, long, help = "Enforce to re-configure goose during update")]
reconfigure: bool,
},
Bench {
#[arg(
short = 's',
long = "suites",
value_name = "BENCH_SUITE_NAME",
help = "Run this list of bench-suites.",
long_help = "Specify a comma-separated list of evaluation-suite names to be run.",
value_delimiter = ','
)]
suites: Vec<String>,
#[arg(
short = 'i',
long = "include-dir",
value_name = "DIR_NAME",
action = clap::ArgAction::Append,
long_help = "Make one or more dirs available to all bench suites. Specify either a single dir-name, a comma-separated list of dir-names, or use this multiple instances of this flag to specify multiple dirs.",
value_delimiter = ','
)]
include_dirs: Vec<PathBuf>,
#[arg(
long = "repeat",
value_name = "QUANTITY",
long_help = "Number of times to repeat the benchmark run.",
default_value = "1"
)]
repeat: usize,
#[arg(
long = "list",
value_name = "LIST",
help = "List all available bench suites."
)]
list: bool,
#[arg(
long = "output",
short = 'o',
value_name = "FILE",
help = "Save benchmark results to a file"
)]
output: Option<PathBuf>,
#[arg(
long = "format",
value_name = "FORMAT",
help = "Output format (text, json)",
default_value = "text"
)]
format: String,
#[arg(
long = "summary",
help = "Show only summary results",
action = clap::ArgAction::SetTrue
)]
summary: bool,
},
}
#[derive(clap::ValueEnum, Clone, Debug)]
@@ -232,6 +295,7 @@ async fn main() -> Result<()> {
builtin,
)
.await;
setup_logging(session.session_file().file_stem().and_then(|s| s.to_str()))?;
let _ = session.interactive(None).await;
return Ok(());
@@ -290,6 +354,56 @@ async fn main() -> Result<()> {
goose_cli::commands::update::update(canary, reconfigure)?;
return Ok(());
}
Some(Command::Bench {
suites,
include_dirs,
repeat,
list,
output,
format,
summary,
}) => {
if list {
let suites = list_suites().await?;
for suite in suites.keys() {
println!("{}: {}", suite, suites.get(suite).unwrap());
}
return Ok(());
}
let suites = if suites.is_empty() {
vec!["core".to_string()]
} else {
suites
};
let current_dir = std::env::current_dir()?;
for i in 0..repeat {
if repeat > 1 {
println!("\nRun {} of {}:", i + 1, repeat);
}
let results = run_benchmark(suites.clone(), include_dirs.clone()).await?;
// Handle output based on format
let output_str = match format.as_str() {
"json" => serde_json::to_string_pretty(&results)?,
_ => results.to_string(), // Uses Display impl
};
// Save to file if specified
if let Some(path) = &output {
std::fs::write(current_dir.join(path), &output_str)?;
println!("Results saved to: {}", path.display());
} else {
// Print to console
if summary {
println!("{}", results.summary());
} else {
println!("{}", output_str);
}
}
}
return Ok(());
}
None => {
if !Config::global().exists() {
let _ = handle_configure().await;