feat: efficient benching (#1921)

Co-authored-by: Tyler Rockwood <rockwotj@gmail.com>
Co-authored-by: Kalvin C <kalvinnchau@users.noreply.github.com>
Co-authored-by: Alice Hau <110418948+ahau-square@users.noreply.github.com>
Authored by marcelle on 2025-04-08 14:43:43 -04:00, committed by GitHub.
Parent 319f2301f3, commit 8fbd9eb327.
37 changed files with 1162 additions and 444 deletions

.gitignore (vendored, 3 lines changed)
View File

@@ -39,3 +39,6 @@ debug_*.txt
# Benchmark paths
benchmark-*
benchconf.json
scripts/fake.sh
do_not_version/

Cargo.lock (generated, 1 line changed)
View File

@@ -2372,6 +2372,7 @@ dependencies = [
"serde",
"serde_json",
"tokio",
"toml",
"tracing",
"tracing-subscriber",
"winapi",

View File

@@ -24,6 +24,7 @@ tokio = { version = "1.43", features = ["full"] }
include_dir = "0.7.4"
once_cell = "1.19"
regex = "1.11.1"
toml = "0.8.20"
[target.'cfg(target_os = "windows")'.dependencies]
winapi = { version = "0.3", features = ["wincred"] }

View File

@@ -0,0 +1,107 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use serde::{Deserialize, Serialize};
use std::fs;
use std::fs::read_to_string;
use std::path::PathBuf;
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct BenchToolShimOpt {
pub use_tool_shim: bool,
pub tool_shim_model: Option<String>,
}
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct BenchModel {
pub provider: String,
pub name: String,
pub parallel_safe: bool,
pub tool_shim: Option<BenchToolShimOpt>,
}
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct BenchEval {
pub selector: String,
pub post_process_cmd: Option<PathBuf>,
pub parallel_safe: bool,
}
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct BenchRunConfig {
pub models: Vec<BenchModel>,
pub evals: Vec<BenchEval>,
pub include_dirs: Vec<PathBuf>,
pub repeat: Option<usize>,
pub run_id: Option<String>,
pub eval_result_filename: String,
pub run_summary_filename: String,
pub env_file: Option<PathBuf>,
}
impl Default for BenchRunConfig {
fn default() -> Self {
BenchRunConfig {
models: vec![
BenchModel {
provider: "databricks".to_string(),
name: "goose".to_string(),
parallel_safe: true,
tool_shim: Some(BenchToolShimOpt {
use_tool_shim: false,
tool_shim_model: None,
}),
},
BenchModel {
provider: "databricks".to_string(),
name: "goose-claude-3-5-sonnet".to_string(),
parallel_safe: true,
tool_shim: None,
},
],
evals: vec![BenchEval {
selector: "core".into(),
post_process_cmd: None,
parallel_safe: true, // Default to true
}],
include_dirs: vec![],
repeat: Some(2),
run_id: None,
eval_result_filename: "eval-results.json".to_string(),
run_summary_filename: "run-results-summary.json".to_string(),
env_file: None,
}
}
}
impl BenchRunConfig {
pub fn from_string(cfg: String) -> anyhow::Result<Self> {
let mut config: Self = serde_json::from_str(cfg.as_str())?;
// update include_dirs to contain full-paths only
config.include_dirs = BenchmarkWorkDir::canonical_dirs(config.include_dirs);
Self::canonicalize_eval_post_proc_cmd(&mut config);
Ok(config)
}
fn canonicalize_eval_post_proc_cmd(config: &mut BenchRunConfig) {
// update eval post-process script paths to all be full-paths
config.evals.iter_mut().for_each(|eval| {
if let Some(post_process_cmd) = &eval.post_process_cmd {
let canon = BenchmarkWorkDir::canonical_dirs(vec![post_process_cmd.clone()]);
let full_path_cmd = canon[0].clone();
if !full_path_cmd.exists() {
panic!("BenchConfigError: Eval post-process command not found. File {:?} does not exist", full_path_cmd);
}
eval.post_process_cmd = Some(full_path_cmd);
}
});
}
pub fn from(cfg: PathBuf) -> anyhow::Result<Self> {
let config = Self::from_string(read_to_string(cfg)?)?;
Ok(config)
}
pub fn to_string(&self) -> anyhow::Result<String> {
Ok(serde_json::to_string_pretty(self)?)
}
pub fn save(&self, name: String) {
let config = self.to_string().unwrap();
fs::write(name, config).expect("Unable to write bench config file");
}
}
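Since from_string parses plain JSON with serde_json and to_string pretty-prints it, a default config round-trips in memory without touching the filesystem. A minimal sketch (not part of this commit), assuming the types above are in scope:
fn roundtrip_default_config() -> anyhow::Result<()> {
    let cfg = BenchRunConfig::default();
    let json = cfg.to_string()?; // pretty-printed JSON
    let parsed = BenchRunConfig::from_string(json)?; // canonicalizes include_dirs (empty here)
    assert_eq!(parsed.models.len(), 2);
    assert_eq!(parsed.eval_result_filename, "eval-results.json");
    Ok(())
}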

View File

@@ -0,0 +1,58 @@
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use goose::message::Message;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::Mutex;
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct BenchAgentError {
pub message: String,
pub level: String, // ERROR, WARN, etc.
pub timestamp: DateTime<Utc>,
}
// avoid tying benchmarking to current session-impl.
#[async_trait]
pub trait BenchBaseSession: Send + Sync {
async fn headless(&mut self, message: String) -> anyhow::Result<()>;
fn session_file(&self) -> PathBuf;
fn message_history(&self) -> Vec<Message>;
fn get_total_token_usage(&self) -> anyhow::Result<Option<i32>>;
}
// Struct for managing agent session access; passed to evals for benchmarking.
pub struct BenchAgent {
session: Box<dyn BenchBaseSession>,
errors: Arc<Mutex<Vec<BenchAgentError>>>,
}
impl BenchAgent {
pub fn new(session: Box<dyn BenchBaseSession>) -> Self {
let errors = Arc::new(Mutex::new(Vec::new()));
Self { session, errors }
}
pub(crate) async fn prompt(&mut self, p: String) -> anyhow::Result<Vec<Message>> {
// Clear previous errors
{
let mut errors = self.errors.lock().await;
errors.clear();
}
self.session.headless(p).await?;
Ok(self.session.message_history())
}
pub async fn get_errors(&self) -> Vec<BenchAgentError> {
let errors = self.errors.lock().await;
errors.clone()
}
pub(crate) async fn get_token_usage(&self) -> Option<i32> {
self.session.get_total_token_usage().ok().flatten()
}
pub(crate) fn session_file(&self) -> PathBuf {
self.session.session_file()
}
}
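Because BenchAgent depends only on the BenchBaseSession trait, an eval can be exercised against any session-like object. A hypothetical test stub (not part of this commit; the name and file path are illustrative):
struct StubSession;

#[async_trait]
impl BenchBaseSession for StubSession {
    async fn headless(&mut self, _message: String) -> anyhow::Result<()> {
        Ok(()) // a real session would execute the prompt here
    }
    fn session_file(&self) -> PathBuf {
        PathBuf::from("stub-session.jsonl")
    }
    fn message_history(&self) -> Vec<Message> {
        Vec::new() // a real session returns its transcript
    }
    fn get_total_token_usage(&self) -> anyhow::Result<Option<i32>> {
        Ok(Some(0))
    }
}
// A BenchAgent can then be built with BenchAgent::new(Box::new(StubSession)).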

View File

@@ -1,5 +1,6 @@
use chrono::Local;
use include_dir::{include_dir, Dir};
use serde::{Deserialize, Serialize};
use std::fs;
use std::io;
use std::io::ErrorKind;
@@ -9,11 +10,12 @@ use std::process::Command;
pub static BUILTIN_EVAL_ASSETS: Dir = include_dir!("$CARGO_MANIFEST_DIR/src/assets");
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct BenchmarkWorkDir {
pub base_path: PathBuf,
run_dir: PathBuf,
cwd: PathBuf,
run_name: String,
pub run_dir: PathBuf,
pub cwd: PathBuf,
pub run_id: Option<String>,
}
impl Default for BenchmarkWorkDir {
@@ -21,26 +23,17 @@ impl Default for BenchmarkWorkDir {
Self::new("work_dir".to_string(), Vec::new())
}
}
impl BenchmarkWorkDir {
pub fn new(work_dir_name: String, include_dirs: Vec<PathBuf>) -> Self {
let run_dir = std::env::current_dir().unwrap().canonicalize().unwrap();
let base_path = PathBuf::from(format!("./benchmark-{}", work_dir_name));
let base_path = PathBuf::from(format!("./{}", work_dir_name));
fs::create_dir_all(&base_path).unwrap();
let current_time = Local::now().format("%H:%M:%S").to_string();
let current_date = Local::now().format("%Y-%m-%d").to_string();
let run_name = format!("{}-{}", &current_date, current_time);
let mut base_path = PathBuf::from(&base_path).canonicalize().unwrap();
base_path.push(run_name.clone());
fs::create_dir_all(&base_path).unwrap();
base_path.pop();
let base_path = PathBuf::from(&base_path).canonicalize().unwrap();
// abs paths from dir-strings
let dirs = include_dirs
.iter()
.map(|d| d.canonicalize().unwrap())
.collect::<Vec<_>>();
let dirs = Self::canonical_dirs(include_dirs);
// deep copy each dir
let _: Vec<_> = dirs
@@ -56,9 +49,32 @@ impl BenchmarkWorkDir {
base_path: base_path.clone(),
run_dir,
cwd: base_path.clone(),
run_name,
run_id: None,
}
}
pub fn init_experiment() {
// create experiment folder
let current_time = Local::now().format("%H:%M:%S").to_string();
let current_date = Local::now().format("%Y-%m-%d").to_string();
let exp_name = format!("{}-{}", &current_date, current_time);
let base_path = PathBuf::from(format!("./benchmark-{}", exp_name));
fs::create_dir_all(&base_path).unwrap();
std::env::set_current_dir(&base_path).unwrap();
}
pub fn canonical_dirs(include_dirs: Vec<PathBuf>) -> Vec<PathBuf> {
include_dirs
.iter()
.map(|d| {
let canon = d.canonicalize();
if canon.is_err() {
eprintln!("{:?} can't be canonicalized", d);
panic!();
}
canon.unwrap()
})
.collect::<Vec<_>>()
}
fn copy_auto_included_dirs(dest: &Path) {
let mut assets_dest = dest.to_path_buf();
assets_dest.push("assets");
@@ -73,10 +89,21 @@ impl BenchmarkWorkDir {
self.cwd = path;
Ok(self)
}
pub fn set_eval(&mut self, eval: &str) {
pub(crate) fn _run_dir(&mut self) -> Option<PathBuf> {
if let Some(run_id) = &self.run_id {
let mut eval_dir = self.base_path.clone();
eval_dir.push(run_id);
return Some(eval_dir);
}
None
}
pub fn set_eval(&mut self, eval: &str, run_id: String) {
self.run_id = Some(run_id.clone());
let eval = eval.replace(":", std::path::MAIN_SEPARATOR_STR);
let mut eval_dir = self.base_path.clone();
eval_dir.push(self.run_name.clone());
eval_dir.push(run_id);
eval_dir.push(eval);
self.cd(eval_dir.clone())
@@ -134,7 +161,7 @@ impl BenchmarkWorkDir {
Ok(PathBuf::from(path))
}
fn deep_copy<P, Q>(src: P, dst: Q, recursive: bool) -> io::Result<()>
pub(crate) fn deep_copy<P, Q>(src: P, dst: Q, recursive: bool) -> io::Result<()>
where
P: AsRef<Path>,
Q: AsRef<Path>,
@@ -162,6 +189,11 @@ impl BenchmarkWorkDir {
Err(io::Error::new(ErrorKind::Other, error_message))
}
}
pub fn save(&self) {
let work_dir = serde_json::to_string_pretty(&self).unwrap();
fs::write("work_dir.json", work_dir).expect("Unable to write work-dir as file");
}
}
impl Drop for BenchmarkWorkDir {

View File

@@ -1,4 +1,4 @@
use crate::eval_suites::BenchAgentError;
use crate::bench_session::BenchAgentError;
use chrono::Utc;
use once_cell::sync::Lazy;
use std::sync::Arc;

View File

@@ -1,8 +1,9 @@
// Create a new file called test.txt with the content 'Hello, World!
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -24,12 +25,12 @@ impl ComputerControllerScript {
impl Evaluation for ComputerControllerScript {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to list files
let (messages, perf_metrics) =
collect_baseline_metrics(&mut agent, "Make a beep sound".to_string()).await;
collect_baseline_metrics(agent, "Make a beep sound".to_string()).await;
// Convert HashMap to Vec for our metrics
let mut metrics = metrics_hashmap_to_vec(perf_metrics);
@@ -64,7 +65,7 @@ impl Evaluation for ComputerControllerScript {
metrics.push((
"Running os scripts".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
EvalMetricValue::Boolean(valid_tool_call),
));
Ok(metrics)
}

View File

@@ -1,8 +1,9 @@
// Create a new file called test.txt with the content 'Hello, World!
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -24,12 +25,12 @@ impl ComputerControllerWebScrape {
impl Evaluation for ComputerControllerWebScrape {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to list files
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"What are the headlines on hackernews? Organize the list into categories.".to_string(),
)
.await;
@@ -67,7 +68,7 @@ impl Evaluation for ComputerControllerWebScrape {
metrics.push((
"Retrieve and scrape web pages".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
EvalMetricValue::Boolean(valid_tool_call),
));
Ok(metrics)
}

View File

@@ -1,8 +1,9 @@
// Create a new file called test.txt with the content 'Hello, World!
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -24,12 +25,12 @@ impl DeveloperCreateFile {
impl Evaluation for DeveloperCreateFile {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to create and read
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Create a new file called test.txt in the current directory with the content 'Hello, World!'. Then read the contents of the new file to confirm.".to_string()
).await;
@@ -99,15 +100,15 @@ impl Evaluation for DeveloperCreateFile {
metrics.push((
"Create file".to_string(),
EvaluationMetric::Boolean(write_tool_call),
EvalMetricValue::Boolean(write_tool_call),
));
metrics.push((
"Read file".to_string(),
EvaluationMetric::Boolean(read_tool_call),
EvalMetricValue::Boolean(read_tool_call),
));
metrics.push((
"Complete create and read".to_string(),
EvaluationMetric::Boolean(write_tool_call && read_tool_call),
EvalMetricValue::Boolean(write_tool_call && read_tool_call),
));
Ok(metrics)
}

View File

@@ -1,6 +1,7 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -22,14 +23,12 @@ impl DeveloperListFiles {
impl Evaluation for DeveloperListFiles {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to list files
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
"list the files in the current directory".to_string(),
)
let (messages, perf_metrics) =
collect_baseline_metrics(agent, "list the files in the current directory".to_string())
.await;
// Convert HashMap to Vec for our metrics
@@ -68,7 +67,7 @@ impl Evaluation for DeveloperListFiles {
metrics.push((
"Using the shell command tool".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
EvalMetricValue::Boolean(valid_tool_call),
));
Ok(metrics)
}

View File

@@ -1,6 +1,7 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -22,15 +23,15 @@ impl SimpleRepoCloneTest {
impl Evaluation for SimpleRepoCloneTest {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
agent: &mut BenchAgent,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to clone the repo and add a test
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Clone the Git repository https://github.com/michaelneale/mcp-read-pdf to a temporary location. \
Then add a new test file that verifies the PDF reading functionality. The test should \
check if the PDF content can be read and processed correctly.".to_string()
check if the PDF content can be read and processed correctly.".to_string(),
).await;
// Convert HashMap to Vec for our metrics
@@ -177,23 +178,23 @@ impl Evaluation for SimpleRepoCloneTest {
// Add metrics
metrics.push((
"Git repo cloned".to_string(),
EvaluationMetric::Boolean(git_clone_executed),
EvalMetricValue::Boolean(git_clone_executed),
));
metrics.push((
"Repository explored".to_string(),
EvaluationMetric::Boolean(repo_explored),
EvalMetricValue::Boolean(repo_explored),
));
metrics.push((
"Test file added".to_string(),
EvaluationMetric::Boolean(test_added),
EvalMetricValue::Boolean(test_added),
));
metrics.push((
"Test executed".to_string(),
EvaluationMetric::Boolean(test_executed),
EvalMetricValue::Boolean(test_executed),
));
metrics.push((
"Complete task".to_string(),
EvaluationMetric::Boolean(git_clone_executed && test_added),
EvalMetricValue::Boolean(git_clone_executed && test_added),
));
Ok(metrics)

View File

@@ -1,6 +1,7 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -23,12 +24,12 @@ impl DeveloperImage {
impl Evaluation for DeveloperImage {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to list files
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Take a screenshot of the display 0 and describe what you see.".to_string(),
)
.await;
@@ -85,7 +86,7 @@ impl Evaluation for DeveloperImage {
// Both the tool call and response must be valid
metrics.push((
"Take a screenshot and upload images".to_string(),
EvaluationMetric::Boolean(valid_tool_call && valid_response),
EvalMetricValue::Boolean(valid_tool_call && valid_response),
));
Ok(metrics)
}

View File

@@ -1,6 +1,7 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -20,10 +21,10 @@ impl DeveloperSearchReplace {
impl Evaluation for DeveloperSearchReplace {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
let _target_file = match work_dir.fs_get("./assets/kubernetes_swagger.json".to_string()) {
agent: &mut BenchAgent,
run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
let _target_file = match run_loc.fs_get("./assets/kubernetes_swagger.json".to_string()) {
Ok(file) => file,
Err(_) => {
return Err(anyhow::anyhow!(
@@ -31,12 +32,12 @@ impl Evaluation for DeveloperSearchReplace {
))
}
};
let mut source_file = work_dir.base_path.clone();
let mut source_file = run_loc.base_path.clone();
source_file.push("assets/kubernetes_swagger.json");
// Send the prompt to modify the file
let (_messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Remove the io.k8s.api.admissionregistration.v1.ServiceReference definition block and replace with a new definition for io.k8s.api.admissionregistration.v1.FakeServiceReference. Update the fields in the definition as well to be consistent. Don't change the property names. Don't update any references to the old definition. Only modify the definition and it's description to 'FakeServiceReference simulates a reference to a fake service for testing purposes.'.The file to modify is kubernetes_swagger.json.".to_string()
).await;
@@ -49,7 +50,7 @@ impl Evaluation for DeveloperSearchReplace {
.join("kubernetes_swagger.json");
// Read the expected patch file from the assets directory
let patch_file_path = work_dir.base_path.join("assets").join("kubernetes.patch");
let patch_file_path = run_loc.base_path.join("assets").join("kubernetes.patch");
if !patch_file_path.exists() {
return Err(anyhow::anyhow!("Could not find patch file"));
}
@@ -88,7 +89,7 @@ impl Evaluation for DeveloperSearchReplace {
metrics.push((
"Changes match expected patch".to_string(),
EvaluationMetric::Boolean(changes_match),
EvalMetricValue::Boolean(changes_match),
));
Ok(metrics)

View File

@@ -1,5 +1,6 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::eval_suites::{EvalMetricValue, Evaluation, ExtensionRequirements};
use crate::register_evaluation;
use async_trait::async_trait;
// use std::fs;
@@ -16,20 +17,17 @@ impl ExampleEval {
impl Evaluation for ExampleEval {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
println!("ExampleEval - run");
let mut metrics = Vec::new();
let _ = agent.prompt("What can you do?".to_string()).await;
metrics.push((
"example_metric".to_string(),
EvaluationMetric::Boolean(true),
));
metrics.push(("example_metric".to_string(), EvalMetricValue::Boolean(true)));
metrics.push(("example_count".to_string(), EvaluationMetric::Integer(42)));
metrics.push(("example_count".to_string(), EvalMetricValue::Integer(42)));
Ok(metrics)
}

View File

@@ -1,8 +1,9 @@
// Create a new file called test.txt with the content 'Hello, World!
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, metrics_hashmap_to_vec, BenchAgent, Evaluation, EvaluationMetric,
collect_baseline_metrics, metrics_hashmap_to_vec, EvalMetricValue, Evaluation,
ExtensionRequirements,
};
use crate::register_evaluation;
@@ -24,12 +25,12 @@ impl MemoryRememberMemory {
impl Evaluation for MemoryRememberMemory {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
// Send the prompt to list files
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Save this fact: The capital of France is Paris.".to_string(),
)
.await;
@@ -69,7 +70,7 @@ impl Evaluation for MemoryRememberMemory {
metrics.push((
"Saving facts".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
EvalMetricValue::Boolean(valid_tool_call),
));
Ok(metrics)
}

View File

@@ -1,27 +1,24 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use anyhow::Result;
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use goose::message::Message;
use serde::Serialize;
use serde::{Deserialize, Serialize};
pub type Model = (String, String);
pub type Extension = String;
#[derive(Debug, Serialize, Clone)]
pub struct BenchAgentError {
pub message: String,
pub level: String, // ERROR, WARN, etc.
pub timestamp: DateTime<Utc>,
}
#[derive(Debug, Serialize)]
pub enum EvaluationMetric {
#[derive(Debug, Deserialize, Serialize)]
pub enum EvalMetricValue {
Integer(i64),
Float(f64),
String(String),
Boolean(bool),
}
#[derive(Debug, Serialize)]
pub struct EvalMetric {
pub name: String,
pub value: EvalMetricValue,
}
#[derive(Debug, Default)]
pub struct ExtensionRequirements {
@@ -30,24 +27,13 @@ pub struct ExtensionRequirements {
pub remote: Vec<String>,
}
#[async_trait]
pub trait BenchAgent: Send + Sync {
async fn prompt(&mut self, p: String) -> Result<Vec<Message>>;
// Make get_errors async
async fn get_errors(&self) -> Vec<BenchAgentError>;
// Get token usage information
async fn get_token_usage(&self) -> Option<i32>;
}
#[async_trait]
pub trait Evaluation: Send + Sync {
async fn run(
&self,
agent: Box<dyn BenchAgent>,
agent: &mut BenchAgent,
run_loc: &mut BenchmarkWorkDir,
) -> Result<Vec<(String, EvaluationMetric)>>;
) -> Result<Vec<(String, EvalMetricValue)>>;
fn name(&self) -> &str;

View File

@@ -1,13 +1,14 @@
use crate::eval_suites::{BenchAgent, EvaluationMetric};
use crate::bench_session::BenchAgent;
use crate::eval_suites::EvalMetricValue;
use goose::message::{Message, MessageContent};
use std::collections::HashMap;
use std::time::Instant;
/// Collect baseline metrics including execution time, tool usage, and token count
pub async fn collect_baseline_metrics(
agent: &mut Box<dyn BenchAgent>,
agent: &mut BenchAgent,
prompt: String,
) -> (Vec<Message>, HashMap<String, EvaluationMetric>) {
) -> (Vec<Message>, HashMap<String, EvalMetricValue>) {
// Initialize metrics map
let mut metrics = HashMap::new();
@@ -20,7 +21,7 @@ pub async fn collect_baseline_metrics(
Err(e) => {
metrics.insert(
"prompt_error".to_string(),
EvaluationMetric::String(format!("Error: {}", e)),
EvalMetricValue::String(format!("Error: {}", e)),
);
Vec::new()
}
@@ -30,21 +31,21 @@ pub async fn collect_baseline_metrics(
let execution_time = start_time.elapsed();
metrics.insert(
"prompt_execution_time_seconds".to_string(),
EvaluationMetric::Float(execution_time.as_secs_f64()),
EvalMetricValue::Float(execution_time.as_secs_f64()),
);
// Count tool calls
let (total_tool_calls, tool_calls_by_name) = count_tool_calls(&messages);
metrics.insert(
"total_tool_calls".to_string(),
EvaluationMetric::Integer(total_tool_calls),
EvalMetricValue::Integer(total_tool_calls),
);
// Add tool calls by name metrics
for (tool_name, count) in tool_calls_by_name {
metrics.insert(
format!("tool_calls_{}", tool_name),
EvaluationMetric::Integer(count),
EvalMetricValue::Integer(count),
);
}
@@ -52,7 +53,7 @@ pub async fn collect_baseline_metrics(
if let Some(token_count) = agent.get_token_usage().await {
metrics.insert(
"total_tokens".to_string(),
EvaluationMetric::Integer(token_count as i64),
EvalMetricValue::Integer(token_count as i64),
);
}
@@ -82,8 +83,8 @@ fn count_tool_calls(messages: &[Message]) -> (i64, HashMap<String, i64>) {
/// Convert HashMap of metrics to Vec
pub fn metrics_hashmap_to_vec(
metrics: HashMap<String, EvaluationMetric>,
) -> Vec<(String, EvaluationMetric)> {
metrics: HashMap<String, EvalMetricValue>,
) -> Vec<(String, EvalMetricValue)> {
metrics.into_iter().collect()
}

View File

@@ -1,7 +1,8 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, write_response_to_file,
BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements,
EvalMetricValue, Evaluation, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
@@ -23,20 +24,20 @@ impl BlogSummary {
impl Evaluation for BlogSummary {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
println!("BlogSummary - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (response, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"What are the top 5 most counterintuitive insights from this blog post? Format your response in Markdown with 5 numbered points (1. 2. 3. 4. 5.) https://huyenchip.com/2025/01/07/agents.html".to_string()
).await;
// Write response to file and get the text content
let response_text =
match write_response_to_file(&response, work_dir, "blog_summary_output.txt") {
match write_response_to_file(&response, run_loc, "blog_summary_output.txt") {
Ok(text) => text,
Err(e) => {
println!("Warning: Failed to write blog summary output: {}", e);
@@ -54,14 +55,14 @@ impl Evaluation for BlogSummary {
let has_markdown_list = self.check_markdown_numbered_list(&response_text);
metrics.push((
"valid_markdown_format".to_string(),
EvaluationMetric::Boolean(has_markdown_list),
EvalMetricValue::Boolean(has_markdown_list),
));
// Check if the fetch tool was used
let used_fetch_tool = crate::eval_suites::used_tool(&response, "fetch");
metrics.push((
"used_fetch_tool".to_string(),
EvaluationMetric::Boolean(used_fetch_tool),
EvalMetricValue::Boolean(used_fetch_tool),
));
// Copy the session file to the current working directory

View File

@@ -1,7 +1,8 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, BenchAgent, Evaluation,
EvaluationMetric, ExtensionRequirements,
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, EvalMetricValue,
Evaluation, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
@@ -31,14 +32,14 @@ impl FlappyBird {
impl Evaluation for FlappyBird {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
println!("FlappyBird - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Create a Flappy Bird game in Python. Structure the code with a main function and use the if __name__ == '__main__': idiom. You must use pygame. The background color should be a light blue color. Pressing SPACE multiple times will accelerate the bird. The bird's shape should be a red circle. Place on the bottom some land colored as dark yellow chosen. Make a score shown on the top right side. Increment if you pass pipes and don't hit them. Make randomly spaced dark green pipes with enough space. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again. When trying to run the game, make sure to use pyenv and create the environment in the current working directory. The final game should be written to a file named flappy_bird.py. Remember to use your tools if applicable.".to_string()
).await;
@@ -80,17 +81,17 @@ impl Evaluation for FlappyBird {
metrics.push((
"used_write_tool".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
EvalMetricValue::Boolean(valid_tool_call),
));
// If tool was used correctly, check the actual file content
if valid_tool_call {
if let Ok(file_path) = work_dir.fs_get("flappy_bird.py".to_string()) {
if let Ok(file_path) = run_loc.fs_get("flappy_bird.py".to_string()) {
if let Ok(content) = fs::read_to_string(file_path) {
let valid_implementation = self.check_python_implementation(&content);
metrics.push((
"valid_implementation".to_string(),
EvaluationMetric::Boolean(valid_implementation),
EvalMetricValue::Boolean(valid_implementation),
));
}
}

View File

@@ -1,7 +1,8 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, BenchAgent, Evaluation,
EvaluationMetric, ExtensionRequirements,
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, EvalMetricValue,
Evaluation, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
@@ -21,14 +22,14 @@ impl GooseWiki {
impl Evaluation for GooseWiki {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
_: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
_run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
println!("GooseWiki - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Create a Wikipedia-style web page about Goose (Block's AI agent) in a new index.html file. The page should be a complete, well-structured HTML document with proper head and body sections. Use heading tags (h1, h2, h3) to organize the content into clear sections. Include comprehensive information about Goose organized in a way similar to how Wikipedia presents technical topics. Remember to use your tools if applicable.".to_string()
).await;
@@ -71,7 +72,7 @@ impl Evaluation for GooseWiki {
metrics.push((
"created_valid_html".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
EvalMetricValue::Boolean(valid_tool_call),
));
// Copy the session file to the current working directory

View File

@@ -1,7 +1,8 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, write_response_to_file,
BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements,
EvalMetricValue, Evaluation, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
@@ -30,14 +31,14 @@ impl RestaurantResearch {
impl Evaluation for RestaurantResearch {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
println!("RestaurantResearch - run");
// Collect baseline metrics (execution time, token usage, tool calls)
let (response, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
"Search the internet for and provide a current, detailed list of the best Sichuanese restaurants specifically in the East Village neighborhood of NYC. Format your response in Markdown using bullet points (either - or *) for each restaurant. For each restaurant include:
- Restaurant name and what they're known for
- Signature dishes
@@ -50,7 +51,7 @@ Present the information in order of significance or quality. Focus specifically
// Write response to file and get the text content
let response_text =
match write_response_to_file(&response, work_dir, "restaurant_research_output.txt") {
match write_response_to_file(&response, run_loc, "restaurant_research_output.txt") {
Ok(text) => text,
Err(e) => {
println!("Warning: Failed to write restaurant research output: {}", e);
@@ -70,18 +71,18 @@ Present the information in order of significance or quality. Focus specifically
metrics.push((
"valid_markdown_format".to_string(),
EvaluationMetric::Boolean(has_markdown_bullets),
EvalMetricValue::Boolean(has_markdown_bullets),
));
metrics.push((
"bullet_point_count".to_string(),
EvaluationMetric::Integer(bullet_count),
EvalMetricValue::Integer(bullet_count),
));
// Check if the fetch tool was used
let used_fetch_tool = crate::eval_suites::used_tool(&response, "fetch");
metrics.push((
"used_fetch_tool".to_string(),
EvaluationMetric::Boolean(used_fetch_tool),
EvalMetricValue::Boolean(used_fetch_tool),
));
// Copy the session file to the current working directory

View File

@@ -1,7 +1,8 @@
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, BenchAgent, Evaluation,
EvaluationMetric, ExtensionRequirements,
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, EvalMetricValue,
Evaluation, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;
@@ -30,13 +31,13 @@ impl SquirrelCensus {
impl Evaluation for SquirrelCensus {
async fn run(
&self,
mut agent: Box<dyn BenchAgent>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
agent: &mut BenchAgent,
run_loc: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvalMetricValue)>> {
println!("SquirrelCensus - run");
// Get the path to the squirrel data file
let squirrel_data_path = match work_dir.fs_get("./assets/squirrel-data.csv".to_string()) {
let squirrel_data_path = match run_loc.fs_get("./assets/squirrel-data.csv".to_string()) {
Ok(file) => file,
Err(_) => return Err(anyhow::anyhow!("Could not find squirrel-data.csv file")),
};
@@ -45,7 +46,7 @@ impl Evaluation for SquirrelCensus {
// Collect baseline metrics (execution time, token usage, tool calls)
let (messages, perf_metrics) = collect_baseline_metrics(
&mut agent,
agent,
format!(
"Create a Python script called analyze_squirrels.py that analyzes the CSV file at {}. Do not ask for any clarification or further instructions - proceed with the implementation as specified below.
@@ -141,15 +142,15 @@ After writing the script, run it using python3 and show the results. Do not ask
metrics.push((
"wrote_script".to_string(),
EvaluationMetric::Boolean(wrote_script),
EvalMetricValue::Boolean(wrote_script),
));
metrics.push((
"ran_script".to_string(),
EvaluationMetric::Boolean(ran_script),
EvalMetricValue::Boolean(ran_script),
));
metrics.push((
"correct_results".to_string(),
EvaluationMetric::Boolean(correct_results),
EvalMetricValue::Boolean(correct_results),
));
// Copy the session file to the current working directory

View File

@@ -1,4 +1,8 @@
pub mod bench_config;
pub mod bench_session;
pub mod bench_work_dir;
pub mod error_capture;
pub mod eval_suites;
pub mod reporting;
pub mod runners;
pub mod utilities;

View File

@@ -1,25 +1,26 @@
use crate::eval_suites::{BenchAgentError, EvaluationMetric};
use crate::bench_session::BenchAgentError;
use crate::eval_suites::EvalMetricValue;
use chrono::Local;
use serde::Serialize;
use serde::{Deserialize, Serialize};
use std::fmt;
/// Represents a single evaluation result
#[derive(Default, Serialize)]
#[derive(Default, Deserialize, Serialize)]
pub struct EvaluationResult {
pub name: String,
pub metrics: Vec<(String, EvaluationMetric)>,
pub metrics: Vec<(String, EvalMetricValue)>,
pub errors: Vec<BenchAgentError>,
}
/// Represents results for an entire suite
#[derive(Default, Serialize)]
#[derive(Default, Deserialize, Serialize)]
pub struct SuiteResult {
pub name: String,
pub evaluations: Vec<EvaluationResult>,
}
/// Contains all benchmark results and metadata
#[derive(Default, Serialize)]
#[derive(Default, Deserialize, Serialize)]
pub struct BenchmarkResults {
pub provider: String,
pub start_time: String,
@@ -35,7 +36,7 @@ impl EvaluationResult {
}
}
pub fn add_metric(&mut self, name: String, metric: EvaluationMetric) {
pub fn add_metric(&mut self, name: String, metric: EvalMetricValue) {
self.metrics.push((name, metric));
}
@@ -97,13 +98,13 @@ impl BenchmarkResults {
}
}
impl fmt::Display for EvaluationMetric {
impl fmt::Display for EvalMetricValue {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
EvaluationMetric::Integer(i) => write!(f, "{}", i),
EvaluationMetric::Float(fl) => write!(f, "{:.2}", fl),
EvaluationMetric::String(s) => write!(f, "{}", s),
EvaluationMetric::Boolean(b) => write!(f, "{}", b),
EvalMetricValue::Integer(i) => write!(f, "{}", i),
EvalMetricValue::Float(fl) => write!(f, "{:.2}", fl),
EvalMetricValue::String(s) => write!(f, "{}", s),
EvalMetricValue::Boolean(b) => write!(f, "{}", b),
}
}
}

View File

@@ -0,0 +1,75 @@
use crate::bench_config::{BenchModel, BenchRunConfig};
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::EvaluationSuite;
use crate::runners::model_runner::ModelRunner;
use crate::utilities::{await_process_exits, parallel_bench_cmd};
use std::path::PathBuf;
#[derive(Clone)]
pub struct BenchRunner {
config: BenchRunConfig,
}
impl BenchRunner {
pub fn new(config: PathBuf) -> anyhow::Result<BenchRunner> {
let config = BenchRunConfig::from(config)?;
BenchmarkWorkDir::init_experiment();
config.save("config.cfg".to_string());
Ok(BenchRunner { config })
}
pub fn from(config: String) -> anyhow::Result<BenchRunner> {
let config = BenchRunConfig::from_string(config)?;
Ok(BenchRunner { config })
}
pub fn run(&mut self) -> anyhow::Result<()> {
// split models that must run serial from those that can be run in parallel
let (parallel_models, serial_models): &(Vec<BenchModel>, Vec<BenchModel>) = &self
.config
.models
.clone()
.into_iter()
.partition(|model| model.parallel_safe);
// exec parallel models
let mut parallel_models_handle = Vec::new();
for model in parallel_models {
self.config.models = vec![model.clone()];
let cfg = self.config.to_string()?;
let model_handle = parallel_bench_cmd("eval-model".to_string(), cfg, Vec::new());
parallel_models_handle.push(model_handle);
}
// exec serial models
for model in serial_models {
self.config.models = vec![model.clone()];
ModelRunner::from(self.config.to_string()?)?.run()?;
}
await_process_exits(&mut parallel_models_handle, Vec::new());
Ok(())
}
pub fn list_selectors(_config: Option<PathBuf>) -> anyhow::Result<()> {
let selector_eval_counts = EvaluationSuite::available_selectors();
let mut keys: Vec<_> = selector_eval_counts.keys().collect();
keys.sort();
let max_key_len = keys.iter().map(|k| k.len()).max().unwrap_or(0);
println!(
"selector {} => Eval Count",
" ".repeat(max_key_len - "selector".len())
);
println!("{}", "-".repeat(max_key_len + 6));
for selector in keys {
println!(
"{} {} => {}",
selector,
" ".repeat(max_key_len - selector.len()),
selector_eval_counts.get(selector).unwrap()
);
}
Ok(())
}
}
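Driving a full run from a saved config takes two calls. A sketch (not part of this commit; the filename is illustrative):
fn run_from_config() -> anyhow::Result<()> {
    // new() parses the config, calls BenchmarkWorkDir::init_experiment to create and
    // cd into a benchmark-<date>-<time> directory, and writes a config.cfg copy there.
    let mut runner = BenchRunner::new(PathBuf::from("benchconf.json"))?;
    // Parallel-safe models are launched as eval-model child processes; the rest run serially.
    runner.run()
}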

View File

@@ -0,0 +1,120 @@
use crate::bench_config::{BenchEval, BenchModel, BenchRunConfig};
use crate::bench_session::BenchAgent;
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{EvaluationSuite, ExtensionRequirements};
use crate::reporting::EvaluationResult;
use crate::utilities::await_process_exits;
use std::env;
use std::fs;
use std::future::Future;
use std::path::PathBuf;
use std::process::Command;
#[derive(Clone)]
pub struct EvalRunner {
config: BenchRunConfig,
}
impl EvalRunner {
pub fn from(config: String) -> anyhow::Result<EvalRunner> {
let config = BenchRunConfig::from_string(config)?;
Ok(EvalRunner { config })
}
fn create_work_dir(&self, config: &BenchRunConfig) -> anyhow::Result<BenchmarkWorkDir> {
let goose_model = config.models.first().unwrap();
let model_name = goose_model.name.clone();
let provider_name = goose_model.provider.clone();
// include a shim component in the work-dir name only if the shim is configured to be used
let work_dir_name_shim = {
let mut shim_name = "".to_string();
if let Some(shim_opt) = &goose_model.tool_shim {
if shim_opt.use_tool_shim {
let shim_model = if let Some(shim_model) = &shim_opt.tool_shim_model {
shim_model.clone()
} else {
"default".to_string()
};
shim_name = format!("-{}-shim-model", shim_model);
}
}
shim_name
};
let include_dir = config.include_dirs.clone();
let work_dir_name = format!("{}-{}{}", provider_name, model_name, work_dir_name_shim);
let work_dir = BenchmarkWorkDir::new(work_dir_name, include_dir);
Ok(work_dir)
}
pub async fn run<F, Fut>(&mut self, agent_generator: F) -> anyhow::Result<()>
where
F: Fn(ExtensionRequirements, String) -> Fut,
Fut: Future<Output = BenchAgent> + Send,
{
let mut work_dir = self.create_work_dir(&self.config)?;
let bench_eval = self.config.evals.first().unwrap();
let run_id = &self
.config
.run_id
.clone()
.unwrap_or_else(|| "run-0".to_string());
let run_id = format!("run-{}", run_id.clone());
// create entire dir subtree for eval and cd into dir for running eval
work_dir.set_eval(&bench_eval.selector, run_id);
if let Some(eval) = EvaluationSuite::from(&bench_eval.selector) {
let session_id = bench_eval.selector.clone();
let mut agent = agent_generator(eval.required_extensions(), session_id).await;
let mut result = EvaluationResult::new(eval.name().to_string());
if let Ok(metrics) = eval.run(&mut agent, &mut work_dir).await {
for (name, metric) in metrics {
result.add_metric(name, metric);
}
// Add any errors that occurred
for error in agent.get_errors().await {
result.add_error(error);
}
}
let eval_results = serde_json::to_string_pretty(&result)?;
let eval_results_file = env::current_dir()?.join(&self.config.eval_result_filename);
fs::write(&eval_results_file, &eval_results)?;
self.config.save("config.cfg".to_string());
work_dir.save();
// handle running post-process cmd if configured
if let Some(cmd) = &bench_eval.post_process_cmd {
let handle = Command::new(cmd).arg(&eval_results_file).spawn()?;
await_process_exits(&mut [handle], Vec::new());
}
// copy session file into eval-dir
let here = env::current_dir()?.canonicalize()?;
BenchmarkWorkDir::deep_copy(agent.session_file().as_path(), here.as_path(), false)?;
}
Ok(())
}
pub fn path_for_eval(model: &BenchModel, eval: &BenchEval, run_id: String) -> PathBuf {
let provider = model.provider.clone();
let model = model.name.clone();
let eval_path = &eval.selector.replace(":", std::path::MAIN_SEPARATOR_STR);
let eval_results_location = format!(
"{}-{}/run-{}{}{}",
&provider,
model,
run_id,
std::path::MAIN_SEPARATOR_STR,
eval_path
);
PathBuf::from(eval_results_location.clone())
}
}
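path_for_eval mirrors the layout that create_work_dir and set_eval produce, so results written by exec-eval child processes can be located afterwards. A sketch (not part of this commit), using the default databricks/goose model and the core selector:
fn example_eval_path() -> PathBuf {
    let model = BenchModel {
        provider: "databricks".to_string(),
        name: "goose".to_string(),
        parallel_safe: true,
        tool_shim: None,
    };
    let eval = BenchEval {
        selector: "core".to_string(),
        post_process_cmd: None,
        parallel_safe: true,
    };
    // With Unix separators this evaluates to "databricks-goose/run-0/core".
    EvalRunner::path_for_eval(&model, &eval, "0".to_string())
}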

View File

@@ -0,0 +1,3 @@
pub mod bench_runner;
pub mod eval_runner;
pub mod model_runner;

View File

@@ -0,0 +1,236 @@
use crate::bench_config::{BenchEval, BenchModel, BenchRunConfig};
use crate::eval_suites::EvaluationSuite;
use crate::reporting::{BenchmarkResults, SuiteResult};
use crate::runners::eval_runner::EvalRunner;
use crate::utilities::{await_process_exits, parallel_bench_cmd, union_hashmaps};
use std::collections::HashMap;
use std::fs::read_to_string;
use std::io::{self, BufRead};
use std::path::PathBuf;
use std::process::Child;
use std::thread;
#[derive(Clone)]
pub struct ModelRunner {
config: BenchRunConfig,
}
impl ModelRunner {
pub fn from(config: String) -> anyhow::Result<ModelRunner> {
let config = BenchRunConfig::from_string(config)?;
Ok(ModelRunner { config })
}
pub fn run(&self) -> anyhow::Result<()> {
let model = self.config.models.first().unwrap();
let suites = self.collect_evals_for_run();
let mut handles = vec![];
for i in 0..self.config.repeat.unwrap_or(1) {
let mut self_copy = self.clone();
let model_clone = model.clone();
let suites_clone = suites.clone();
// create thread to handle launching parallel processes to run model's evals in parallel
let handle = thread::spawn(move || {
self_copy.run_benchmark(&model_clone, suites_clone, i.to_string())
});
handles.push(handle);
}
await_process_exits(&mut Vec::new(), handles);
let mut all_runs_results: Vec<BenchmarkResults> = Vec::new();
for i in 0..self.config.repeat.unwrap_or(1) {
let run_results =
self.collect_run_results(model.clone(), suites.clone(), i.to_string())?;
all_runs_results.push(run_results);
}
// write summary file
Ok(())
}
fn load_env_file(&self, path: &PathBuf) -> anyhow::Result<Vec<(String, String)>> {
let file = std::fs::File::open(path)?;
let reader = io::BufReader::new(file);
let mut env_vars = Vec::new();
for line in reader.lines() {
let line = line?;
// Skip empty lines and comments
if line.trim().is_empty() || line.trim_start().starts_with('#') {
continue;
}
// Split on first '=' only
if let Some((key, value)) = line.split_once('=') {
let key = key.trim().to_string();
// Remove quotes if present
let value = value
.trim()
.trim_matches('"')
.trim_matches('\'')
.to_string();
env_vars.push((key, value));
}
}
Ok(env_vars)
}
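// Illustrative env-file contents accepted by load_env_file (variable names are
// hypothetical, not part of this commit):
//
//   # credentials forwarded to the spawned exec-eval processes
//   MY_PROVIDER_API_KEY="secret-value"
//   MY_PROVIDER_HOST=https://example.invalid
//
// Blank lines and '#' comments are skipped; each remaining line is split on the
// first '=' and surrounding single or double quotes are trimmed from the value.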
fn run_benchmark(
&mut self,
model: &BenchModel,
suites: HashMap<String, Vec<BenchEval>>,
run_id: String,
) -> anyhow::Result<()> {
let mut results_handles = HashMap::<String, Vec<Child>>::new();
// Load environment variables from file if specified
let mut envs = self.toolshim_envs();
if let Some(env_file) = &self.config.env_file {
let env_vars = self.load_env_file(env_file)?;
envs.extend(env_vars);
}
envs.push(("GOOSE_MODEL".to_string(), model.clone().name));
envs.push(("GOOSE_PROVIDER".to_string(), model.clone().provider));
// Only run in parallel if the model is parallel_safe
let run_parallel = model.parallel_safe;
for (suite, evals) in suites.iter() {
results_handles.insert((*suite).clone(), Vec::new());
// Group evaluations by parallel_safe
let mut parallel_evals = Vec::new();
let mut sequential_evals = Vec::new();
for eval in evals {
if eval.parallel_safe && run_parallel {
parallel_evals.push(eval);
} else {
sequential_evals.push(eval);
}
}
// Run parallel-safe evaluations in parallel
if !parallel_evals.is_empty() {
for eval_selector in &parallel_evals {
self.config.run_id = Some(run_id.clone());
self.config.evals = vec![(*eval_selector).clone()];
let cfg = self.config.to_string()?;
let handle = parallel_bench_cmd("exec-eval".to_string(), cfg, envs.clone());
results_handles.get_mut(suite).unwrap().push(handle);
}
}
// Run non-parallel-safe evaluations sequentially
for eval_selector in &sequential_evals {
self.config.run_id = Some(run_id.clone());
self.config.evals = vec![(*eval_selector).clone()];
let cfg = self.config.to_string()?;
let handle = parallel_bench_cmd("exec-eval".to_string(), cfg, envs.clone());
// Wait for this process to complete before starting the next one
let mut child_procs = vec![handle];
await_process_exits(&mut child_procs, Vec::new());
}
}
// Wait for any remaining parallel processes to complete
for (_, child_procs) in results_handles.iter_mut() {
await_process_exits(child_procs, Vec::new());
}
Ok(())
}
fn collect_run_results(
&self,
model: BenchModel,
suites: HashMap<String, Vec<BenchEval>>,
run_id: String,
) -> anyhow::Result<BenchmarkResults> {
let mut results = BenchmarkResults::new(model.provider.clone());
let mut summary_path: Option<PathBuf> = None;
for (suite, evals) in suites.iter() {
let mut suite_result = SuiteResult::new(suite.clone());
for eval_selector in evals {
let mut eval_path =
EvalRunner::path_for_eval(&model, eval_selector, run_id.clone());
eval_path.push(self.config.eval_result_filename.clone());
let eval_result = serde_json::from_str(&read_to_string(&eval_path)?)?;
suite_result.add_evaluation(eval_result);
// use current eval to determine where the summary should be written
if summary_path.is_none() {
let mut result = PathBuf::new();
let mut iter = eval_path.components();
if let Some(first) = iter.next() {
result.push(first);
if let Some(second) = iter.next() {
result.push(second);
}
}
summary_path = Some(result);
}
}
results.add_suite(suite_result);
}
let mut run_summary = PathBuf::new();
run_summary.push(summary_path.clone().unwrap());
run_summary.push(&self.config.run_summary_filename);
let output_str = serde_json::to_string_pretty(&results)?;
std::fs::write(run_summary, &output_str)?;
Ok(results)
}
fn collect_evals_for_run(&self) -> HashMap<String, Vec<BenchEval>> {
// convert the suites map {suite_name => [eval_selector_str]} into {suite_name => [BenchEval]}
let suites = self
.config
.evals
.iter()
.map(|eval| {
EvaluationSuite::select(vec![eval.clone().selector])
.iter()
.map(|(suite, evals)| {
let bench_evals = evals
.iter()
.map(|suite_eval| {
let mut updated_eval = eval.clone();
updated_eval.selector = (*suite_eval).to_string();
updated_eval
})
.collect::<Vec<_>>();
(suite.clone(), bench_evals)
})
.collect()
})
.collect();
union_hashmaps(suites)
}
fn toolshim_envs(&self) -> Vec<(String, String)> {
// read tool-shim preference from config, set respective env vars accordingly
let model = self.config.models.first().unwrap();
let mut shim_envs: Vec<(String, String)> = Vec::new();
if let Some(shim_opt) = &model.tool_shim {
if shim_opt.use_tool_shim {
shim_envs.push(("GOOSE_TOOLSHIM".to_string(), "true".to_string()));
if let Some(shim_model) = &shim_opt.tool_shim_model {
shim_envs.push((
"GOOSE_TOOLSHIM_OLLAMA_MODEL".to_string(),
shim_model.clone(),
));
}
}
}
shim_envs
}
}

View File

@@ -0,0 +1,52 @@
use std::collections::HashMap;
use std::env;
use std::process::{Child, Command};
use std::thread::JoinHandle;
pub fn union_hashmaps<K, V>(maps: Vec<HashMap<K, V>>) -> HashMap<K, V>
where
K: Eq + std::hash::Hash,
V: Clone,
{
// We can use the fold method to accumulate all maps into one
maps.into_iter().fold(HashMap::new(), |mut result, map| {
// For each map in the vector, extend the result with its entries
result.extend(map);
result
})
}
pub fn await_process_exits(
child_processes: &mut [Child],
handles: Vec<JoinHandle<anyhow::Result<()>>>,
) {
for child in child_processes.iter_mut() {
match child.wait() {
Ok(status) => println!("Child exited with status: {}", status),
Err(e) => println!("Error waiting for child: {}", e),
}
}
for handle in handles {
match handle.join() {
Ok(_res) => (),
Err(e) => {
// Handle thread panic
println!("Thread panicked: {:?}", e);
}
}
}
}
pub fn parallel_bench_cmd(bench_cmd: String, config: String, envs: Vec<(String, String)>) -> Child {
let current_exe = env::current_exe().expect("Failed to get current executable path");
let mut cmd = Command::new(current_exe);
cmd.arg("bench").arg(bench_cmd).arg("--config").arg(config);
for (key, value) in envs.into_iter() {
cmd.env(key, value);
}
cmd.spawn().expect("Failed to spawn child process")
}
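A small sketch of union_hashmaps semantics (not part of this commit): because extend overwrites existing keys, entries from later maps win on duplicates.
fn union_example() {
    let a = HashMap::from([("core".to_string(), 1)]);
    let b = HashMap::from([("core".to_string(), 2), ("vibes".to_string(), 3)]);
    let merged = union_hashmaps(vec![a, b]);
    assert_eq!(merged.get("core"), Some(&2)); // later map wins
    assert_eq!(merged.len(), 2);
}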

View File

@@ -4,7 +4,7 @@ use clap::{Args, Parser, Subcommand};
use goose::config::Config;
use crate::commands::agent_version::AgentCommand;
use crate::commands::bench::{list_selectors, run_benchmark};
use crate::commands::bench::agent_generator;
use crate::commands::configure::handle_configure;
use crate::commands::info::handle_info;
use crate::commands::mcp::run_server;
@@ -12,6 +12,10 @@ use crate::commands::session::handle_session_list;
use crate::logging::setup_logging;
use crate::session;
use crate::session::build_session;
use goose_bench::bench_config::BenchRunConfig;
use goose_bench::runners::bench_runner::BenchRunner;
use goose_bench::runners::eval_runner::EvalRunner;
use goose_bench::runners::model_runner::ModelRunner;
use std::io::Read;
use std::path::PathBuf;
@@ -71,6 +75,47 @@ enum SessionCommand {
},
}
#[derive(Subcommand)]
pub enum BenchCommand {
#[command(name = "init-config", about = "Create a new starter-config")]
InitConfig {
#[arg(short, long, help = "filename with extension for generated config")]
name: String,
},
#[command(about = "Run all benchmarks from a config")]
Run {
#[arg(
short,
long,
help = "A config file generated by the config-init command"
)]
config: PathBuf,
},
#[command(about = "List all available selectors")]
Selectors {
#[arg(
short,
long,
help = "A config file generated by the config-init command"
)]
config: Option<PathBuf>,
},
#[command(name = "eval-model", about = "Run an eval of model")]
EvalModel {
#[arg(short, long, help = "A serialized config file for the model only.")]
config: String,
},
#[command(name = "exec-eval", about = "run a single eval")]
ExecEval {
#[arg(short, long, help = "A serialized config file for the eval only.")]
config: String,
},
}
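// Typical invocations of the new subcommands (not in this commit; illustrative):
//
//   goose bench init-config --name benchconf.json
//   goose bench selectors
//   goose bench run --config benchconf.json
//
// eval-model and exec-eval take a serialized config string and are normally
// spawned internally by the runners rather than invoked by hand.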
#[derive(Subcommand)]
enum Command {
/// Configure Goose settings
@@ -255,63 +300,8 @@ enum Command {
},
Bench {
#[arg(
short = 's',
long = "selectors",
value_name = "EVALUATIONS_SELECTOR",
help = "Run this list of bench-suites.",
long_help = "Specify a comma-separated list of evaluation-suite names to be run.",
value_delimiter = ','
)]
selectors: Vec<String>,
#[arg(
short = 'i',
long = "include-dir",
value_name = "DIR_NAME",
action = clap::ArgAction::Append,
long_help = "Make one or more dirs available to all bench suites. Specify either a single dir-name, a comma-separated list of dir-names, or use this multiple instances of this flag to specify multiple dirs.",
value_delimiter = ','
)]
include_dirs: Vec<PathBuf>,
#[arg(
long = "repeat",
value_name = "QUANTITY",
long_help = "Number of times to repeat the benchmark run.",
default_value = "1"
)]
repeat: usize,
#[arg(
long = "list",
value_name = "LIST",
help = "List all selectors and the number of evaluations they select."
)]
list: bool,
#[arg(
long = "output",
short = 'o',
value_name = "FILE",
help = "Save benchmark results to a file"
)]
output: Option<PathBuf>,
#[arg(
long = "format",
value_name = "FORMAT",
help = "Output format (text, json)",
default_value = "text"
)]
format: String,
#[arg(
long = "summary",
help = "Show only summary results",
action = clap::ArgAction::SetTrue
)]
summary: bool,
#[command(subcommand)]
cmd: BenchCommand,
},
}
@@ -346,10 +336,10 @@ pub async fn cli() -> Result<()> {
remote_extension,
builtin,
}) => {
match command {
return match command {
Some(SessionCommand::List { verbose, format }) => {
handle_session_list(verbose, format)?;
return Ok(());
Ok(())
}
None => {
// Run session command by default
@@ -367,9 +357,9 @@ pub async fn cli() -> Result<()> {
None,
)?;
let _ = session.interactive(None).await;
return Ok(());
}
Ok(())
}
};
}
Some(Command::Run {
instructions,
@@ -438,58 +428,22 @@ pub async fn cli() -> Result<()> {
crate::commands::update::update(canary, reconfigure)?;
return Ok(());
}
Some(Command::Bench {
selectors,
include_dirs,
repeat,
list,
output,
format,
summary,
}) => {
if list {
return list_selectors().await;
}
let selectors = if selectors.is_empty() {
vec!["core".to_string()]
} else {
selectors
};
let current_dir = std::env::current_dir()?;
for i in 0..repeat {
if repeat > 1 {
println!("\nRun {} of {}:", i + 1, repeat);
}
let results = run_benchmark(selectors.clone(), include_dirs.clone()).await?;
// Handle output based on format
let output_str = match format.as_str() {
"json" => serde_json::to_string_pretty(&results)?,
_ => results.to_string(), // Uses Display impl
};
// Save to file if specified
if let Some(path) = &output {
std::fs::write(current_dir.join(path), &output_str)?;
println!("Results saved to: {}", path.display());
} else {
// Print to console
if summary {
println!("{}", results.summary());
} else {
println!("{}", output_str);
}
Some(Command::Bench { cmd }) => {
match cmd {
BenchCommand::Selectors { config } => BenchRunner::list_selectors(config)?,
BenchCommand::InitConfig { name } => BenchRunConfig::default().save(name),
BenchCommand::Run { config } => BenchRunner::new(config)?.run()?,
BenchCommand::EvalModel { config } => ModelRunner::from(config)?.run()?,
BenchCommand::ExecEval { config } => {
EvalRunner::from(config)?.run(agent_generator).await?
}
}
return Ok(());
}
None => {
if !Config::global().exists() {
return if !Config::global().exists() {
let _ = handle_configure().await;
return Ok(());
Ok(())
} else {
// Run session command by default
let mut session = build_session(None, false, vec![], vec![], vec![], false).await;
@@ -498,8 +452,8 @@ pub async fn cli() -> Result<()> {
None,
)?;
let _ = session.interactive(None).await;
return Ok(());
}
Ok(())
};
}
}
Ok(())

View File

@@ -1,88 +1,37 @@
use crate::logging;
use crate::session::build_session;
use crate::Session;
use crate::{logging, session, Session};
use async_trait::async_trait;
use goose::config::Config;
use goose::message::Message;
use goose_bench::bench_work_dir::BenchmarkWorkDir;
use goose_bench::eval_suites::{BenchAgent, BenchAgentError, Evaluation, EvaluationSuite};
use goose_bench::reporting::{BenchmarkResults, EvaluationResult, SuiteResult};
use goose_bench::bench_session::{BenchAgent, BenchBaseSession};
use goose_bench::eval_suites::ExtensionRequirements;
use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::Mutex;
pub struct BenchSession {
session: Session,
errors: Arc<Mutex<Vec<BenchAgentError>>>,
}
impl BenchSession {
pub fn new(session: Session) -> Self {
let errors = Arc::new(Mutex::new(Vec::new()));
// Initialize logging with error capture
logging::setup_logging(Some("bench"), Some(errors.clone()))
.expect("Failed to initialize logging");
Self { session, errors }
}
}
// allow session obj to be used in benchmarking
#[async_trait]
impl BenchAgent for BenchSession {
async fn prompt(&mut self, p: String) -> anyhow::Result<Vec<Message>> {
// Clear previous errors
{
let mut errors = self.errors.lock().await;
errors.clear();
impl BenchBaseSession for Session {
async fn headless(&mut self, message: String) -> anyhow::Result<()> {
self.headless(message).await
}
self.session.headless(p).await?;
Ok(self.session.message_history())
fn session_file(&self) -> PathBuf {
self.session_file()
}
async fn get_errors(&self) -> Vec<BenchAgentError> {
let errors = self.errors.lock().await;
errors.clone()
fn message_history(&self) -> Vec<Message> {
self.message_history()
}
async fn get_token_usage(&self) -> Option<i32> {
self.session.get_total_token_usage().ok().flatten()
fn get_total_token_usage(&self) -> anyhow::Result<Option<i32>> {
self.get_total_token_usage()
}
}
pub async fn agent_generator(
requirements: ExtensionRequirements,
session_id: String,
) -> BenchAgent {
let identifier = Some(session::Identifier::Name(session_id));
// Wrapper struct to implement BenchAgent for Arc<Mutex<BenchSession>>
struct BenchAgentWrapper(Arc<Mutex<BenchSession>>);
#[async_trait]
impl BenchAgent for BenchAgentWrapper {
async fn prompt(&mut self, p: String) -> anyhow::Result<Vec<Message>> {
let mut session = self.0.lock().await;
session.prompt(p).await
}
async fn get_errors(&self) -> Vec<BenchAgentError> {
let session = self.0.lock().await;
session.get_errors().await
}
async fn get_token_usage(&self) -> Option<i32> {
let session = self.0.lock().await;
session.get_token_usage().await
}
}
async fn run_eval(
evaluation: Box<dyn Evaluation>,
work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<EvaluationResult> {
let mut result = EvaluationResult::new(evaluation.name().to_string());
let requirements = evaluation.required_extensions();
// Create session with error capture
let base_session = build_session(
None,
identifier,
false,
requirements.external,
requirements.remote,
@@ -91,84 +40,12 @@ async fn run_eval(
)
.await;
let bench_session = Arc::new(Mutex::new(BenchSession::new(base_session)));
let bench_session_clone = bench_session.clone();
// package session obj into benchmark-compatible struct
let bench_agent = BenchAgent::new(Box::new(base_session));
if let Ok(metrics) = evaluation
.run(Box::new(BenchAgentWrapper(bench_session)), work_dir)
.await
{
for (name, metric) in metrics {
result.add_metric(name, metric);
}
// Add any errors that occurred
let agent = BenchAgentWrapper(bench_session_clone);
for error in agent.get_errors().await {
result.add_error(error);
}
}
let current_dir = std::env::current_dir()?;
let output_str = serde_json::to_string_pretty(&result)?;
std::fs::write(current_dir.join("eval_result.json"), &output_str)?;
Ok(result)
}
pub async fn run_benchmark(
selectors: Vec<String>,
include_dirs: Vec<PathBuf>,
) -> anyhow::Result<BenchmarkResults> {
let config = Config::global();
let goose_model: String = config
.get_param("GOOSE_MODEL")
.expect("No model configured. Run 'goose configure' first");
let provider_name: String = config
.get_param("GOOSE_PROVIDER")
.expect("No provider configured. Run 'goose configure' first");
let mut results = BenchmarkResults::new(provider_name.clone());
let work_dir = Mutex::new(BenchmarkWorkDir::new(
format!("{}-{}", provider_name, goose_model),
include_dirs.clone(),
));
for (suite, evals) in EvaluationSuite::select(selectors).iter() {
let mut suite_result = SuiteResult::new(suite.clone());
for eval_selector in evals {
if let Some(eval) = EvaluationSuite::from(eval_selector) {
let mut work_dir = work_dir.lock().await;
work_dir.set_eval(eval_selector);
let eval_result = run_eval(eval, &mut work_dir).await?;
suite_result.add_evaluation(eval_result);
}
}
results.add_suite(suite_result);
}
Ok(results)
}
pub async fn list_selectors() -> anyhow::Result<()> {
let selector_eval_counts = EvaluationSuite::available_selectors();
let mut keys: Vec<_> = selector_eval_counts.keys().collect();
keys.sort();
let max_key_len = keys.iter().map(|k| k.len()).max().unwrap_or(0);
println!(
"selector {} => Eval Count",
" ".repeat(max_key_len - "selector".len())
);
println!("{}", "-".repeat(max_key_len + 6));
for selector in keys {
println!(
"{} {} => {}",
selector,
" ".repeat(max_key_len - selector.len()),
selector_eval_counts.get(selector).unwrap()
);
}
Ok(())
// Initialize logging with error capture
let errors = Some(Arc::new(Mutex::new(bench_agent.get_errors().await)));
logging::setup_logging(Some("bench"), errors).expect("Failed to initialize logging");
bench_agent
}

View File

@@ -12,8 +12,8 @@ use tracing_subscriber::{
};
use goose::tracing::langfuse_layer;
use goose_bench::bench_session::BenchAgentError;
use goose_bench::error_capture::ErrorCaptureLayer;
use goose_bench::eval_suites::BenchAgentError;
// Used to ensure we only set up tracing once
static INIT: Once = Once::new();

View File

@@ -0,0 +1,189 @@
---
sidebar_position: 7
---
# Benchmarking with Goose
The Goose benchmarking system allows you to evaluate Goose's performance on complex tasks with one or more system configurations.<br></br>
This guide covers how to use the `goose bench` command to run benchmarks and analyze results.
## Configuration File
The benchmark configuration is specified in a JSON file with the following structure:
```json
{
"models": [
{
"provider": "databricks",
"name": "goose",
"parallel_safe": true,
"tool_shim": {
"use_tool_shim": false,
"tool_shim_model": null
}
}
],
"evals": [
{
"selector": "core",
"post_process_cmd": null,
"parallel_safe": true
}
],
"include_dirs": [],
"repeat": 2,
"run_id": null,
"eval_result_filename": "eval-results.json",
"run_summary_filename": "run-results-summary.json",
"env_file": null
}
```
### Configuration Options
#### Models Section
Each model entry in the `models` array specifies:
- `provider`: The model provider (e.g., "databricks")
- `name`: Model identifier
- `parallel_safe`: Whether the model can be run in parallel
- `tool_shim`: Optional configuration for tool shimming
- `use_tool_shim`: Enable/disable tool shimming
- `tool_shim_model`: Optional model to use for tool shimming
#### Evals Section
Each evaluation entry in the `evals` array specifies:
- `selector`: The evaluation suite to run (e.g., "core")
- `post_process_cmd`: Optional path to a post-processing script
- `parallel_safe`: Whether the evaluation can run in parallel
#### General Options
- `include_dirs`: Additional directories to include in the evaluation
- `repeat`: Number of times to repeat each evaluation
- `run_id`: Optional identifier for the benchmark run
- `eval_result_filename`: Name of the evaluation results file
- `run_summary_filename`: Name of the summary results file
- `env_file`: Optional path to an environment file
##### Mechanics of include_dirs option
The `include_dirs` config parameter makes the items at every listed path available to all evaluations.<br></br>
It accomplishes this by:
* copying each included asset into the top-level directory created for each model/provider pair
* at evaluation run-time:
  * whichever asset is explicitly required by an evaluation is copied into the eval-specific directory
  * only if the evaluation code specifically pulls it in
  * and only if the evaluation is actually covered by one of the configured selectors and therefore runs
## Running Benchmarks
### Quick Start
1. The benchmarking system includes several evaluation suites.<br></br>
Run the following to see a listing of every valid selector:
```bash
goose bench selectors
```
2. Create a basic configuration file:
```bash
goose bench init-config -n bench-config.json
cat bench-config.json
{
"models": [
{
"provider": "databricks",
"name": "goose",
"parallel_safe": true
}
],
"evals": [
{
"selector": "core",
"parallel_safe": true
}
],
"repeat": 1
}
...etc.
```
3. Run the benchmark:
```bash
goose bench run -c bench-config.json
```
### Customizing Evaluations
You can customize runs in several ways:
1. Using Post-Processing Commands after evaluation (see the script sketch after this list):
```json
{
"evals": [
{
"selector": "core",
"post_process_cmd": "/path/to/process-script.sh",
"parallel_safe": true
}
]
}
```
2. Including Additional Data:
```json
{
"include_dirs": [
"/path/to/custom/eval/data"
]
}
```
3. Setting Environment Variables:
```json
{
"env_file": "/path/to/env-file"
}
```
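To make the post-processing hook concrete, here is a minimal sketch of a script that `post_process_cmd` could point to. It is illustrative only: it assumes the command runs in the directory where the eval's `eval-results.json` was written and does not rely on any particular arguments being passed.
```bash
#!/usr/bin/env bash
# Illustrative post-processing sketch (not shipped with Goose).
# Assumes it is invoked in the directory containing eval-results.json.
set -euo pipefail

RESULTS="eval-results.json"

if [[ -f "$RESULTS" ]]; then
  # Keep a copy of the raw results and report how large they are.
  cp "$RESULTS" "raw-$RESULTS"
  echo "post-processed $(wc -c < "$RESULTS") bytes from $RESULTS"
else
  echo "no $RESULTS found in $(pwd)" >&2
  exit 1
fi
```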
## Output and Results
The benchmark generates two main output files within a file hierarchy similar to the following.<br></br>
Results from running each model/provider pair are stored in their own directory:
```bash
benchmark-${datetime}/
    ${model}-${provider}[-tool-shim[-${shim-model}]]/
        run-${i}/
            ${an-include_dir-asset}
            run-results-summary.json
            core/developer/list_files/
                ${an-include_dir-asset}
                eval-results.json
```
1. `eval-results.json`: Contains detailed results from each evaluation, including:
- Individual test case results
- Model responses
- Scoring metrics
- Error logs
2. `run-results-summary.json`: A collection of all eval results across all suites (see below for a quick way to inspect it).
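Both files are plain JSON, so ordinary JSON tooling works for a quick look. For example, assuming `jq` is installed and the default filenames from the config are kept:
```bash
# Pretty-print the run summary and skim the beginning of it.
jq '.' run-results-summary.json | head -n 40
```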
### Debug Mode
For detailed logging, you can enable debug mode:
```bash
RUST_LOG=debug goose bench run -c bench-config.json
```
## Advanced Usage
### Tool Shimming
Tool shimming allows you to use non-tool-capable models with Goose, provided Ollama is installed on the system.<br></br>
See the guide on [tool shimming](experimental-features) for important details.
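To enable shimming from a benchmark config, fill in the model's `tool_shim` block. The sketch below is illustrative only: the provider, model name, and shim model are placeholders, not recommendations.
```json
{
  "models": [
    {
      "provider": "ollama",
      "name": "some-non-tool-calling-model",
      "parallel_safe": true,
      "tool_shim": {
        "use_tool_shim": true,
        "tool_shim_model": "some-local-interpreter-model"
      }
    }
  ]
}
```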

View File

@@ -232,6 +232,16 @@ Used to show the available implementations of the agent loop itself
goose agents
```
### bench
Used to evaluate a system configuration across a range of practical tasks. See the [detailed guide](/docs/guides/benchmarking) for more information.
**Usage:**
```bash
goose bench ...etc.
```
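The subcommands mirror the benchmarking workflow covered in the detailed guide, for example:
```bash
goose bench init-config -n bench-config.json   # generate a starter config
goose bench selectors                          # list available selectors and eval counts
goose bench run -c bench-config.json           # run every benchmark in the config
```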
---
## Prompt Completion

View File

@@ -162,7 +162,7 @@ for ((i=0; i<$COUNT; i++)); do
if [ "$TOOLSHIM" = true ]; then
export GOOSE_TOOLSHIM=1
if [[ -n "$TOOLSHIM_MODEL" ]]; then
export GOOSE_TOOLSHIM_MODEL="$TOOLSHIM_MODEL"
export GOOSE_TOOLSHIM_OLLAMA_MODEL="$TOOLSHIM_MODEL"
fi
fi

View File

@@ -18,7 +18,6 @@ import { ModeSelection } from './basic/ModeSelection';
import SessionSharingSection from './session/SessionSharingSection';
import { toastSuccess } from '../../toasts';
const EXTENSIONS_DESCRIPTION =
'The Model Context Protocol (MCP) is a system that allows AI models to securely connect with local or remote resources using standard server setups. It works like a client-server setup and expands AI capabilities using three main components: Prompts, Resources, and Tools.';