bugfix: refactor workdirs to be async-safe, and simpler (#1558)

This commit is contained in:
marcelle
2025-03-06 21:11:35 -05:00
committed by GitHub
parent ebf7cb1231
commit 798d657e7e
14 changed files with 288 additions and 226 deletions

84
Cargo.lock generated
View File

@@ -287,9 +287,9 @@ dependencies = [
[[package]] [[package]]
name = "aws-config" name = "aws-config"
version = "1.5.17" version = "1.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "490aa7465ee685b2ced076bb87ef654a47724a7844e2c7d3af4e749ce5b875dd" checksum = "90aff65e86db5fe300752551c1b015ef72b708ac54bded8ef43d0d53cb7cb0b1"
dependencies = [ dependencies = [
"aws-credential-types", "aws-credential-types",
"aws-runtime", "aws-runtime",
@@ -297,7 +297,7 @@ dependencies = [
"aws-sdk-ssooidc", "aws-sdk-ssooidc",
"aws-sdk-sts", "aws-sdk-sts",
"aws-smithy-async", "aws-smithy-async",
"aws-smithy-http", "aws-smithy-http 0.61.1",
"aws-smithy-json", "aws-smithy-json",
"aws-smithy-runtime", "aws-smithy-runtime",
"aws-smithy-runtime-api", "aws-smithy-runtime-api",
@@ -336,7 +336,7 @@ dependencies = [
"aws-credential-types", "aws-credential-types",
"aws-sigv4", "aws-sigv4",
"aws-smithy-async", "aws-smithy-async",
"aws-smithy-http", "aws-smithy-http 0.60.12",
"aws-smithy-runtime", "aws-smithy-runtime",
"aws-smithy-runtime-api", "aws-smithy-runtime-api",
"aws-smithy-types", "aws-smithy-types",
@@ -354,15 +354,15 @@ dependencies = [
[[package]] [[package]]
name = "aws-sdk-bedrockruntime" name = "aws-sdk-bedrockruntime"
version = "1.75.0" version = "1.76.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ddf7475b6f50a1a5be8edb1bcdf6e4ae00feed5b890d14a3f1f0e14d76f5a16" checksum = "b538f72f5ab8d23de44aacd109788c37e268fe9f4d060168714a12514d73b434"
dependencies = [ dependencies = [
"aws-credential-types", "aws-credential-types",
"aws-runtime", "aws-runtime",
"aws-smithy-async", "aws-smithy-async",
"aws-smithy-eventstream", "aws-smithy-eventstream",
"aws-smithy-http", "aws-smithy-http 0.61.1",
"aws-smithy-json", "aws-smithy-json",
"aws-smithy-runtime", "aws-smithy-runtime",
"aws-smithy-runtime-api", "aws-smithy-runtime-api",
@@ -378,14 +378,14 @@ dependencies = [
[[package]] [[package]]
name = "aws-sdk-sso" name = "aws-sdk-sso"
version = "1.60.0" version = "1.61.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60186fab60b24376d3e33b9ff0a43485f99efd470e3b75a9160c849741d63d56" checksum = "e65ff295979977039a25f5a0bf067a64bc5e6aa38f3cef4037cf42516265553c"
dependencies = [ dependencies = [
"aws-credential-types", "aws-credential-types",
"aws-runtime", "aws-runtime",
"aws-smithy-async", "aws-smithy-async",
"aws-smithy-http", "aws-smithy-http 0.61.1",
"aws-smithy-json", "aws-smithy-json",
"aws-smithy-runtime", "aws-smithy-runtime",
"aws-smithy-runtime-api", "aws-smithy-runtime-api",
@@ -400,14 +400,14 @@ dependencies = [
[[package]] [[package]]
name = "aws-sdk-ssooidc" name = "aws-sdk-ssooidc"
version = "1.61.0" version = "1.62.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7033130ce1ee13e6018905b7b976c915963755aef299c1521897679d6cd4f8ef" checksum = "91430a60f754f235688387b75ee798ef00cfd09709a582be2b7525ebb5306d4f"
dependencies = [ dependencies = [
"aws-credential-types", "aws-credential-types",
"aws-runtime", "aws-runtime",
"aws-smithy-async", "aws-smithy-async",
"aws-smithy-http", "aws-smithy-http 0.61.1",
"aws-smithy-json", "aws-smithy-json",
"aws-smithy-runtime", "aws-smithy-runtime",
"aws-smithy-runtime-api", "aws-smithy-runtime-api",
@@ -422,14 +422,14 @@ dependencies = [
[[package]] [[package]]
name = "aws-sdk-sts" name = "aws-sdk-sts"
version = "1.61.0" version = "1.62.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c5c1cac7677179d622b4448b0d31bcb359185295dc6fca891920cfb17e2b5156" checksum = "9276e139d39fff5a0b0c984fc2d30f970f9a202da67234f948fda02e5bea1dbe"
dependencies = [ dependencies = [
"aws-credential-types", "aws-credential-types",
"aws-runtime", "aws-runtime",
"aws-smithy-async", "aws-smithy-async",
"aws-smithy-http", "aws-smithy-http 0.61.1",
"aws-smithy-json", "aws-smithy-json",
"aws-smithy-query", "aws-smithy-query",
"aws-smithy-runtime", "aws-smithy-runtime",
@@ -450,7 +450,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9bfe75fad52793ce6dec0dc3d4b1f388f038b5eb866c8d4d7f3a8e21b5ea5051" checksum = "9bfe75fad52793ce6dec0dc3d4b1f388f038b5eb866c8d4d7f3a8e21b5ea5051"
dependencies = [ dependencies = [
"aws-credential-types", "aws-credential-types",
"aws-smithy-http", "aws-smithy-http 0.60.12",
"aws-smithy-runtime-api", "aws-smithy-runtime-api",
"aws-smithy-types", "aws-smithy-types",
"bytes", "bytes",
@@ -479,9 +479,9 @@ dependencies = [
[[package]] [[package]]
name = "aws-smithy-eventstream" name = "aws-smithy-eventstream"
version = "0.60.6" version = "0.60.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a" checksum = "461e5e02f9864cba17cff30f007c2e37ade94d01e87cdb5204e44a84e6d38c17"
dependencies = [ dependencies = [
"aws-smithy-types", "aws-smithy-types",
"bytes", "bytes",
@@ -493,6 +493,26 @@ name = "aws-smithy-http"
version = "0.60.12" version = "0.60.12"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc"
dependencies = [
"aws-smithy-runtime-api",
"aws-smithy-types",
"bytes",
"bytes-utils",
"futures-core",
"http 0.2.12",
"http-body 0.4.6",
"once_cell",
"percent-encoding",
"pin-project-lite",
"pin-utils",
"tracing",
]
[[package]]
name = "aws-smithy-http"
version = "0.61.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6f276f21c7921fe902826618d1423ae5bf74cf8c1b8472aee8434f3dfd31824"
dependencies = [ dependencies = [
"aws-smithy-eventstream", "aws-smithy-eventstream",
"aws-smithy-runtime-api", "aws-smithy-runtime-api",
@@ -535,7 +555,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d526a12d9ed61fadefda24abe2e682892ba288c2018bcb38b1b4c111d13f6d92" checksum = "d526a12d9ed61fadefda24abe2e682892ba288c2018bcb38b1b4c111d13f6d92"
dependencies = [ dependencies = [
"aws-smithy-async", "aws-smithy-async",
"aws-smithy-http", "aws-smithy-http 0.60.12",
"aws-smithy-runtime-api", "aws-smithy-runtime-api",
"aws-smithy-types", "aws-smithy-types",
"bytes", "bytes",
@@ -997,9 +1017,9 @@ checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495"
[[package]] [[package]]
name = "bytes" name = "bytes"
version = "1.10.0" version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
[[package]] [[package]]
name = "bytes-utils" name = "bytes-utils"
@@ -1730,9 +1750,9 @@ checksum = "1c7a8fb8a9fbf66c1f703fe16184d10ca0ee9d23be5b4436400408ba54a95005"
[[package]] [[package]]
name = "either" name = "either"
version = "1.14.0" version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7914353092ddf589ad78f25c5c1c21b7f80b0ff8621e7c814c3485b5306da9d" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
[[package]] [[package]]
name = "encode_unicode" name = "encode_unicode"
@@ -4552,9 +4572,9 @@ dependencies = [
[[package]] [[package]]
name = "ring" name = "ring"
version = "0.17.11" version = "0.17.12"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da5349ae27d3887ca812fb375b45a4fbb36d8d12d2df394968cd86e35683fe73" checksum = "ed9b823fa29b721a59671b41d6b06e66b29e0628e207e8b1c3ceeda701ec928d"
dependencies = [ dependencies = [
"cc", "cc",
"cfg-if", "cfg-if",
@@ -5475,9 +5495,9 @@ dependencies = [
[[package]] [[package]]
name = "time" name = "time"
version = "0.3.37" version = "0.3.38"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" checksum = "bb041120f25f8fbe8fd2dbe4671c7c2ed74d83be2e7a77529bf7e0790ae3f472"
dependencies = [ dependencies = [
"deranged", "deranged",
"itoa", "itoa",
@@ -5492,15 +5512,15 @@ dependencies = [
[[package]] [[package]]
name = "time-core" name = "time-core"
version = "0.1.2" version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" checksum = "765c97a5b985b7c11d7bc27fa927dc4fe6af3a6dfb021d28deb60d3bf51e76ef"
[[package]] [[package]]
name = "time-macros" name = "time-macros"
version = "0.2.19" version = "0.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" checksum = "e8093bc3e81c3bc5f7879de09619d06c9a5a5e45ca44dfeeb7225bae38005c5c"
dependencies = [ dependencies = [
"num-conv", "num-conv",
"time-core", "time-core",

View File

@@ -0,0 +1,167 @@
use chrono::Local;
use std::fs;
use std::io;
use std::io::ErrorKind;
use std::path::Path;
use std::path::PathBuf;
use std::process::Command;
/// Tracks the on-disk location of a benchmark run.
///
/// A value is created by `BenchmarkWorkDir::new`, which prepares a
/// `./benchmark-<name>` directory plus a per-run sub-directory, and is then
/// pointed at suite / evaluation sub-directories as the run progresses.
#[derive(Debug)]
pub struct BenchmarkWorkDir {
    /// Canonicalized `./benchmark-<name>` directory; included directories are
    /// copied directly beneath it.
    pub base_path: PathBuf,
    /// Directory most recently entered through `cd`.
    cwd: PathBuf,
    /// Name of this run's sub-directory under `base_path` (date and time).
    run_name: String,
    /// Suite selected by the last `set_suite` call, if any.
    suite: Option<String>,
    /// Evaluation selected by the last `set_eval` call, if any.
    eval: Option<String>,
}
impl Default for BenchmarkWorkDir {
fn default() -> Self {
BenchmarkWorkDir::new("work_dir".to_string(), Vec::new())
}
}
impl BenchmarkWorkDir {
    /// Creates `./benchmark-<work_dir_name>`, a `<date>-<time>` run directory
    /// inside it, copies every entry of `include_dirs` into the benchmark
    /// directory and makes that directory the process working directory.
    ///
    /// A failed copy of an include directory is reported on stderr and does
    /// not abort construction.
    ///
    /// # Panics
    ///
    /// Panics if a directory cannot be created or canonicalized, if an include
    /// directory cannot be canonicalized, or if the working directory cannot
    /// be changed.
    pub fn new(work_dir_name: String, include_dirs: Vec<PathBuf>) -> Self {
        let base_path = PathBuf::from(format!("./benchmark-{}", work_dir_name));
        fs::create_dir_all(&base_path)
            .unwrap_or_else(|e| panic!("Failed to create {}: {}", base_path.display(), e));

        // One timestamp for both halves of the name, so the date and the time
        // cannot come from different sides of midnight.
        let run_name = Local::now().format("%Y-%m-%d-%H:%M:%S").to_string();

        let base_path = base_path
            .canonicalize()
            .unwrap_or_else(|e| panic!("Failed to resolve {}: {}", base_path.display(), e));
        let run_dir = base_path.join(&run_name);
        fs::create_dir_all(&run_dir)
            .unwrap_or_else(|e| panic!("Failed to create {}: {}", run_dir.display(), e));

        // Resolve every include dir before copying anything, so a bad entry
        // fails before the benchmark directory is partially populated.
        let dirs: Vec<PathBuf> = include_dirs
            .iter()
            .map(|d| {
                d.canonicalize().unwrap_or_else(|e| {
                    panic!("Failed to resolve include dir {}: {}", d.display(), e)
                })
            })
            .collect();

        // Copying stays best-effort, but failures are no longer invisible.
        for dir in &dirs {
            if let Err(e) = Self::deep_copy(dir, &base_path, true) {
                eprintln!(
                    "warning: failed to copy {} into {}: {}",
                    dir.display(),
                    base_path.display(),
                    e
                );
            }
        }

        std::env::set_current_dir(&base_path)
            .unwrap_or_else(|e| panic!("Failed to enter {}: {}", base_path.display(), e));

        Self {
            cwd: base_path.clone(),
            base_path,
            run_name,
            suite: None,
            eval: None,
        }
    }

    /// Creates `path` if needed, makes it the process working directory and
    /// records it as the current directory of this work dir.
    ///
    /// # Errors
    ///
    /// Returns the I/O error if the directory cannot be created or entered.
    pub fn cd(&mut self, path: PathBuf) -> anyhow::Result<&mut Self> {
        fs::create_dir_all(&path)?;
        std::env::set_current_dir(&path)?;
        self.cwd = path;
        Ok(self)
    }

    /// Selects `suite`, clears the selected evaluation and enters
    /// `<base_path>/<run_name>/<suite>`.
    ///
    /// # Panics
    ///
    /// Panics if the suite directory cannot be created or entered.
    pub fn set_suite(&mut self, suite: &str) {
        self.eval = None;
        self.suite = Some(suite.to_string());
        let suite_dir = self.base_path.join(&self.run_name).join(suite);
        self.cd(suite_dir.clone())
            .unwrap_or_else(|_| panic!("Failed to execute cd into {}", suite_dir.display()));
    }

    /// Selects `eval` and enters `<base_path>/<run_name>/<suite>/<eval>`.
    ///
    /// # Panics
    ///
    /// Panics if no suite has been selected with `set_suite`, or if the
    /// evaluation directory cannot be created or entered.
    pub fn set_eval(&mut self, eval: &str) {
        self.eval = Some(eval.to_string());
        let suite = self
            .suite
            .as_deref()
            .expect("set_suite must be called before set_eval");
        let eval_dir = self.base_path.join(&self.run_name).join(suite).join(eval);
        self.cd(eval_dir.clone())
            .unwrap_or_else(|_| panic!("Failed to execute cd into {}", eval_dir.display()));
    }

    /// Strips a leading `.` component from `path`.
    ///
    /// Returns an error when the path starts with `..`; an empty path yields an
    /// empty `PathBuf`, and any other path is rebuilt from its components.
    fn chop_relative_base<P: AsRef<Path>>(path: P) -> anyhow::Result<PathBuf> {
        use std::path::Component;

        let path = path.as_ref();
        let mut components = path.components();
        match components.next() {
            Some(Component::ParentDir) => Err(anyhow::anyhow!("RelativePathBaseError: Only paths relative to the current working directory are supported.")),
            // Leading "." is dropped; the rest is kept as-is.
            Some(Component::CurDir) => Ok(components.collect()),
            // Anything else: keep every component, including the first.
            Some(_) => Ok(path.components().collect()),
            None => Ok(PathBuf::new()),
        }
    }

    /// Makes the asset at `path` available in the process working directory.
    ///
    /// If `path` already exists it is returned untouched. Otherwise the entry
    /// at `<base_path>/<path without a leading "./">` is copied into the
    /// working directory and the requested `path` is returned.
    ///
    /// NOTE(review): the copy lands directly in the working directory under the
    /// source's own name, so for a `path` with intermediate directories the
    /// returned path may not point at the copy — confirm against callers.
    ///
    /// # Errors
    ///
    /// Returns an error for a non-existent absolute path, for a path starting
    /// with `..`, when the working directory cannot be resolved, or when the
    /// copy fails.
    pub fn fs_get(&mut self, path: String) -> anyhow::Result<PathBuf> {
        let p = PathBuf::from(&path);
        if p.exists() {
            return Ok(p);
        }

        if p.is_absolute() {
            return Err(anyhow::anyhow!("AbsolutePathError: Only paths relative to the current working directory are supported."));
        }

        // Propagate the error: this function returns a Result, so an
        // unsupported path must not take the whole process down.
        let asset_rel_path = Self::chop_relative_base(&p)?;

        let here = PathBuf::from(".").canonicalize()?;
        let artifact_at_root = self.base_path.join(asset_rel_path);

        Self::deep_copy(&artifact_at_root, &here, true)?;
        Ok(p)
    }

    /// Copies `src` to `dst` by running the external `cp` command, passing
    /// `-r` when `recursive` is set.
    ///
    /// # Errors
    ///
    /// Returns the spawn error if `cp` cannot be run, or an error carrying the
    /// command's stderr output when it exits unsuccessfully.
    fn deep_copy<P, Q>(src: P, dst: Q, recursive: bool) -> io::Result<()>
    where
        P: AsRef<Path>,
        Q: AsRef<Path>,
    {
        let mut cmd = Command::new("cp");
        if recursive {
            cmd.arg("-r");
        }
        cmd.arg(src.as_ref()).arg(dst.as_ref());

        let output = cmd.output()?;
        if output.status.success() {
            Ok(())
        } else {
            let error_message = String::from_utf8_lossy(&output.stderr).to_string();
            Err(io::Error::new(ErrorKind::Other, error_message))
        }
    }
}

View File

@@ -1,8 +1,8 @@
// Create a new file called test.txt with the content 'Hello, World! // Create a new file called test.txt with the content 'Hello, World!
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation; use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait; use async_trait::async_trait;
use goose::message::MessageContent; use goose::message::MessageContent;
use mcp_core::role::Role; use mcp_core::role::Role;
@@ -22,7 +22,7 @@ impl Evaluation for DeveloperCreateFile {
async fn run( async fn run(
&self, &self,
mut agent: Box<dyn BenchAgent>, mut agent: Box<dyn BenchAgent>,
_work_dir: &mut WorkDir, _work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> { ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
let mut metrics = Vec::new(); let mut metrics = Vec::new();

View File

@@ -1,6 +1,6 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation; use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait; use async_trait::async_trait;
// use std::fs; // use std::fs;
@@ -17,7 +17,7 @@ impl Evaluation for ExampleEval {
async fn run( async fn run(
&self, &self,
mut agent: Box<dyn BenchAgent>, mut agent: Box<dyn BenchAgent>,
_work_dir: &mut WorkDir, _work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> { ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
println!("ExampleEval - run"); println!("ExampleEval - run");
// let f = work_dir.fs_get(String::from("./arbitrary_dir/arbitrary_file.txt"))?; // let f = work_dir.fs_get(String::from("./arbitrary_dir/arbitrary_file.txt"))?;

View File

@@ -1,6 +1,6 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation; use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait; use async_trait::async_trait;
use goose::message::MessageContent; use goose::message::MessageContent;
use mcp_core::content::Content; use mcp_core::content::Content;
@@ -21,7 +21,7 @@ impl Evaluation for DeveloperImage {
async fn run( async fn run(
&self, &self,
mut agent: Box<dyn BenchAgent>, mut agent: Box<dyn BenchAgent>,
_work_dir: &mut WorkDir, _work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> { ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
let mut metrics = Vec::new(); let mut metrics = Vec::new();

View File

@@ -1,6 +1,6 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation; use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait; use async_trait::async_trait;
use goose::message::MessageContent; use goose::message::MessageContent;
use mcp_core::role::Role; use mcp_core::role::Role;
@@ -20,7 +20,7 @@ impl Evaluation for DeveloperListFiles {
async fn run( async fn run(
&self, &self,
mut agent: Box<dyn BenchAgent>, mut agent: Box<dyn BenchAgent>,
_work_dir: &mut WorkDir, _work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> { ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
let mut metrics = Vec::new(); let mut metrics = Vec::new();

View File

@@ -1,8 +1,8 @@
// Create a new file called test.txt with the content 'Hello, World! // Create a new file called test.txt with the content 'Hello, World!
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation; use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait; use async_trait::async_trait;
use goose::message::MessageContent; use goose::message::MessageContent;
use mcp_core::role::Role; use mcp_core::role::Role;
@@ -22,7 +22,7 @@ impl Evaluation for MemoryRememberMemory {
async fn run( async fn run(
&self, &self,
mut agent: Box<dyn BenchAgent>, mut agent: Box<dyn BenchAgent>,
_work_dir: &mut WorkDir, _work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> { ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
let mut metrics = Vec::new(); let mut metrics = Vec::new();

View File

@@ -1,8 +1,8 @@
// Create a new file called test.txt with the content 'Hello, World! // Create a new file called test.txt with the content 'Hello, World!
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation; use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait; use async_trait::async_trait;
use goose::message::MessageContent; use goose::message::MessageContent;
use mcp_core::role::Role; use mcp_core::role::Role;
@@ -22,7 +22,7 @@ impl Evaluation for ComputerControllerScript {
async fn run( async fn run(
&self, &self,
mut agent: Box<dyn BenchAgent>, mut agent: Box<dyn BenchAgent>,
_work_dir: &mut WorkDir, _work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> { ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
let mut metrics = Vec::new(); let mut metrics = Vec::new();

View File

@@ -1,6 +1,6 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation; use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait; use async_trait::async_trait;
use std::fs; use std::fs;
@@ -18,31 +18,20 @@ impl Evaluation for DeveloperSearchReplace {
async fn run( async fn run(
&self, &self,
mut agent: Box<dyn BenchAgent>, mut agent: Box<dyn BenchAgent>,
work_dir: &mut WorkDir, work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> { ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
let mut metrics = Vec::new(); let mut metrics = Vec::new();
// Try to find the assets directory let _target_file = match work_dir.fs_get("./assets/kubernetes_swagger.json".to_string()) {
let assets_dir_path = work_dir.path.join("assets"); Ok(file) => file,
let _assets_exists = assets_dir_path.exists(); Err(_) => {
return Err(anyhow::anyhow!(
// Get the kubernetes_swagger.json file from the assets directory and copy it to the working directory for eval "Could not find kubernetes_swagger.json file"
// so the agent can modify it ))
let source_file = work_dir.path.join("assets").join("kubernetes_swagger.json"); }
let target_file = std::env::current_dir() };
.unwrap_or_default() let mut source_file = work_dir.base_path.clone();
.join("kubernetes_swagger.json"); source_file.push("assets/kubernetes_swagger.json");
// Copy the file to the root of the working directory if it doesn't exist there yet
if !target_file.exists() && source_file.exists() {
println!("Copying file from {:?} to {:?}", source_file, target_file);
fs::copy(&source_file, &target_file)?;
println!("File copied successfully");
} else {
return Err(anyhow::anyhow!(
"Could not find kubernetes_swagger.json file"
));
}
// Send the prompt to modify the file // Send the prompt to modify the file
let _messages = agent.prompt("Remove the io.k8s.api.admissionregistration.v1.ServiceReference definition block and replace with a new definition for io.k8s.api.admissionregistration.v1.FakeServiceReference. Update the fields in the definition as well to be consistent. Don't change the property names. Don't update any references to the old definition. Only modify the definition and it's description to 'FakeServiceReference simulates a reference to a fake service for testing purposes.'.The file to modify is kubernetes_swagger.json.".to_string()).await?; let _messages = agent.prompt("Remove the io.k8s.api.admissionregistration.v1.ServiceReference definition block and replace with a new definition for io.k8s.api.admissionregistration.v1.FakeServiceReference. Update the fields in the definition as well to be consistent. Don't change the property names. Don't update any references to the old definition. Only modify the definition and it's description to 'FakeServiceReference simulates a reference to a fake service for testing purposes.'.The file to modify is kubernetes_swagger.json.".to_string()).await?;
@@ -53,7 +42,7 @@ impl Evaluation for DeveloperSearchReplace {
.join("kubernetes_swagger.json"); .join("kubernetes_swagger.json");
// Read the expected patch file from the assets directory // Read the expected patch file from the assets directory
let patch_file_path = work_dir.path.join("assets").join("kubernetes.patch"); let patch_file_path = work_dir.base_path.join("assets").join("kubernetes.patch");
if !patch_file_path.exists() { if !patch_file_path.exists() {
return Err(anyhow::anyhow!("Could not find patch file")); return Err(anyhow::anyhow!("Could not find patch file"));
} }

View File

@@ -1,8 +1,8 @@
// Create a new file called test.txt with the content 'Hello, World! // Create a new file called test.txt with the content 'Hello, World!
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation; use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait; use async_trait::async_trait;
use goose::message::MessageContent; use goose::message::MessageContent;
use mcp_core::role::Role; use mcp_core::role::Role;
@@ -22,7 +22,7 @@ impl Evaluation for ComputerControllerWebScrape {
async fn run( async fn run(
&self, &self,
mut agent: Box<dyn BenchAgent>, mut agent: Box<dyn BenchAgent>,
_work_dir: &mut WorkDir, _work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> { ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
let mut metrics = Vec::new(); let mut metrics = Vec::new();

View File

@@ -1,4 +1,4 @@
use crate::work_dir::WorkDir; use crate::bench_work_dir::BenchmarkWorkDir;
use anyhow::Result; use anyhow::Result;
use async_trait::async_trait; use async_trait::async_trait;
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
@@ -42,7 +42,7 @@ pub trait Evaluation: Send + Sync {
async fn run( async fn run(
&self, &self,
agent: Box<dyn BenchAgent>, agent: Box<dyn BenchAgent>,
run_loc: &mut WorkDir, run_loc: &mut BenchmarkWorkDir,
) -> Result<Vec<(String, EvaluationMetric)>>; ) -> Result<Vec<(String, EvaluationMetric)>>;
fn name(&self) -> &str; fn name(&self) -> &str;

View File

@@ -1,4 +1,4 @@
pub mod bench_work_dir;
pub mod error_capture; pub mod error_capture;
pub mod eval_suites; pub mod eval_suites;
pub mod reporting; pub mod reporting;
pub mod work_dir;

View File

@@ -1,113 +0,0 @@
use std::fs;
use std::io;
use std::path::Path;
use std::path::PathBuf;
/// A working directory together with the directories entered from it.
#[derive(Debug)]
pub struct WorkDir {
    /// The directory this work dir was created for.
    pub path: PathBuf,
    /// Directories entered so far; the first entry is the starting directory
    /// and is used as the root when fetching assets.
    traversal: Vec<PathBuf>,
}
impl Default for WorkDir {
    /// Roots the work dir at the canonicalized current directory.
    ///
    /// Panics if the current directory cannot be canonicalized.
    fn default() -> Self {
        let root = PathBuf::from(".").canonicalize().unwrap();
        let traversal = vec![root.clone()];
        WorkDir {
            path: root,
            traversal,
        }
    }
}
impl WorkDir {
    /// Creates a work dir rooted at `path` without touching the file system.
    pub fn new(path: &str) -> Self {
        let path = PathBuf::from(path);
        WorkDir {
            traversal: vec![path.clone()],
            path,
        }
    }

    /// Creates `path`, copies every entry of `include_dirs` into it, makes it
    /// the process working directory and returns a work dir rooted there.
    ///
    /// A failed copy of an include directory is reported on stderr and does
    /// not abort the call.
    ///
    /// # Errors
    ///
    /// Returns an error if `path` cannot be created, resolved or entered, or if
    /// an include directory cannot be resolved.
    pub fn at(path: String, include_dirs: Vec<PathBuf>) -> anyhow::Result<WorkDir> {
        fs::create_dir_all(&path)?;

        // Resolve all include dirs up front; report failure through the
        // Result instead of panicking inside a fallible function.
        let dirs = include_dirs
            .iter()
            .map(|d| d.canonicalize())
            .collect::<io::Result<Vec<_>>>()?;

        let p = PathBuf::from(&path).canonicalize()?;
        for d in &dirs {
            if let Err(err) = WorkDir::deep_copy(d, &p) {
                eprintln!(
                    "warning: failed to copy {} into {}: {}",
                    d.display(),
                    p.display(),
                    err
                );
            }
        }

        std::env::set_current_dir(&path)?;
        Ok(WorkDir::new(&p.to_string_lossy()))
    }

    /// Creates `path`, records it in the traversal history and makes it the
    /// process working directory.
    ///
    /// # Errors
    ///
    /// Returns the I/O error if the directory cannot be created or entered.
    pub fn move_to(&mut self, path: String) -> anyhow::Result<&mut Self> {
        fs::create_dir_all(&path)?;
        self.traversal.push(PathBuf::from(&path));
        std::env::set_current_dir(&path)?;
        Ok(self)
    }

    /// Makes `path` available relative to the process working directory.
    ///
    /// If `path` does not exist here, it is looked up under the root (first
    /// traversal entry): a directory is copied as-is, otherwise the parent
    /// directory of `path` is copied. The requested `path` is returned.
    ///
    /// # Errors
    ///
    /// Returns an error if the work dir has no root, if a path cannot be
    /// resolved, or if the copy fails.
    pub fn fs_get(&mut self, path: String) -> anyhow::Result<PathBuf> {
        let p = Path::new(&path);
        if !p.exists() {
            let root = self
                .traversal
                .first()
                .ok_or_else(|| anyhow::anyhow!("WorkDir has no root directory"))?;

            // The directory test must look at the root copy: `p` itself is
            // known not to exist here, so `p.is_dir()` could never be true.
            let at_root = root.join(p);
            let artifact_at_root = if at_root.is_dir() {
                at_root.canonicalize()?
            } else {
                root.join(p.parent().unwrap_or(Path::new("")))
                    .canonicalize()?
            };

            let here = PathBuf::from(".").canonicalize()?;
            WorkDir::deep_copy(&artifact_at_root, &here)?;
        }
        Ok(PathBuf::from(path))
    }

    /// Recursively copies the directory `src` to `dst/<name of src>`.
    ///
    /// # Errors
    ///
    /// Returns `InvalidInput` if `src` has no final name component, or the
    /// underlying I/O error if a directory cannot be created or read or a file
    /// cannot be copied.
    fn deep_copy(src: &Path, dst: &Path) -> io::Result<()> {
        let dst_dir = match src.file_name() {
            Some(src_name) => dst.join(src_name),
            None => {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidInput,
                    "Source path must have a file name",
                ))
            }
        };

        // `create_dir_all` succeeds when the directory already exists.
        fs::create_dir_all(&dst_dir)?;

        for entry in fs::read_dir(src)? {
            let entry = entry?;
            let src_path = entry.path();
            if entry.file_type()?.is_dir() {
                // The recursive call appends the sub-directory's own name.
                WorkDir::deep_copy(&src_path, &dst_dir)?;
            } else {
                fs::copy(&src_path, dst_dir.join(entry.file_name()))?;
            }
        }

        Ok(())
    }
}
impl Drop for WorkDir {
    /// Forgets the most recently entered directory and moves the process
    /// working directory one level up.
    fn drop(&mut self) {
        self.traversal.pop();
        // Never panic in `drop`: a panic here while another panic is already
        // unwinding aborts the process. Report the failure instead.
        if let Err(err) = std::env::set_current_dir("..") {
            eprintln!("warning: failed to leave working directory: {}", err);
        }
    }
}

View File

@@ -1,13 +1,12 @@
use crate::session::build_session; use crate::session::build_session;
use crate::Session; use crate::Session;
use async_trait::async_trait; use async_trait::async_trait;
use chrono::Local;
use goose::config::Config; use goose::config::Config;
use goose::message::Message; use goose::message::Message;
use goose_bench::bench_work_dir::BenchmarkWorkDir;
use goose_bench::error_capture::ErrorCaptureLayer; use goose_bench::error_capture::ErrorCaptureLayer;
use goose_bench::eval_suites::{BenchAgent, BenchAgentError, Evaluation, EvaluationSuiteFactory}; use goose_bench::eval_suites::{BenchAgent, BenchAgentError, Evaluation, EvaluationSuiteFactory};
use goose_bench::reporting::{BenchmarkResults, EvaluationResult, SuiteResult}; use goose_bench::reporting::{BenchmarkResults, EvaluationResult, SuiteResult};
use goose_bench::work_dir::WorkDir;
use std::collections::HashMap; use std::collections::HashMap;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
@@ -77,48 +76,47 @@ impl BenchAgent for BenchAgentWrapper {
async fn run_eval( async fn run_eval(
evaluation: Box<dyn Evaluation>, evaluation: Box<dyn Evaluation>,
work_dir: &mut WorkDir, work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<EvaluationResult> { ) -> anyhow::Result<EvaluationResult> {
let mut result = EvaluationResult::new(evaluation.name().to_string()); let mut result = EvaluationResult::new(evaluation.name().to_string());
if let Ok(work_dir) = work_dir.move_to(format!("./{}", &evaluation.name())) { let requirements = evaluation.required_extensions();
let requirements = evaluation.required_extensions();
// Create session with error capture // Create session with error capture
let base_session = let base_session =
build_session(None, false, requirements.external, requirements.builtin).await; build_session(None, false, requirements.external, requirements.builtin).await;
let bench_session = Arc::new(Mutex::new(BenchSession::new(base_session))); let bench_session = Arc::new(Mutex::new(BenchSession::new(base_session)));
let bench_session_clone = bench_session.clone(); let bench_session_clone = bench_session.clone();
if let Ok(metrics) = evaluation if let Ok(metrics) = evaluation
.run(Box::new(BenchAgentWrapper(bench_session)), work_dir) .run(Box::new(BenchAgentWrapper(bench_session)), work_dir)
.await .await
{ {
for (name, metric) in metrics { for (name, metric) in metrics {
result.add_metric(name, metric); result.add_metric(name, metric);
} }
// Add any errors that occurred // Add any errors that occurred
let agent = BenchAgentWrapper(bench_session_clone); let agent = BenchAgentWrapper(bench_session_clone);
for error in agent.get_errors().await { for error in agent.get_errors().await {
result.add_error(error); result.add_error(error);
}
} }
} }
Ok(result) Ok(result)
} }
async fn run_suite(suite: &str, work_dir: &mut WorkDir) -> anyhow::Result<SuiteResult> { async fn run_suite(suite: &str, work_dir: &mut BenchmarkWorkDir) -> anyhow::Result<SuiteResult> {
let mut suite_result = SuiteResult::new(suite.to_string()); let mut suite_result = SuiteResult::new(suite.to_string());
let eval_lock = Mutex::new(0);
if let Ok(work_dir) = work_dir.move_to(format!("./{}", &suite)) { if let Some(evals) = EvaluationSuiteFactory::create(suite) {
if let Some(evals) = EvaluationSuiteFactory::create(suite) { for eval in evals {
for eval in evals { let _unused = eval_lock.lock().await;
let eval_result = run_eval(eval, work_dir).await?; work_dir.set_eval(eval.name());
suite_result.add_evaluation(eval_result); let eval_result = run_eval(eval, work_dir).await?;
} suite_result.add_evaluation(eval_result);
} }
} }
@@ -135,24 +133,25 @@ pub async fn run_benchmark(
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let config = Config::global(); let config = Config::global();
let goose_model: String = config
.get("GOOSE_MODEL")
.expect("No model configured. Run 'goose configure' first");
let provider_name: String = config let provider_name: String = config
.get("GOOSE_PROVIDER") .get("GOOSE_PROVIDER")
.expect("No provider configured. Run 'goose configure' first"); .expect("No provider configured. Run 'goose configure' first");
let mut results = BenchmarkResults::new(provider_name.clone()); let mut results = BenchmarkResults::new(provider_name.clone());
let current_time = Local::now().format("%H:%M:%S").to_string(); let mut work_dir = BenchmarkWorkDir::new(
let current_date = Local::now().format("%Y-%m-%d").to_string(); format!("{}-{}", provider_name, goose_model),
if let Ok(mut work_dir) = WorkDir::at(
format!("./benchmark-{}", &provider_name),
include_dirs.clone(), include_dirs.clone(),
) { );
if let Ok(work_dir) = work_dir.move_to(format!("./{}-{}", &current_date, current_time)) { let suite_lock = Mutex::new(0);
for suite in suites { for suite in suites {
let suite_result = run_suite(suite, work_dir).await?; let _unused = suite_lock.lock().await;
results.add_suite(suite_result); work_dir.set_suite(suite);
} let suite_result = run_suite(suite, &mut work_dir).await?;
} results.add_suite(suite_result);
} }
Ok(results) Ok(results)