mirror of
https://github.com/aljazceru/goose.git
synced 2025-12-18 22:54:24 +01:00
bugfix: refactor workdirs to be async-safe, and simpler (#1558)
This commit is contained in:
84
Cargo.lock
generated
84
Cargo.lock
generated
@@ -287,9 +287,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-config"
|
||||
version = "1.5.17"
|
||||
version = "1.5.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "490aa7465ee685b2ced076bb87ef654a47724a7844e2c7d3af4e749ce5b875dd"
|
||||
checksum = "90aff65e86db5fe300752551c1b015ef72b708ac54bded8ef43d0d53cb7cb0b1"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
@@ -297,7 +297,7 @@ dependencies = [
|
||||
"aws-sdk-ssooidc",
|
||||
"aws-sdk-sts",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-http 0.61.1",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -336,7 +336,7 @@ dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-sigv4",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-http 0.60.12",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
@@ -354,15 +354,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-bedrockruntime"
|
||||
version = "1.75.0"
|
||||
version = "1.76.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2ddf7475b6f50a1a5be8edb1bcdf6e4ae00feed5b890d14a3f1f0e14d76f5a16"
|
||||
checksum = "b538f72f5ab8d23de44aacd109788c37e268fe9f4d060168714a12514d73b434"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-eventstream",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-http 0.61.1",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -378,14 +378,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-sso"
|
||||
version = "1.60.0"
|
||||
version = "1.61.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60186fab60b24376d3e33b9ff0a43485f99efd470e3b75a9160c849741d63d56"
|
||||
checksum = "e65ff295979977039a25f5a0bf067a64bc5e6aa38f3cef4037cf42516265553c"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-http 0.61.1",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -400,14 +400,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-ssooidc"
|
||||
version = "1.61.0"
|
||||
version = "1.62.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7033130ce1ee13e6018905b7b976c915963755aef299c1521897679d6cd4f8ef"
|
||||
checksum = "91430a60f754f235688387b75ee798ef00cfd09709a582be2b7525ebb5306d4f"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-http 0.61.1",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -422,14 +422,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-sts"
|
||||
version = "1.61.0"
|
||||
version = "1.62.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c5c1cac7677179d622b4448b0d31bcb359185295dc6fca891920cfb17e2b5156"
|
||||
checksum = "9276e139d39fff5a0b0c984fc2d30f970f9a202da67234f948fda02e5bea1dbe"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-http 0.61.1",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-query",
|
||||
"aws-smithy-runtime",
|
||||
@@ -450,7 +450,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9bfe75fad52793ce6dec0dc3d4b1f388f038b5eb866c8d4d7f3a8e21b5ea5051"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-http 0.60.12",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
"bytes",
|
||||
@@ -479,9 +479,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-eventstream"
|
||||
version = "0.60.6"
|
||||
version = "0.60.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a"
|
||||
checksum = "461e5e02f9864cba17cff30f007c2e37ade94d01e87cdb5204e44a84e6d38c17"
|
||||
dependencies = [
|
||||
"aws-smithy-types",
|
||||
"bytes",
|
||||
@@ -493,6 +493,26 @@ name = "aws-smithy-http"
|
||||
version = "0.60.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc"
|
||||
dependencies = [
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
"bytes",
|
||||
"bytes-utils",
|
||||
"futures-core",
|
||||
"http 0.2.12",
|
||||
"http-body 0.4.6",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-http"
|
||||
version = "0.61.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6f276f21c7921fe902826618d1423ae5bf74cf8c1b8472aee8434f3dfd31824"
|
||||
dependencies = [
|
||||
"aws-smithy-eventstream",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -535,7 +555,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d526a12d9ed61fadefda24abe2e682892ba288c2018bcb38b1b4c111d13f6d92"
|
||||
dependencies = [
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-http 0.60.12",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
"bytes",
|
||||
@@ -997,9 +1017,9 @@ checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.10.0"
|
||||
version = "1.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9"
|
||||
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
|
||||
|
||||
[[package]]
|
||||
name = "bytes-utils"
|
||||
@@ -1730,9 +1750,9 @@ checksum = "1c7a8fb8a9fbf66c1f703fe16184d10ca0ee9d23be5b4436400408ba54a95005"
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.14.0"
|
||||
version = "1.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7914353092ddf589ad78f25c5c1c21b7f80b0ff8621e7c814c3485b5306da9d"
|
||||
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
||||
|
||||
[[package]]
|
||||
name = "encode_unicode"
|
||||
@@ -4552,9 +4572,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.11"
|
||||
version = "0.17.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da5349ae27d3887ca812fb375b45a4fbb36d8d12d2df394968cd86e35683fe73"
|
||||
checksum = "ed9b823fa29b721a59671b41d6b06e66b29e0628e207e8b1c3ceeda701ec928d"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
@@ -5475,9 +5495,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.37"
|
||||
version = "0.3.38"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21"
|
||||
checksum = "bb041120f25f8fbe8fd2dbe4671c7c2ed74d83be2e7a77529bf7e0790ae3f472"
|
||||
dependencies = [
|
||||
"deranged",
|
||||
"itoa",
|
||||
@@ -5492,15 +5512,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "time-core"
|
||||
version = "0.1.2"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
|
||||
checksum = "765c97a5b985b7c11d7bc27fa927dc4fe6af3a6dfb021d28deb60d3bf51e76ef"
|
||||
|
||||
[[package]]
|
||||
name = "time-macros"
|
||||
version = "0.2.19"
|
||||
version = "0.2.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de"
|
||||
checksum = "e8093bc3e81c3bc5f7879de09619d06c9a5a5e45ca44dfeeb7225bae38005c5c"
|
||||
dependencies = [
|
||||
"num-conv",
|
||||
"time-core",
|
||||
|
||||
167
crates/goose-bench/src/bench_work_dir.rs
Normal file
167
crates/goose-bench/src/bench_work_dir.rs
Normal file
@@ -0,0 +1,167 @@
|
||||
use chrono::Local;
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::io::ErrorKind;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
|
||||
/// Directory bookkeeping for a single benchmark run.
///
/// Holds the base benchmark directory plus the run/suite/eval selection that
/// `set_suite` and `set_eval` turn into nested output directories.
pub struct BenchmarkWorkDir {
    /// Canonicalized `./benchmark-<name>` directory; included asset directories
    /// are copied directly into it.
    pub base_path: PathBuf,
    /// Directory most recently entered through `cd` (initially `base_path`).
    cwd: PathBuf,
    /// Timestamped name of this run's subdirectory under `base_path`.
    run_name: String,
    /// Suite selected by the latest `set_suite` call, if any.
    suite: Option<String>,
    /// Evaluation selected by the latest `set_eval` call, if any.
    eval: Option<String>,
}
|
||||
|
||||
impl Default for BenchmarkWorkDir {
|
||||
fn default() -> Self {
|
||||
BenchmarkWorkDir::new("work_dir".to_string(), Vec::new())
|
||||
}
|
||||
}
|
||||
impl BenchmarkWorkDir {
|
||||
pub fn new(work_dir_name: String, include_dirs: Vec<PathBuf>) -> Self {
|
||||
let base_path = PathBuf::from(format!("./benchmark-{}", work_dir_name));
|
||||
fs::create_dir_all(&base_path).unwrap();
|
||||
|
||||
let current_time = Local::now().format("%H:%M:%S").to_string();
|
||||
let current_date = Local::now().format("%Y-%m-%d").to_string();
|
||||
let run_name = format!("{}-{}", ¤t_date, current_time);
|
||||
|
||||
let mut base_path = PathBuf::from(&base_path).canonicalize().unwrap();
|
||||
base_path.push(run_name.clone());
|
||||
fs::create_dir_all(&base_path).unwrap();
|
||||
base_path.pop();
|
||||
|
||||
// abs paths from dir-strings
|
||||
let dirs = include_dirs
|
||||
.iter()
|
||||
.map(|d| d.canonicalize().unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// deep copy each dir
|
||||
let _: Vec<_> = dirs
|
||||
.iter()
|
||||
.map(|d| BenchmarkWorkDir::deep_copy(d.as_path(), base_path.as_path(), true))
|
||||
.collect();
|
||||
|
||||
std::env::set_current_dir(&base_path).unwrap();
|
||||
|
||||
BenchmarkWorkDir {
|
||||
base_path: base_path.clone(),
|
||||
cwd: base_path.clone(),
|
||||
run_name,
|
||||
suite: None,
|
||||
eval: None,
|
||||
}
|
||||
}
|
||||
pub fn cd(&mut self, path: PathBuf) -> anyhow::Result<&mut Self> {
|
||||
fs::create_dir_all(&path)?;
|
||||
std::env::set_current_dir(&path)?;
|
||||
self.cwd = path;
|
||||
Ok(self)
|
||||
}
|
||||
pub fn set_suite(&mut self, suite: &str) {
|
||||
self.eval = None;
|
||||
self.suite = Some(suite.to_string());
|
||||
|
||||
let mut suite_dir = self.base_path.clone();
|
||||
suite_dir.push(self.run_name.clone());
|
||||
suite_dir.push(suite);
|
||||
|
||||
self.cd(suite_dir.clone()).unwrap_or_else(|_| {
|
||||
panic!("Failed to execute cd into {}", suite_dir.clone().display())
|
||||
});
|
||||
}
|
||||
pub fn set_eval(&mut self, eval: &str) {
|
||||
self.eval = Some(eval.to_string());
|
||||
|
||||
let mut eval_dir = self.base_path.clone();
|
||||
eval_dir.push(self.run_name.clone());
|
||||
eval_dir.push(self.suite.clone().unwrap());
|
||||
eval_dir.push(eval);
|
||||
|
||||
self.cd(eval_dir.clone())
|
||||
.unwrap_or_else(|_| panic!("Failed to execute cd into {}", eval_dir.clone().display()));
|
||||
}
|
||||
|
||||
fn chop_relative_base<P: AsRef<Path>>(path: P) -> anyhow::Result<PathBuf> {
|
||||
let path = path.as_ref();
|
||||
|
||||
// Get the path components as an iterator
|
||||
let mut components = path.components();
|
||||
|
||||
// Check the first component
|
||||
if let Some(first) = components.next() {
|
||||
use std::path::Component;
|
||||
|
||||
match first {
|
||||
Component::ParentDir => Err(anyhow::anyhow!("RelativePathBaseError: Only paths relative to the current working directory are supported.")),
|
||||
// If first component is "."
|
||||
Component::CurDir => Ok(components.collect()),
|
||||
// Otherwise, keep the full path
|
||||
_ => {
|
||||
// Create a new PathBuf
|
||||
let mut result = PathBuf::new();
|
||||
// Add back the first component
|
||||
result.push(first);
|
||||
// Add all remaining components
|
||||
result.extend(components);
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Empty path
|
||||
Ok(PathBuf::new())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn fs_get(&mut self, path: String) -> anyhow::Result<PathBuf> {
|
||||
let p = PathBuf::from(&path);
|
||||
if p.exists() {
|
||||
return Ok(PathBuf::from(path));
|
||||
}
|
||||
|
||||
if p.is_absolute() {
|
||||
return Err(anyhow::anyhow!("AbsolutePathError: Only paths relative to the current working directory are supported."));
|
||||
}
|
||||
|
||||
let asset_rel_path = Self::chop_relative_base(p.clone())
|
||||
.unwrap_or_else(|_| panic!("AbsolutePathError: Only paths relative to the current working directory are supported."));
|
||||
|
||||
let here = PathBuf::from(".").canonicalize()?;
|
||||
let artifact_at_root = self.base_path.clone().join(asset_rel_path);
|
||||
|
||||
BenchmarkWorkDir::deep_copy(artifact_at_root.as_path(), here.as_path(), true)?;
|
||||
Ok(PathBuf::from(path))
|
||||
}
|
||||
|
||||
fn deep_copy<P, Q>(src: P, dst: Q, recursive: bool) -> io::Result<()>
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
Q: AsRef<Path>,
|
||||
{
|
||||
let src = src.as_ref();
|
||||
let dst = dst.as_ref();
|
||||
|
||||
let mut cmd = Command::new("cp");
|
||||
|
||||
// Add -r flag if recursive is true
|
||||
if recursive {
|
||||
cmd.arg("-r");
|
||||
}
|
||||
|
||||
// Add source and destination paths
|
||||
cmd.arg(src).arg(dst);
|
||||
|
||||
// Execute the command
|
||||
let output = cmd.output()?;
|
||||
|
||||
if output.status.success() {
|
||||
Ok(())
|
||||
} else {
|
||||
let error_message = String::from_utf8_lossy(&output.stderr).to_string();
|
||||
Err(io::Error::new(ErrorKind::Other, error_message))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,8 @@
|
||||
// Create a new file called test.txt with the content 'Hello, World!'
|
||||
|
||||
use crate::bench_work_dir::BenchmarkWorkDir;
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
use goose::message::MessageContent;
|
||||
use mcp_core::role::Role;
|
||||
@@ -22,7 +22,7 @@ impl Evaluation for DeveloperCreateFile {
|
||||
async fn run(
|
||||
&self,
|
||||
mut agent: Box<dyn BenchAgent>,
|
||||
_work_dir: &mut WorkDir,
|
||||
_work_dir: &mut BenchmarkWorkDir,
|
||||
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::bench_work_dir::BenchmarkWorkDir;
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
// use std::fs;
|
||||
|
||||
@@ -17,7 +17,7 @@ impl Evaluation for ExampleEval {
|
||||
async fn run(
|
||||
&self,
|
||||
mut agent: Box<dyn BenchAgent>,
|
||||
_work_dir: &mut WorkDir,
|
||||
_work_dir: &mut BenchmarkWorkDir,
|
||||
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
|
||||
println!("ExampleEval - run");
|
||||
// let f = work_dir.fs_get(String::from("./arbitrary_dir/arbitrary_file.txt"))?;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::bench_work_dir::BenchmarkWorkDir;
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
use goose::message::MessageContent;
|
||||
use mcp_core::content::Content;
|
||||
@@ -21,7 +21,7 @@ impl Evaluation for DeveloperImage {
|
||||
async fn run(
|
||||
&self,
|
||||
mut agent: Box<dyn BenchAgent>,
|
||||
_work_dir: &mut WorkDir,
|
||||
_work_dir: &mut BenchmarkWorkDir,
|
||||
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::bench_work_dir::BenchmarkWorkDir;
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
use goose::message::MessageContent;
|
||||
use mcp_core::role::Role;
|
||||
@@ -20,7 +20,7 @@ impl Evaluation for DeveloperListFiles {
|
||||
async fn run(
|
||||
&self,
|
||||
mut agent: Box<dyn BenchAgent>,
|
||||
_work_dir: &mut WorkDir,
|
||||
_work_dir: &mut BenchmarkWorkDir,
|
||||
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
// Create a new file called test.txt with the content 'Hello, World!'
|
||||
|
||||
use crate::bench_work_dir::BenchmarkWorkDir;
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
use goose::message::MessageContent;
|
||||
use mcp_core::role::Role;
|
||||
@@ -22,7 +22,7 @@ impl Evaluation for MemoryRememberMemory {
|
||||
async fn run(
|
||||
&self,
|
||||
mut agent: Box<dyn BenchAgent>,
|
||||
_work_dir: &mut WorkDir,
|
||||
_work_dir: &mut BenchmarkWorkDir,
|
||||
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
// Create a new file called test.txt with the content 'Hello, World!'
|
||||
|
||||
use crate::bench_work_dir::BenchmarkWorkDir;
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
use goose::message::MessageContent;
|
||||
use mcp_core::role::Role;
|
||||
@@ -22,7 +22,7 @@ impl Evaluation for ComputerControllerScript {
|
||||
async fn run(
|
||||
&self,
|
||||
mut agent: Box<dyn BenchAgent>,
|
||||
_work_dir: &mut WorkDir,
|
||||
_work_dir: &mut BenchmarkWorkDir,
|
||||
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::bench_work_dir::BenchmarkWorkDir;
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
use std::fs;
|
||||
|
||||
@@ -18,31 +18,20 @@ impl Evaluation for DeveloperSearchReplace {
|
||||
async fn run(
|
||||
&self,
|
||||
mut agent: Box<dyn BenchAgent>,
|
||||
work_dir: &mut WorkDir,
|
||||
work_dir: &mut BenchmarkWorkDir,
|
||||
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
// Try to find the assets directory
|
||||
let assets_dir_path = work_dir.path.join("assets");
|
||||
let _assets_exists = assets_dir_path.exists();
|
||||
|
||||
// Get the kubernetes_swagger.json file from the assets directory and copy it to the working directory for eval
|
||||
// so the agent can modify it
|
||||
let source_file = work_dir.path.join("assets").join("kubernetes_swagger.json");
|
||||
let target_file = std::env::current_dir()
|
||||
.unwrap_or_default()
|
||||
.join("kubernetes_swagger.json");
|
||||
|
||||
// Copy the file to the root of the working directory if it doesn't exist there yet
|
||||
if !target_file.exists() && source_file.exists() {
|
||||
println!("Copying file from {:?} to {:?}", source_file, target_file);
|
||||
fs::copy(&source_file, &target_file)?;
|
||||
println!("File copied successfully");
|
||||
} else {
|
||||
let _target_file = match work_dir.fs_get("./assets/kubernetes_swagger.json".to_string()) {
|
||||
Ok(file) => file,
|
||||
Err(_) => {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Could not find kubernetes_swagger.json file"
|
||||
));
|
||||
))
|
||||
}
|
||||
};
|
||||
let mut source_file = work_dir.base_path.clone();
|
||||
source_file.push("assets/kubernetes_swagger.json");
|
||||
|
||||
// Send the prompt to modify the file
|
||||
let _messages = agent.prompt("Remove the io.k8s.api.admissionregistration.v1.ServiceReference definition block and replace with a new definition for io.k8s.api.admissionregistration.v1.FakeServiceReference. Update the fields in the definition as well to be consistent. Don't change the property names. Don't update any references to the old definition. Only modify the definition and it's description to 'FakeServiceReference simulates a reference to a fake service for testing purposes.'.The file to modify is kubernetes_swagger.json.".to_string()).await?;
|
||||
@@ -53,7 +42,7 @@ impl Evaluation for DeveloperSearchReplace {
|
||||
.join("kubernetes_swagger.json");
|
||||
|
||||
// Read the expected patch file from the assets directory
|
||||
let patch_file_path = work_dir.path.join("assets").join("kubernetes.patch");
|
||||
let patch_file_path = work_dir.base_path.join("assets").join("kubernetes.patch");
|
||||
if !patch_file_path.exists() {
|
||||
return Err(anyhow::anyhow!("Could not find patch file"));
|
||||
}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
// Create a new file called test.txt with the content 'Hello, World!'
|
||||
|
||||
use crate::bench_work_dir::BenchmarkWorkDir;
|
||||
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
|
||||
use crate::register_evaluation;
|
||||
use crate::work_dir::WorkDir;
|
||||
use async_trait::async_trait;
|
||||
use goose::message::MessageContent;
|
||||
use mcp_core::role::Role;
|
||||
@@ -22,7 +22,7 @@ impl Evaluation for ComputerControllerWebScrape {
|
||||
async fn run(
|
||||
&self,
|
||||
mut agent: Box<dyn BenchAgent>,
|
||||
_work_dir: &mut WorkDir,
|
||||
_work_dir: &mut BenchmarkWorkDir,
|
||||
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::work_dir::WorkDir;
|
||||
use crate::bench_work_dir::BenchmarkWorkDir;
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use chrono::{DateTime, Utc};
|
||||
@@ -42,7 +42,7 @@ pub trait Evaluation: Send + Sync {
|
||||
async fn run(
|
||||
&self,
|
||||
agent: Box<dyn BenchAgent>,
|
||||
run_loc: &mut WorkDir,
|
||||
run_loc: &mut BenchmarkWorkDir,
|
||||
) -> Result<Vec<(String, EvaluationMetric)>>;
|
||||
|
||||
fn name(&self) -> &str;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
pub mod bench_work_dir;
|
||||
pub mod error_capture;
|
||||
pub mod eval_suites;
|
||||
pub mod reporting;
|
||||
pub mod work_dir;
|
||||
|
||||
@@ -1,113 +0,0 @@
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// A working directory together with the list of paths entered from it.
pub struct WorkDir {
    /// Path this work dir was created with.
    pub path: PathBuf,
    /// Paths recorded by `new` and `move_to`; the first entry is the root that
    /// `fs_get` resolves artifacts against.
    traversal: Vec<PathBuf>,
}
|
||||
|
||||
impl Default for WorkDir {
|
||||
fn default() -> Self {
|
||||
let path = PathBuf::from(".").canonicalize().unwrap();
|
||||
WorkDir {
|
||||
path: path.clone(),
|
||||
traversal: vec![path.clone()],
|
||||
}
|
||||
}
|
||||
}
|
||||
impl WorkDir {
|
||||
pub fn new(path: &str) -> Self {
|
||||
let path = PathBuf::from(path);
|
||||
WorkDir {
|
||||
path: path.clone(),
|
||||
traversal: vec![path.clone()],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn at(path: String, include_dirs: Vec<PathBuf>) -> anyhow::Result<WorkDir> {
|
||||
fs::create_dir_all(&path)?;
|
||||
|
||||
let dirs = include_dirs
|
||||
.iter()
|
||||
.map(|d| d.canonicalize().unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let p = PathBuf::from(&path).canonicalize()?;
|
||||
let _: Vec<_> = dirs
|
||||
.iter()
|
||||
.map(|d| WorkDir::deep_copy(d.as_path(), p.as_path()))
|
||||
.collect();
|
||||
|
||||
std::env::set_current_dir(&path)?;
|
||||
|
||||
Ok(WorkDir::new(p.to_string_lossy().to_string().as_str()))
|
||||
}
|
||||
pub fn move_to(&mut self, path: String) -> anyhow::Result<&mut Self> {
|
||||
fs::create_dir_all(&path)?;
|
||||
self.traversal.push(PathBuf::from(&path));
|
||||
std::env::set_current_dir(&path)?;
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
pub fn fs_get(&mut self, path: String) -> anyhow::Result<PathBuf> {
|
||||
let p = Path::new(&path);
|
||||
if !p.exists() {
|
||||
let artifact_at_root = if p.is_dir() {
|
||||
self.traversal[0].clone().join(&path).canonicalize()?
|
||||
} else {
|
||||
self.traversal[0]
|
||||
.clone()
|
||||
.join(p.parent().unwrap_or(Path::new("")))
|
||||
.canonicalize()?
|
||||
};
|
||||
|
||||
let here = PathBuf::from(".").canonicalize()?;
|
||||
|
||||
WorkDir::deep_copy(artifact_at_root.as_path(), here.as_path())?;
|
||||
}
|
||||
|
||||
Ok(PathBuf::from(path))
|
||||
}
|
||||
|
||||
fn deep_copy(src: &Path, dst: &Path) -> io::Result<()> {
|
||||
// Create the destination directory with the source's name
|
||||
let dst_dir = if let Some(src_name) = src.file_name() {
|
||||
dst.join(src_name)
|
||||
} else {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"Source path must have a file name",
|
||||
));
|
||||
};
|
||||
|
||||
// Create the destination directory if it doesn't exist
|
||||
if !dst_dir.exists() {
|
||||
fs::create_dir_all(&dst_dir)?;
|
||||
}
|
||||
|
||||
// Copy each entry in the source directory
|
||||
for entry in fs::read_dir(src)? {
|
||||
let entry = entry?;
|
||||
let ty = entry.file_type()?;
|
||||
let src_path = entry.path();
|
||||
let dst_path = dst_dir.join(entry.file_name());
|
||||
|
||||
if ty.is_dir() {
|
||||
WorkDir::deep_copy(&src_path, dst_path.parent().unwrap())?;
|
||||
} else {
|
||||
fs::copy(&src_path, &dst_path)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WorkDir {
|
||||
fn drop(&mut self) {
|
||||
self.traversal.pop();
|
||||
std::env::set_current_dir("..").unwrap()
|
||||
}
|
||||
}
|
||||
@@ -1,13 +1,12 @@
|
||||
use crate::session::build_session;
|
||||
use crate::Session;
|
||||
use async_trait::async_trait;
|
||||
use chrono::Local;
|
||||
use goose::config::Config;
|
||||
use goose::message::Message;
|
||||
use goose_bench::bench_work_dir::BenchmarkWorkDir;
|
||||
use goose_bench::error_capture::ErrorCaptureLayer;
|
||||
use goose_bench::eval_suites::{BenchAgent, BenchAgentError, Evaluation, EvaluationSuiteFactory};
|
||||
use goose_bench::reporting::{BenchmarkResults, EvaluationResult, SuiteResult};
|
||||
use goose_bench::work_dir::WorkDir;
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
@@ -77,11 +76,10 @@ impl BenchAgent for BenchAgentWrapper {
|
||||
|
||||
async fn run_eval(
|
||||
evaluation: Box<dyn Evaluation>,
|
||||
work_dir: &mut WorkDir,
|
||||
work_dir: &mut BenchmarkWorkDir,
|
||||
) -> anyhow::Result<EvaluationResult> {
|
||||
let mut result = EvaluationResult::new(evaluation.name().to_string());
|
||||
|
||||
if let Ok(work_dir) = work_dir.move_to(format!("./{}", &evaluation.name())) {
|
||||
let requirements = evaluation.required_extensions();
|
||||
|
||||
// Create session with error capture
|
||||
@@ -105,22 +103,22 @@ async fn run_eval(
|
||||
result.add_error(error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn run_suite(suite: &str, work_dir: &mut WorkDir) -> anyhow::Result<SuiteResult> {
|
||||
async fn run_suite(suite: &str, work_dir: &mut BenchmarkWorkDir) -> anyhow::Result<SuiteResult> {
|
||||
let mut suite_result = SuiteResult::new(suite.to_string());
|
||||
let eval_lock = Mutex::new(0);
|
||||
|
||||
if let Ok(work_dir) = work_dir.move_to(format!("./{}", &suite)) {
|
||||
if let Some(evals) = EvaluationSuiteFactory::create(suite) {
|
||||
for eval in evals {
|
||||
let _unused = eval_lock.lock().await;
|
||||
work_dir.set_eval(eval.name());
|
||||
let eval_result = run_eval(eval, work_dir).await?;
|
||||
suite_result.add_evaluation(eval_result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(suite_result)
|
||||
}
|
||||
@@ -135,25 +133,26 @@ pub async fn run_benchmark(
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let config = Config::global();
|
||||
let goose_model: String = config
|
||||
.get("GOOSE_MODEL")
|
||||
.expect("No model configured. Run 'goose configure' first");
|
||||
let provider_name: String = config
|
||||
.get("GOOSE_PROVIDER")
|
||||
.expect("No provider configured. Run 'goose configure' first");
|
||||
|
||||
let mut results = BenchmarkResults::new(provider_name.clone());
|
||||
|
||||
let current_time = Local::now().format("%H:%M:%S").to_string();
|
||||
let current_date = Local::now().format("%Y-%m-%d").to_string();
|
||||
if let Ok(mut work_dir) = WorkDir::at(
|
||||
format!("./benchmark-{}", &provider_name),
|
||||
let mut work_dir = BenchmarkWorkDir::new(
|
||||
format!("{}-{}", provider_name, goose_model),
|
||||
include_dirs.clone(),
|
||||
) {
|
||||
if let Ok(work_dir) = work_dir.move_to(format!("./{}-{}", ¤t_date, current_time)) {
|
||||
);
|
||||
let suite_lock = Mutex::new(0);
|
||||
for suite in suites {
|
||||
let suite_result = run_suite(suite, work_dir).await?;
|
||||
let _unused = suite_lock.lock().await;
|
||||
work_dir.set_suite(suite);
|
||||
let suite_result = run_suite(suite, &mut work_dir).await?;
|
||||
results.add_suite(suite_result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user