mirror of
https://github.com/aljazceru/goose.git
synced 2026-01-11 02:14:22 +01:00
fix: ensure repeating benches return to initial run-dir (#1617)
This commit is contained in:
@@ -11,6 +11,7 @@ pub static BUILTIN_EVAL_ASSETS: Dir = include_dir!("$CARGO_MANIFEST_DIR/src/asse
|
||||
|
||||
pub struct BenchmarkWorkDir {
|
||||
pub base_path: PathBuf,
|
||||
run_dir: PathBuf,
|
||||
cwd: PathBuf,
|
||||
run_name: String,
|
||||
suite: Option<String>,
|
||||
@@ -24,6 +25,7 @@ impl Default for BenchmarkWorkDir {
|
||||
}
|
||||
impl BenchmarkWorkDir {
|
||||
pub fn new(work_dir_name: String, include_dirs: Vec<PathBuf>) -> Self {
|
||||
let run_dir = std::env::current_dir().unwrap().canonicalize().unwrap();
|
||||
let base_path = PathBuf::from(format!("./benchmark-{}", work_dir_name));
|
||||
fs::create_dir_all(&base_path).unwrap();
|
||||
|
||||
@@ -54,6 +56,7 @@ impl BenchmarkWorkDir {
|
||||
|
||||
BenchmarkWorkDir {
|
||||
base_path: base_path.clone(),
|
||||
run_dir,
|
||||
cwd: base_path.clone(),
|
||||
run_name,
|
||||
suite: None,
|
||||
@@ -178,3 +181,9 @@ impl BenchmarkWorkDir {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for BenchmarkWorkDir {
|
||||
fn drop(&mut self) {
|
||||
std::env::set_current_dir(&self.run_dir).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,15 +20,17 @@ impl Evaluation for ExampleEval {
|
||||
_work_dir: &mut BenchmarkWorkDir,
|
||||
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
|
||||
println!("ExampleEval - run");
|
||||
// let f = work_dir.fs_get(String::from("./arbitrary_dir/arbitrary_file.txt"))?;
|
||||
// let _contents = fs::read_to_string(f)?;
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
let _ = agent.prompt("What can you do?".to_string()).await;
|
||||
|
||||
metrics.push((
|
||||
"example_metric".to_string(),
|
||||
EvaluationMetric::Boolean(true),
|
||||
));
|
||||
|
||||
metrics.push(("example_count".to_string(), EvaluationMetric::Integer(42)));
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
|
||||
@@ -124,13 +124,13 @@ async fn run_eval(
|
||||
|
||||
async fn run_suite(suite: &str, work_dir: &mut BenchmarkWorkDir) -> anyhow::Result<SuiteResult> {
|
||||
let mut suite_result = SuiteResult::new(suite.to_string());
|
||||
let eval_lock = Mutex::new(());
|
||||
let eval_work_dir_guard = Mutex::new(work_dir);
|
||||
|
||||
if let Some(evals) = EvaluationSuiteFactory::create(suite) {
|
||||
for eval in evals {
|
||||
let _unused = eval_lock.lock().await;
|
||||
work_dir.set_eval(eval.name());
|
||||
let eval_result = run_eval(eval, work_dir).await?;
|
||||
let mut eval_work_dir = eval_work_dir_guard.lock().await;
|
||||
eval_work_dir.set_eval(eval.name());
|
||||
let eval_result = run_eval(eval, &mut eval_work_dir).await?;
|
||||
suite_result.add_evaluation(eval_result);
|
||||
}
|
||||
}
|
||||
@@ -157,13 +157,13 @@ pub async fn run_benchmark(
|
||||
|
||||
let mut results = BenchmarkResults::new(provider_name.clone());
|
||||
|
||||
let mut work_dir = BenchmarkWorkDir::new(
|
||||
let suite_work_dir = Mutex::new(BenchmarkWorkDir::new(
|
||||
format!("{}-{}", provider_name, goose_model),
|
||||
include_dirs.clone(),
|
||||
);
|
||||
let suite_lock = Mutex::new(());
|
||||
));
|
||||
|
||||
for suite in suites {
|
||||
let _unused = suite_lock.lock().await;
|
||||
let mut work_dir = suite_work_dir.lock().await;
|
||||
work_dir.set_suite(suite);
|
||||
let suite_result = run_suite(suite, &mut work_dir).await?;
|
||||
results.add_suite(suite_result);
|
||||
|
||||
Reference in New Issue
Block a user