fix: ensure repeating benches return to initial run-dir (#1617)

This commit is contained in:
marcelle
2025-03-11 11:44:57 -04:00
committed by GitHub
parent 9447b792ee
commit c23be1eb19
3 changed files with 21 additions and 10 deletions

View File

@@ -11,6 +11,7 @@ pub static BUILTIN_EVAL_ASSETS: Dir = include_dir!("$CARGO_MANIFEST_DIR/src/asse
pub struct BenchmarkWorkDir {
pub base_path: PathBuf,
run_dir: PathBuf,
cwd: PathBuf,
run_name: String,
suite: Option<String>,
@@ -24,6 +25,7 @@ impl Default for BenchmarkWorkDir {
}
impl BenchmarkWorkDir {
pub fn new(work_dir_name: String, include_dirs: Vec<PathBuf>) -> Self {
let run_dir = std::env::current_dir().unwrap().canonicalize().unwrap();
let base_path = PathBuf::from(format!("./benchmark-{}", work_dir_name));
fs::create_dir_all(&base_path).unwrap();
@@ -54,6 +56,7 @@ impl BenchmarkWorkDir {
BenchmarkWorkDir {
base_path: base_path.clone(),
run_dir,
cwd: base_path.clone(),
run_name,
suite: None,
@@ -178,3 +181,9 @@ impl BenchmarkWorkDir {
}
}
}
/// Restores the process working directory to `run_dir` when the work dir
/// value goes out of scope.
impl Drop for BenchmarkWorkDir {
    /// Changes the current directory back to `self.run_dir`.
    ///
    /// A destructor must not panic: if this value is dropped while another
    /// panic is already unwinding, a second panic aborts the whole process.
    /// A failure to restore the directory is therefore reported on stderr
    /// rather than unwrapped.
    fn drop(&mut self) {
        if let Err(err) = std::env::set_current_dir(&self.run_dir) {
            eprintln!(
                "BenchmarkWorkDir: failed to restore working directory to {}: {}",
                self.run_dir.display(),
                err
            );
        }
    }
}

View File

@@ -20,15 +20,17 @@ impl Evaluation for ExampleEval {
_work_dir: &mut BenchmarkWorkDir,
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
println!("ExampleEval - run");
// let f = work_dir.fs_get(String::from("./arbitrary_dir/arbitrary_file.txt"))?;
// let _contents = fs::read_to_string(f)?;
let mut metrics = Vec::new();
let _ = agent.prompt("What can you do?".to_string()).await;
metrics.push((
"example_metric".to_string(),
EvaluationMetric::Boolean(true),
));
metrics.push(("example_count".to_string(), EvaluationMetric::Integer(42)));
Ok(metrics)
}

View File

@@ -124,13 +124,13 @@ async fn run_eval(
async fn run_suite(suite: &str, work_dir: &mut BenchmarkWorkDir) -> anyhow::Result<SuiteResult> {
let mut suite_result = SuiteResult::new(suite.to_string());
let eval_lock = Mutex::new(());
let eval_work_dir_guard = Mutex::new(work_dir);
if let Some(evals) = EvaluationSuiteFactory::create(suite) {
for eval in evals {
let _unused = eval_lock.lock().await;
work_dir.set_eval(eval.name());
let eval_result = run_eval(eval, work_dir).await?;
let mut eval_work_dir = eval_work_dir_guard.lock().await;
eval_work_dir.set_eval(eval.name());
let eval_result = run_eval(eval, &mut eval_work_dir).await?;
suite_result.add_evaluation(eval_result);
}
}
@@ -157,13 +157,13 @@ pub async fn run_benchmark(
let mut results = BenchmarkResults::new(provider_name.clone());
let mut work_dir = BenchmarkWorkDir::new(
let suite_work_dir = Mutex::new(BenchmarkWorkDir::new(
format!("{}-{}", provider_name, goose_model),
include_dirs.clone(),
);
let suite_lock = Mutex::new(());
));
for suite in suites {
let _unused = suite_lock.lock().await;
let mut work_dir = suite_work_dir.lock().await;
work_dir.set_suite(suite);
let suite_result = run_suite(suite, &mut work_dir).await?;
results.add_suite(suite_result);