mirror of
https://github.com/aljazceru/goose.git
synced 2025-12-18 14:44:21 +01:00
bugfix: refactor workdirs to be async-safe, and simpler (#1558)
This commit is contained in:
@@ -1,13 +1,12 @@
|
||||
use crate::session::build_session;
|
||||
use crate::Session;
|
||||
use async_trait::async_trait;
|
||||
use chrono::Local;
|
||||
use goose::config::Config;
|
||||
use goose::message::Message;
|
||||
use goose_bench::bench_work_dir::BenchmarkWorkDir;
|
||||
use goose_bench::error_capture::ErrorCaptureLayer;
|
||||
use goose_bench::eval_suites::{BenchAgent, BenchAgentError, Evaluation, EvaluationSuiteFactory};
|
||||
use goose_bench::reporting::{BenchmarkResults, EvaluationResult, SuiteResult};
|
||||
use goose_bench::work_dir::WorkDir;
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
@@ -77,48 +76,47 @@ impl BenchAgent for BenchAgentWrapper {
|
||||
|
||||
async fn run_eval(
|
||||
evaluation: Box<dyn Evaluation>,
|
||||
work_dir: &mut WorkDir,
|
||||
work_dir: &mut BenchmarkWorkDir,
|
||||
) -> anyhow::Result<EvaluationResult> {
|
||||
let mut result = EvaluationResult::new(evaluation.name().to_string());
|
||||
|
||||
if let Ok(work_dir) = work_dir.move_to(format!("./{}", &evaluation.name())) {
|
||||
let requirements = evaluation.required_extensions();
|
||||
let requirements = evaluation.required_extensions();
|
||||
|
||||
// Create session with error capture
|
||||
let base_session =
|
||||
build_session(None, false, requirements.external, requirements.builtin).await;
|
||||
// Create session with error capture
|
||||
let base_session =
|
||||
build_session(None, false, requirements.external, requirements.builtin).await;
|
||||
|
||||
let bench_session = Arc::new(Mutex::new(BenchSession::new(base_session)));
|
||||
let bench_session_clone = bench_session.clone();
|
||||
let bench_session = Arc::new(Mutex::new(BenchSession::new(base_session)));
|
||||
let bench_session_clone = bench_session.clone();
|
||||
|
||||
if let Ok(metrics) = evaluation
|
||||
.run(Box::new(BenchAgentWrapper(bench_session)), work_dir)
|
||||
.await
|
||||
{
|
||||
for (name, metric) in metrics {
|
||||
result.add_metric(name, metric);
|
||||
}
|
||||
if let Ok(metrics) = evaluation
|
||||
.run(Box::new(BenchAgentWrapper(bench_session)), work_dir)
|
||||
.await
|
||||
{
|
||||
for (name, metric) in metrics {
|
||||
result.add_metric(name, metric);
|
||||
}
|
||||
|
||||
// Add any errors that occurred
|
||||
let agent = BenchAgentWrapper(bench_session_clone);
|
||||
for error in agent.get_errors().await {
|
||||
result.add_error(error);
|
||||
}
|
||||
// Add any errors that occurred
|
||||
let agent = BenchAgentWrapper(bench_session_clone);
|
||||
for error in agent.get_errors().await {
|
||||
result.add_error(error);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn run_suite(suite: &str, work_dir: &mut WorkDir) -> anyhow::Result<SuiteResult> {
|
||||
async fn run_suite(suite: &str, work_dir: &mut BenchmarkWorkDir) -> anyhow::Result<SuiteResult> {
|
||||
let mut suite_result = SuiteResult::new(suite.to_string());
|
||||
let eval_lock = Mutex::new(0);
|
||||
|
||||
if let Ok(work_dir) = work_dir.move_to(format!("./{}", &suite)) {
|
||||
if let Some(evals) = EvaluationSuiteFactory::create(suite) {
|
||||
for eval in evals {
|
||||
let eval_result = run_eval(eval, work_dir).await?;
|
||||
suite_result.add_evaluation(eval_result);
|
||||
}
|
||||
if let Some(evals) = EvaluationSuiteFactory::create(suite) {
|
||||
for eval in evals {
|
||||
let _unused = eval_lock.lock().await;
|
||||
work_dir.set_eval(eval.name());
|
||||
let eval_result = run_eval(eval, work_dir).await?;
|
||||
suite_result.add_evaluation(eval_result);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,24 +133,25 @@ pub async fn run_benchmark(
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let config = Config::global();
|
||||
let goose_model: String = config
|
||||
.get("GOOSE_MODEL")
|
||||
.expect("No model configured. Run 'goose configure' first");
|
||||
let provider_name: String = config
|
||||
.get("GOOSE_PROVIDER")
|
||||
.expect("No provider configured. Run 'goose configure' first");
|
||||
|
||||
let mut results = BenchmarkResults::new(provider_name.clone());
|
||||
|
||||
let current_time = Local::now().format("%H:%M:%S").to_string();
|
||||
let current_date = Local::now().format("%Y-%m-%d").to_string();
|
||||
if let Ok(mut work_dir) = WorkDir::at(
|
||||
format!("./benchmark-{}", &provider_name),
|
||||
let mut work_dir = BenchmarkWorkDir::new(
|
||||
format!("{}-{}", provider_name, goose_model),
|
||||
include_dirs.clone(),
|
||||
) {
|
||||
if let Ok(work_dir) = work_dir.move_to(format!("./{}-{}", &current_date, current_time)) {
|
||||
for suite in suites {
|
||||
let suite_result = run_suite(suite, work_dir).await?;
|
||||
results.add_suite(suite_result);
|
||||
}
|
||||
}
|
||||
);
|
||||
let suite_lock = Mutex::new(0);
|
||||
for suite in suites {
|
||||
let _unused = suite_lock.lock().await;
|
||||
work_dir.set_suite(suite);
|
||||
let suite_result = run_suite(suite, &mut work_dir).await?;
|
||||
results.add_suite(suite_result);
|
||||
}
|
||||
|
||||
Ok(results)
|
||||
|
||||
Reference in New Issue
Block a user