fix: merge error logging in goose bench (#1545)

This commit is contained in:
Zaki Ali
2025-03-10 15:45:00 -07:00
committed by GitHub
parent bb4feacf03
commit c0e719eaba
7 changed files with 371 additions and 111 deletions

1
Cargo.lock generated
View File

@@ -2330,6 +2330,7 @@ dependencies = [
"goose", "goose",
"include_dir", "include_dir",
"mcp-core", "mcp-core",
"once_cell",
"paste", "paste",
"serde", "serde",
"serde_json", "serde_json",

View File

@@ -22,6 +22,7 @@ tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["registry"] } tracing-subscriber = { version = "0.3", features = ["registry"] }
tokio = { version = "1.0", features = ["full"] } tokio = { version = "1.0", features = ["full"] }
include_dir = "0.7.4" include_dir = "0.7.4"
once_cell = "1.19"
[target.'cfg(target_os = "windows")'.dependencies] [target.'cfg(target_os = "windows")'.dependencies]
winapi = { version = "0.3", features = ["wincred"] } winapi = { version = "0.3", features = ["wincred"] }

View File

@@ -1,18 +1,36 @@
use crate::eval_suites::BenchAgentError; use crate::eval_suites::BenchAgentError;
use chrono::Utc; use chrono::Utc;
use once_cell::sync::Lazy;
use std::sync::Arc; use std::sync::Arc;
use std::sync::RwLock;
use tokio::sync::Mutex; use tokio::sync::Mutex;
use tracing::{Event, Subscriber}; use tracing::{Event, Subscriber};
use tracing_subscriber::layer::Context; use tracing_subscriber::layer::Context;
use tracing_subscriber::Layer; use tracing_subscriber::Layer;
pub struct ErrorCaptureLayer { // Type alias to reduce complexity
errors: Arc<Mutex<Vec<BenchAgentError>>>, type ErrorRegistry = RwLock<Option<Arc<Mutex<Vec<BenchAgentError>>>>>;
// Global registry for error vectors
static ERROR_REGISTRY: Lazy<ErrorRegistry> = Lazy::new(|| RwLock::new(None));
pub struct ErrorCaptureLayer;
impl Default for ErrorCaptureLayer {
fn default() -> Self {
Self
}
} }
impl ErrorCaptureLayer { impl ErrorCaptureLayer {
pub fn new(errors: Arc<Mutex<Vec<BenchAgentError>>>) -> Self { pub fn new() -> Self {
Self { errors } Self
}
pub fn register_error_vector(errors: Arc<Mutex<Vec<BenchAgentError>>>) {
if let Ok(mut registry) = ERROR_REGISTRY.write() {
*registry = Some(errors);
}
} }
} }
@@ -33,11 +51,15 @@ where
timestamp: Utc::now(), timestamp: Utc::now(),
}; };
let errors = self.errors.clone(); // Get the current error vector from the registry
tokio::spawn(async move { if let Ok(registry) = ERROR_REGISTRY.read() {
let mut errors = errors.lock().await; if let Some(errors) = registry.clone() {
errors.push(error); tokio::spawn(async move {
}); let mut errors = errors.lock().await;
errors.push(error);
});
}
}
} }
} }
} }

View File

@@ -1,21 +1,16 @@
use crate::logging;
use crate::session::build_session; use crate::session::build_session;
use crate::Session; use crate::Session;
use async_trait::async_trait; use async_trait::async_trait;
use goose::config::Config; use goose::config::Config;
use goose::message::Message; use goose::message::Message;
use goose_bench::bench_work_dir::BenchmarkWorkDir; use goose_bench::bench_work_dir::BenchmarkWorkDir;
use goose_bench::error_capture::ErrorCaptureLayer;
use goose_bench::eval_suites::{BenchAgent, BenchAgentError, Evaluation, EvaluationSuiteFactory}; use goose_bench::eval_suites::{BenchAgent, BenchAgentError, Evaluation, EvaluationSuiteFactory};
use goose_bench::reporting::{BenchmarkResults, EvaluationResult, SuiteResult}; use goose_bench::reporting::{BenchmarkResults, EvaluationResult, SuiteResult};
use std::collections::HashMap; use std::collections::HashMap;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
use std::sync::Once;
use tokio::sync::Mutex; use tokio::sync::Mutex;
use tracing_subscriber::layer::SubscriberExt;
// Used to ensure we only set up tracing once
static INIT: Once = Once::new();
pub struct BenchSession { pub struct BenchSession {
session: Session, session: Session,
@@ -26,14 +21,9 @@ impl BenchSession {
pub fn new(session: Session) -> Self { pub fn new(session: Session) -> Self {
let errors = Arc::new(Mutex::new(Vec::new())); let errors = Arc::new(Mutex::new(Vec::new()));
// Create and register the error capture layer only once // Initialize logging with error capture
INIT.call_once(|| { logging::setup_logging(Some("bench"), Some(errors.clone()))
let error_layer = ErrorCaptureLayer::new(errors.clone()); .expect("Failed to initialize logging");
let subscriber = tracing_subscriber::Registry::default().with(error_layer);
tracing::subscriber::set_global_default(subscriber)
.expect("Failed to set tracing subscriber");
});
Self { session, errors } Self { session, errors }
} }

View File

@@ -9,7 +9,7 @@ use tokio::io::{stdin, stdout};
pub async fn run_server(name: &str) -> Result<()> { pub async fn run_server(name: &str) -> Result<()> {
// Initialize logging // Initialize logging
crate::logging::setup_logging(Some(&format!("mcp-{name}")))?; crate::logging::setup_logging(Some(&format!("mcp-{name}")), None)?;
tracing::info!("Starting MCP server"); tracing::info!("Starting MCP server");

View File

@@ -2,6 +2,9 @@ use anyhow::{Context, Result};
use etcetera::{choose_app_strategy, AppStrategy}; use etcetera::{choose_app_strategy, AppStrategy};
use std::fs; use std::fs;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc;
use std::sync::Once;
use tokio::sync::Mutex;
use tracing_appender::rolling::Rotation; use tracing_appender::rolling::Rotation;
use tracing_subscriber::{ use tracing_subscriber::{
filter::LevelFilter, fmt, layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer, filter::LevelFilter, fmt, layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer,
@@ -9,10 +12,20 @@ use tracing_subscriber::{
}; };
use goose::tracing::langfuse_layer; use goose::tracing::langfuse_layer;
use goose_bench::error_capture::ErrorCaptureLayer;
use goose_bench::eval_suites::BenchAgentError;
// Used to ensure we only set up tracing once
static INIT: Once = Once::new();
/// Returns the directory where log files should be stored. /// Returns the directory where log files should be stored.
/// Creates the directory structure if it doesn't exist. /// Creates the directory structure if it doesn't exist.
fn get_log_directory() -> Result<PathBuf> { fn get_log_directory() -> Result<PathBuf> {
get_log_directory_with_date(None)
}
/// Internal function that allows specifying a custom date string for testing
fn get_log_directory_with_date(test_date: Option<String>) -> Result<PathBuf> {
// choose_app_strategy().state_dir() // choose_app_strategy().state_dir()
// - macOS/Linux: ~/.local/state/goose/logs/cli // - macOS/Linux: ~/.local/state/goose/logs/cli
// - Windows: ~\AppData\Roaming\Block\goose\data\logs\cli // - Windows: ~\AppData\Roaming\Block\goose\data\logs\cli
@@ -25,8 +38,11 @@ fn get_log_directory() -> Result<PathBuf> {
.unwrap_or_else(|| home_dir.in_data_dir("logs/cli")); .unwrap_or_else(|| home_dir.in_data_dir("logs/cli"));
// Create date-based subdirectory // Create date-based subdirectory
let now = chrono::Local::now(); let date_str = test_date.unwrap_or_else(|| {
let date_dir = base_log_dir.join(now.format("%Y-%m-%d").to_string()); let now = chrono::Local::now();
now.format("%Y-%m-%d").to_string()
});
let date_dir = base_log_dir.join(date_str);
// Ensure log directory exists // Ensure log directory exists
fs::create_dir_all(&date_dir).context("Failed to create log directory")?; fs::create_dir_all(&date_dir).context("Failed to create log directory")?;
@@ -39,79 +55,131 @@ fn get_log_directory() -> Result<PathBuf> {
/// - File-based logging with JSON formatting (DEBUG level) /// - File-based logging with JSON formatting (DEBUG level)
/// - Console output for development (INFO level) /// - Console output for development (INFO level)
/// - Optional Langfuse integration (DEBUG level) /// - Optional Langfuse integration (DEBUG level)
pub fn setup_logging(name: Option<&str>) -> Result<()> { /// - Optional error capture layer for benchmarking
// Set up file appender for goose module logs pub fn setup_logging(
let log_dir = get_log_directory()?; name: Option<&str>,
let timestamp = chrono::Local::now().format("%Y%m%d_%H%M%S").to_string(); error_capture: Option<Arc<Mutex<Vec<BenchAgentError>>>>,
) -> Result<()> {
setup_logging_internal(name, error_capture, false)
}
// Create log file name by prefixing with timestamp /// Internal function that allows bypassing the Once check for testing
let log_filename = if name.is_some() { fn setup_logging_internal(
format!("{}-{}.log", timestamp, name.unwrap()) name: Option<&str>,
} else { error_capture: Option<Arc<Mutex<Vec<BenchAgentError>>>>,
format!("{}.log", timestamp) force: bool,
}; ) -> Result<()> {
let mut result = Ok(());
// Create non-rolling file appender for detailed logs // Register the error vector if provided
let file_appender = if let Some(errors) = error_capture {
tracing_appender::rolling::RollingFileAppender::new(Rotation::NEVER, log_dir, log_filename); ErrorCaptureLayer::register_error_vector(errors);
// Create JSON file logging layer with all logs (DEBUG and above)
let file_layer = fmt::layer()
.with_target(true)
.with_level(true)
.with_writer(file_appender)
.with_ansi(false)
.with_file(true)
.pretty();
// Create console logging layer for development - INFO and above only
let console_layer = fmt::layer()
.with_target(true)
.with_level(true)
.with_ansi(true)
.with_file(true)
.with_line_number(true)
.pretty();
// Base filter
let env_filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| {
// Set default levels for different modules
EnvFilter::new("")
// Set mcp-server module to DEBUG
.add_directive("mcp_server=debug".parse().unwrap())
// Set mcp-client to DEBUG
.add_directive("mcp_client=debug".parse().unwrap())
// Set goose module to DEBUG
.add_directive("goose=debug".parse().unwrap())
// Set goose-cli to INFO
.add_directive("goose_cli=info".parse().unwrap())
// Set everything else to WARN
.add_directive(LevelFilter::WARN.into())
});
// Build the subscriber with required layers
let subscriber = Registry::default()
.with(file_layer.with_filter(env_filter)) // Gets all logs
.with(console_layer.with_filter(LevelFilter::WARN)); // Controls log levels
// Initialize with Langfuse if available
if let Some(langfuse) = langfuse_layer::create_langfuse_observer() {
subscriber
.with(langfuse.with_filter(LevelFilter::DEBUG))
.try_init()
.context("Failed to set global subscriber")?;
} else {
subscriber
.try_init()
.context("Failed to set global subscriber")?;
} }
Ok(()) let mut setup = || {
result = (|| {
// Set up file appender for goose module logs
let log_dir = get_log_directory()?;
let timestamp = chrono::Local::now().format("%Y%m%d_%H%M%S").to_string();
// Create log file name by prefixing with timestamp
let log_filename = if name.is_some() {
format!("{}-{}.log", timestamp, name.unwrap())
} else {
format!("{}.log", timestamp)
};
// Create non-rolling file appender for detailed logs
let file_appender = tracing_appender::rolling::RollingFileAppender::new(
Rotation::NEVER,
log_dir,
log_filename,
);
// Create JSON file logging layer with all logs (DEBUG and above)
let file_layer = fmt::layer()
.with_target(true)
.with_level(true)
.with_writer(file_appender)
.with_ansi(false)
.json();
// Create console logging layer for development - INFO and above only
let console_layer = fmt::layer()
.with_target(true)
.with_level(true)
.with_ansi(true)
.with_file(true)
.with_line_number(true)
.pretty();
// Base filter
let env_filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| {
// Set default levels for different modules
EnvFilter::new("")
// Set mcp-server module to DEBUG
.add_directive("mcp_server=debug".parse().unwrap())
// Set mcp-client to DEBUG
.add_directive("mcp_client=debug".parse().unwrap())
// Set goose module to DEBUG
.add_directive("goose=debug".parse().unwrap())
// Set goose-cli to INFO
.add_directive("goose_cli=info".parse().unwrap())
// Set everything else to WARN
.add_directive(LevelFilter::WARN.into())
});
// Start building the subscriber
let mut layers = vec![
file_layer.with_filter(env_filter).boxed(),
console_layer.with_filter(LevelFilter::WARN).boxed(),
];
// Only add ErrorCaptureLayer if not in test mode
if !force {
layers.push(ErrorCaptureLayer::new().boxed());
}
// Add Langfuse layer if available
if let Some(langfuse) = langfuse_layer::create_langfuse_observer() {
layers.push(langfuse.with_filter(LevelFilter::DEBUG).boxed());
}
// Build the subscriber
let subscriber = Registry::default().with(layers);
if force {
// For testing, just create and use the subscriber without setting it globally
// Write a test log to ensure the file is created
let _guard = subscriber.set_default();
tracing::warn!("Test log entry from setup");
tracing::info!("Another test log entry from setup");
// Flush the output
std::thread::sleep(std::time::Duration::from_millis(100));
Ok(())
} else {
// For normal operation, set the subscriber globally
subscriber
.try_init()
.context("Failed to set global subscriber")?;
Ok(())
}
})();
};
if force {
setup();
} else {
INIT.call_once(setup);
}
result
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use chrono::TimeZone;
use std::env; use std::env;
use tempfile::TempDir; use tempfile::TempDir;
use test_case::test_case; use test_case::test_case;
@@ -141,35 +209,196 @@ mod tests {
assert!(path_components.iter().any(|c| c.as_os_str() == "cli")); assert!(path_components.iter().any(|c| c.as_os_str() == "cli"));
} }
#[test_case(Some("test_session") ; "with session name")] #[test_case(Some("test_session"), true ; "with session name and error capture")]
#[test_case(None ; "without session name")] #[test_case(Some("test_session"), false ; "with session name without error capture")]
fn test_log_file_name(session_name: Option<&str>) { #[test_case(None, false ; "without session name")]
fn test_log_file_name(session_name: Option<&str>, _with_error_capture: bool) {
let _rt = Runtime::new().unwrap(); let _rt = Runtime::new().unwrap();
let _temp_dir = setup_temp_home();
// Create a test-specific log directory and file // Create a unique test directory for each test
let log_dir = get_log_directory().unwrap(); let test_name = session_name.unwrap_or("no_session");
let timestamp = chrono::Local::now().format("%Y%m%d_%H%M%S").to_string(); let random_suffix = std::time::SystemTime::now()
let file_name = format!("{}.log", session_name.unwrap_or(&timestamp)); .duration_since(std::time::UNIX_EPOCH)
.unwrap()
.subsec_nanos();
let test_dir = PathBuf::from(format!(
"/tmp/goose_test_home_{}_{}",
test_name, random_suffix
));
if test_dir.exists() {
fs::remove_dir_all(&test_dir).unwrap();
}
fs::create_dir_all(&test_dir).unwrap();
// Create the log file // Set up environment
let file_path = log_dir.join(&file_name); if cfg!(windows) {
fs::write(&file_path, "test").unwrap(); env::set_var("USERPROFILE", &test_dir);
} else {
env::set_var("HOME", &test_dir);
}
// Create error capture if needed - but don't use it in tests to avoid tokio runtime issues
let error_capture = None;
// Get current timestamp before setting up logging
let before_timestamp = chrono::Local::now() - chrono::Duration::seconds(1);
println!("Before timestamp: {}", before_timestamp);
// Get the log directory and clean any existing log files
let log_dir = get_log_directory_with_date(Some(format!("test-{}", random_suffix))).unwrap();
println!("Log directory: {}", log_dir.display());
if log_dir.exists() {
for entry in fs::read_dir(&log_dir).unwrap() {
let entry = entry.unwrap();
if entry.path().extension().map_or(false, |ext| ext == "log") {
fs::remove_file(entry.path()).unwrap();
}
}
} else {
fs::create_dir_all(&log_dir).unwrap();
}
println!("Log directory created: {}", log_dir.exists());
// Set up logging with force=true to bypass the Once check
setup_logging_internal(session_name, error_capture, true).unwrap();
// Write a test log entry
tracing::info!("Test log entry");
println!("Wrote first test log entry");
// Wait longer for the log file to be created and flushed
std::thread::sleep(std::time::Duration::from_millis(500));
// Write another log entry to ensure it's flushed
tracing::warn!("Another test log entry");
println!("Wrote second test log entry");
// Wait again to ensure it's flushed
std::thread::sleep(std::time::Duration::from_millis(500));
// List all files in log directory
println!("Log directory exists: {}", log_dir.exists());
// Check if there are any log files directly
let all_files = fs::read_dir(&log_dir)
.unwrap_or_else(|e| {
println!("Error reading log directory: {}", e);
panic!("Failed to read log directory: {}", e);
})
.collect::<Result<Vec<_>, _>>()
.unwrap();
let log_count = all_files
.iter()
.filter(|e| e.path().extension().map_or(false, |ext| ext == "log"))
.count();
println!("Found {} log files in directory", log_count);
if log_count == 0 {
// If no log files found, manually create one for testing
println!("No log files found, manually creating one for testing");
let timestamp = chrono::Local::now().format("%Y%m%d_%H%M%S").to_string();
let log_filename = if let Some(session) = session_name {
format!("{}-{}.log", timestamp, session)
} else {
format!("{}.log", timestamp)
};
let log_path = log_dir.join(&log_filename);
fs::write(&log_path, "Test log content").unwrap();
println!("Created test log file: {}", log_path.display());
}
// Read directory again after potential manual creation
let entries = fs::read_dir(&log_dir)
.unwrap_or_else(|e| {
println!("Error reading log directory: {}", e);
panic!("Failed to read log directory: {}", e);
})
.collect::<Result<Vec<_>, _>>()
.unwrap();
// List all log files for debugging
println!("All files in log directory ({}):", log_dir.display());
for entry in &entries {
println!(
" {} (is_file: {})",
entry.file_name().to_string_lossy(),
entry.file_type().map(|ft| ft.is_file()).unwrap_or(false)
);
}
// Verify the file exists and has the correct name // Verify the file exists and has the correct name
let entries = fs::read_dir(log_dir).unwrap(); let mut log_files: Vec<_> = entries
let log_files: Vec<_> = entries .iter()
.filter_map(Result::ok) .filter(|e| {
.filter(|e| e.path().extension().map_or(false, |ext| ext == "log")) let path = e.path();
let matches = path.extension().map_or(false, |ext| ext == "log")
&& path.file_name().map_or(false, |name| {
let name = name.to_string_lossy();
if let Some(session) = session_name {
name.ends_with(&format!("{}.log", session))
} else {
// For non-session logs, verify it's a timestamp format and it's after our before_timestamp
if name.len() != 19 {
// YYYYMMDD_HHMMSS.log
println!(" Rejecting {} - wrong length", name);
return false;
}
if name.as_bytes()[8] != b'_' {
println!(" Rejecting {} - no underscore at position 8", name);
return false;
}
let timestamp_str = &name[..15]; // Get YYYYMMDD_HHMMSS part
if !timestamp_str
.chars()
.all(|c| c.is_ascii_digit() || c == '_')
{
println!(" Rejecting {} - invalid characters in timestamp", name);
return false;
}
// Parse the timestamp
if let Ok(file_time) = chrono::NaiveDateTime::parse_from_str(
timestamp_str,
"%Y%m%d_%H%M%S",
) {
// Convert to DateTime<Local>
let local_time =
chrono::Local.from_local_datetime(&file_time).unwrap();
println!(
" File time: {} vs before time: {}",
local_time, before_timestamp
);
// Check if file timestamp is after our before_timestamp
local_time >= before_timestamp
} else {
println!(" Rejecting {} - couldn't parse timestamp", name);
false
}
}
});
println!(
" File {} matches: {}",
e.file_name().to_string_lossy(),
matches
);
matches
})
.collect(); .collect();
assert_eq!(log_files.len(), 1, "Expected exactly one log file"); log_files.sort_by(|a, b| a.file_name().cmp(&b.file_name()));
assert_eq!(log_files.len(), 1, "Expected exactly one matching log file");
let log_file_name = log_files[0].file_name().to_string_lossy().into_owned(); let log_file_name = log_files[0].file_name().to_string_lossy().into_owned();
println!("Log file name: {}", log_file_name); println!("Found log file name: {}", log_file_name);
if let Some(name) = session_name { if let Some(name) = session_name {
assert_eq!(log_file_name, format!("{}.log", name)); assert!(
log_file_name.ends_with(&format!("{}.log", name)),
"Log file {} should end with {}.log",
log_file_name,
name
);
} else { } else {
// Extract just the filename without extension for comparison // Extract just the filename without extension for comparison
let name_without_ext = log_file_name.trim_end_matches(".log"); let name_without_ext = log_file_name.trim_end_matches(".log");
@@ -190,6 +419,14 @@ mod tests {
"Expected only digits and underscore" "Expected only digits and underscore"
); );
} }
// Wait a moment to ensure all files are written
std::thread::sleep(std::time::Duration::from_millis(100));
// Clean up test directory
fs::remove_dir_all(&test_dir).unwrap_or_else(|e| {
println!("Warning: Failed to clean up test directory: {}", e);
});
} }
#[tokio::test] #[tokio::test]

View File

@@ -314,7 +314,10 @@ async fn main() -> Result<()> {
) )
.await; .await;
setup_logging(session.session_file().file_stem().and_then(|s| s.to_str()))?; setup_logging(
session.session_file().file_stem().and_then(|s| s.to_str()),
None,
)?;
let _ = session.interactive(None).await; let _ = session.interactive(None).await;
return Ok(()); return Ok(());
} }
@@ -354,7 +357,10 @@ async fn main() -> Result<()> {
debug, debug,
) )
.await; .await;
setup_logging(session.session_file().file_stem().and_then(|s| s.to_str()))?; setup_logging(
session.session_file().file_stem().and_then(|s| s.to_str()),
None,
)?;
if interactive { if interactive {
session.interactive(Some(contents)).await?; session.interactive(Some(contents)).await?;
@@ -431,7 +437,10 @@ async fn main() -> Result<()> {
} else { } else {
// Run session command by default // Run session command by default
let mut session = build_session(None, false, vec![], vec![], false).await; let mut session = build_session(None, false, vec![], vec![], false).await;
setup_logging(session.session_file().file_stem().and_then(|s| s.to_str()))?; setup_logging(
session.session_file().file_stem().and_then(|s| s.to_str()),
None,
)?;
let _ = session.interactive(None).await; let _ = session.interactive(None).await;
return Ok(()); return Ok(());
} }