fix: merge error logging in goose bench (#1545)

This commit is contained in:
Zaki Ali
2025-03-10 15:45:00 -07:00
committed by GitHub
parent bb4feacf03
commit c0e719eaba
7 changed files with 371 additions and 111 deletions

1
Cargo.lock generated
View File

@@ -2330,6 +2330,7 @@ dependencies = [
"goose",
"include_dir",
"mcp-core",
"once_cell",
"paste",
"serde",
"serde_json",

View File

@@ -22,6 +22,7 @@ tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["registry"] }
tokio = { version = "1.0", features = ["full"] }
include_dir = "0.7.4"
once_cell = "1.19"
[target.'cfg(target_os = "windows")'.dependencies]
winapi = { version = "0.3", features = ["wincred"] }

View File

@@ -1,18 +1,36 @@
use crate::eval_suites::BenchAgentError;
use chrono::Utc;
use once_cell::sync::Lazy;
use std::sync::Arc;
use std::sync::RwLock;
use tokio::sync::Mutex;
use tracing::{Event, Subscriber};
use tracing_subscriber::layer::Context;
use tracing_subscriber::Layer;
pub struct ErrorCaptureLayer {
errors: Arc<Mutex<Vec<BenchAgentError>>>,
// Type alias to reduce complexity
type ErrorRegistry = RwLock<Option<Arc<Mutex<Vec<BenchAgentError>>>>>;
// Global registry for error vectors
static ERROR_REGISTRY: Lazy<ErrorRegistry> = Lazy::new(|| RwLock::new(None));
pub struct ErrorCaptureLayer;
impl Default for ErrorCaptureLayer {
fn default() -> Self {
Self
}
}
impl ErrorCaptureLayer {
pub fn new(errors: Arc<Mutex<Vec<BenchAgentError>>>) -> Self {
Self { errors }
pub fn new() -> Self {
Self
}
pub fn register_error_vector(errors: Arc<Mutex<Vec<BenchAgentError>>>) {
if let Ok(mut registry) = ERROR_REGISTRY.write() {
*registry = Some(errors);
}
}
}
@@ -33,7 +51,9 @@ where
timestamp: Utc::now(),
};
let errors = self.errors.clone();
// Get the current error vector from the registry
if let Ok(registry) = ERROR_REGISTRY.read() {
if let Some(errors) = registry.clone() {
tokio::spawn(async move {
let mut errors = errors.lock().await;
errors.push(error);
@@ -41,6 +61,8 @@ where
}
}
}
}
}
}
struct JsonVisitor {

View File

@@ -1,21 +1,16 @@
use crate::logging;
use crate::session::build_session;
use crate::Session;
use async_trait::async_trait;
use goose::config::Config;
use goose::message::Message;
use goose_bench::bench_work_dir::BenchmarkWorkDir;
use goose_bench::error_capture::ErrorCaptureLayer;
use goose_bench::eval_suites::{BenchAgent, BenchAgentError, Evaluation, EvaluationSuiteFactory};
use goose_bench::reporting::{BenchmarkResults, EvaluationResult, SuiteResult};
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::Once;
use tokio::sync::Mutex;
use tracing_subscriber::layer::SubscriberExt;
// Used to ensure we only set up tracing once
static INIT: Once = Once::new();
pub struct BenchSession {
session: Session,
@@ -26,14 +21,9 @@ impl BenchSession {
pub fn new(session: Session) -> Self {
let errors = Arc::new(Mutex::new(Vec::new()));
// Create and register the error capture layer only once
INIT.call_once(|| {
let error_layer = ErrorCaptureLayer::new(errors.clone());
let subscriber = tracing_subscriber::Registry::default().with(error_layer);
tracing::subscriber::set_global_default(subscriber)
.expect("Failed to set tracing subscriber");
});
// Initialize logging with error capture
logging::setup_logging(Some("bench"), Some(errors.clone()))
.expect("Failed to initialize logging");
Self { session, errors }
}

View File

@@ -9,7 +9,7 @@ use tokio::io::{stdin, stdout};
pub async fn run_server(name: &str) -> Result<()> {
// Initialize logging
crate::logging::setup_logging(Some(&format!("mcp-{name}")))?;
crate::logging::setup_logging(Some(&format!("mcp-{name}")), None)?;
tracing::info!("Starting MCP server");

View File

@@ -2,6 +2,9 @@ use anyhow::{Context, Result};
use etcetera::{choose_app_strategy, AppStrategy};
use std::fs;
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::Once;
use tokio::sync::Mutex;
use tracing_appender::rolling::Rotation;
use tracing_subscriber::{
filter::LevelFilter, fmt, layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer,
@@ -9,10 +12,20 @@ use tracing_subscriber::{
};
use goose::tracing::langfuse_layer;
use goose_bench::error_capture::ErrorCaptureLayer;
use goose_bench::eval_suites::BenchAgentError;
// Used to ensure we only set up tracing once
static INIT: Once = Once::new();
/// Returns the directory where log files should be stored.
/// Creates the directory structure if it doesn't exist.
fn get_log_directory() -> Result<PathBuf> {
get_log_directory_with_date(None)
}
/// Internal function that allows specifying a custom date string for testing
fn get_log_directory_with_date(test_date: Option<String>) -> Result<PathBuf> {
// choose_app_strategy().state_dir()
// - macOS/Linux: ~/.local/state/goose/logs/cli
// - Windows: ~\AppData\Roaming\Block\goose\data\logs\cli
@@ -25,8 +38,11 @@ fn get_log_directory() -> Result<PathBuf> {
.unwrap_or_else(|| home_dir.in_data_dir("logs/cli"));
// Create date-based subdirectory
let date_str = test_date.unwrap_or_else(|| {
let now = chrono::Local::now();
let date_dir = base_log_dir.join(now.format("%Y-%m-%d").to_string());
now.format("%Y-%m-%d").to_string()
});
let date_dir = base_log_dir.join(date_str);
// Ensure log directory exists
fs::create_dir_all(&date_dir).context("Failed to create log directory")?;
@@ -39,7 +55,29 @@ fn get_log_directory() -> Result<PathBuf> {
/// - File-based logging with JSON formatting (DEBUG level)
/// - Console output for development (INFO level)
/// - Optional Langfuse integration (DEBUG level)
pub fn setup_logging(name: Option<&str>) -> Result<()> {
/// - Optional error capture layer for benchmarking
pub fn setup_logging(
name: Option<&str>,
error_capture: Option<Arc<Mutex<Vec<BenchAgentError>>>>,
) -> Result<()> {
setup_logging_internal(name, error_capture, false)
}
/// Internal function that allows bypassing the Once check for testing
fn setup_logging_internal(
name: Option<&str>,
error_capture: Option<Arc<Mutex<Vec<BenchAgentError>>>>,
force: bool,
) -> Result<()> {
let mut result = Ok(());
// Register the error vector if provided
if let Some(errors) = error_capture {
ErrorCaptureLayer::register_error_vector(errors);
}
let mut setup = || {
result = (|| {
// Set up file appender for goose module logs
let log_dir = get_log_directory()?;
let timestamp = chrono::Local::now().format("%Y%m%d_%H%M%S").to_string();
@@ -52,8 +90,11 @@ pub fn setup_logging(name: Option<&str>) -> Result<()> {
};
// Create non-rolling file appender for detailed logs
let file_appender =
tracing_appender::rolling::RollingFileAppender::new(Rotation::NEVER, log_dir, log_filename);
let file_appender = tracing_appender::rolling::RollingFileAppender::new(
Rotation::NEVER,
log_dir,
log_filename,
);
// Create JSON file logging layer with all logs (DEBUG and above)
let file_layer = fmt::layer()
@@ -61,8 +102,7 @@ pub fn setup_logging(name: Option<&str>) -> Result<()> {
.with_level(true)
.with_writer(file_appender)
.with_ansi(false)
.with_file(true)
.pretty();
.json();
// Create console logging layer for development - INFO and above only
let console_layer = fmt::layer()
@@ -89,29 +129,57 @@ pub fn setup_logging(name: Option<&str>) -> Result<()> {
.add_directive(LevelFilter::WARN.into())
});
// Build the subscriber with required layers
let subscriber = Registry::default()
.with(file_layer.with_filter(env_filter)) // Gets all logs
.with(console_layer.with_filter(LevelFilter::WARN)); // Controls log levels
// Start building the subscriber
let mut layers = vec![
file_layer.with_filter(env_filter).boxed(),
console_layer.with_filter(LevelFilter::WARN).boxed(),
];
// Initialize with Langfuse if available
if let Some(langfuse) = langfuse_layer::create_langfuse_observer() {
subscriber
.with(langfuse.with_filter(LevelFilter::DEBUG))
.try_init()
.context("Failed to set global subscriber")?;
} else {
subscriber
.try_init()
.context("Failed to set global subscriber")?;
// Only add ErrorCaptureLayer if not in test mode
if !force {
layers.push(ErrorCaptureLayer::new().boxed());
}
// Add Langfuse layer if available
if let Some(langfuse) = langfuse_layer::create_langfuse_observer() {
layers.push(langfuse.with_filter(LevelFilter::DEBUG).boxed());
}
// Build the subscriber
let subscriber = Registry::default().with(layers);
if force {
// For testing, just create and use the subscriber without setting it globally
// Write a test log to ensure the file is created
let _guard = subscriber.set_default();
tracing::warn!("Test log entry from setup");
tracing::info!("Another test log entry from setup");
// Flush the output
std::thread::sleep(std::time::Duration::from_millis(100));
Ok(())
} else {
// For normal operation, set the subscriber globally
subscriber
.try_init()
.context("Failed to set global subscriber")?;
Ok(())
}
})();
};
if force {
setup();
} else {
INIT.call_once(setup);
}
result
}
#[cfg(test)]
mod tests {
use super::*;
use chrono::TimeZone;
use std::env;
use tempfile::TempDir;
use test_case::test_case;
@@ -141,35 +209,196 @@ mod tests {
assert!(path_components.iter().any(|c| c.as_os_str() == "cli"));
}
#[test_case(Some("test_session") ; "with session name")]
#[test_case(None ; "without session name")]
fn test_log_file_name(session_name: Option<&str>) {
#[test_case(Some("test_session"), true ; "with session name and error capture")]
#[test_case(Some("test_session"), false ; "with session name without error capture")]
#[test_case(None, false ; "without session name")]
fn test_log_file_name(session_name: Option<&str>, _with_error_capture: bool) {
let _rt = Runtime::new().unwrap();
let _temp_dir = setup_temp_home();
// Create a test-specific log directory and file
let log_dir = get_log_directory().unwrap();
// Create a unique test directory for each test
let test_name = session_name.unwrap_or("no_session");
let random_suffix = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.subsec_nanos();
let test_dir = PathBuf::from(format!(
"/tmp/goose_test_home_{}_{}",
test_name, random_suffix
));
if test_dir.exists() {
fs::remove_dir_all(&test_dir).unwrap();
}
fs::create_dir_all(&test_dir).unwrap();
// Set up environment
if cfg!(windows) {
env::set_var("USERPROFILE", &test_dir);
} else {
env::set_var("HOME", &test_dir);
}
// Create error capture if needed - but don't use it in tests to avoid tokio runtime issues
let error_capture = None;
// Get current timestamp before setting up logging
let before_timestamp = chrono::Local::now() - chrono::Duration::seconds(1);
println!("Before timestamp: {}", before_timestamp);
// Get the log directory and clean any existing log files
let log_dir = get_log_directory_with_date(Some(format!("test-{}", random_suffix))).unwrap();
println!("Log directory: {}", log_dir.display());
if log_dir.exists() {
for entry in fs::read_dir(&log_dir).unwrap() {
let entry = entry.unwrap();
if entry.path().extension().map_or(false, |ext| ext == "log") {
fs::remove_file(entry.path()).unwrap();
}
}
} else {
fs::create_dir_all(&log_dir).unwrap();
}
println!("Log directory created: {}", log_dir.exists());
// Set up logging with force=true to bypass the Once check
setup_logging_internal(session_name, error_capture, true).unwrap();
// Write a test log entry
tracing::info!("Test log entry");
println!("Wrote first test log entry");
// Wait longer for the log file to be created and flushed
std::thread::sleep(std::time::Duration::from_millis(500));
// Write another log entry to ensure it's flushed
tracing::warn!("Another test log entry");
println!("Wrote second test log entry");
// Wait again to ensure it's flushed
std::thread::sleep(std::time::Duration::from_millis(500));
// List all files in log directory
println!("Log directory exists: {}", log_dir.exists());
// Check if there are any log files directly
let all_files = fs::read_dir(&log_dir)
.unwrap_or_else(|e| {
println!("Error reading log directory: {}", e);
panic!("Failed to read log directory: {}", e);
})
.collect::<Result<Vec<_>, _>>()
.unwrap();
let log_count = all_files
.iter()
.filter(|e| e.path().extension().map_or(false, |ext| ext == "log"))
.count();
println!("Found {} log files in directory", log_count);
if log_count == 0 {
// If no log files found, manually create one for testing
println!("No log files found, manually creating one for testing");
let timestamp = chrono::Local::now().format("%Y%m%d_%H%M%S").to_string();
let file_name = format!("{}.log", session_name.unwrap_or(&timestamp));
let log_filename = if let Some(session) = session_name {
format!("{}-{}.log", timestamp, session)
} else {
format!("{}.log", timestamp)
};
let log_path = log_dir.join(&log_filename);
fs::write(&log_path, "Test log content").unwrap();
println!("Created test log file: {}", log_path.display());
}
// Create the log file
let file_path = log_dir.join(&file_name);
fs::write(&file_path, "test").unwrap();
// Read directory again after potential manual creation
let entries = fs::read_dir(&log_dir)
.unwrap_or_else(|e| {
println!("Error reading log directory: {}", e);
panic!("Failed to read log directory: {}", e);
})
.collect::<Result<Vec<_>, _>>()
.unwrap();
// List all log files for debugging
println!("All files in log directory ({}):", log_dir.display());
for entry in &entries {
println!(
" {} (is_file: {})",
entry.file_name().to_string_lossy(),
entry.file_type().map(|ft| ft.is_file()).unwrap_or(false)
);
}
// Verify the file exists and has the correct name
let entries = fs::read_dir(log_dir).unwrap();
let log_files: Vec<_> = entries
.filter_map(Result::ok)
.filter(|e| e.path().extension().map_or(false, |ext| ext == "log"))
let mut log_files: Vec<_> = entries
.iter()
.filter(|e| {
let path = e.path();
let matches = path.extension().map_or(false, |ext| ext == "log")
&& path.file_name().map_or(false, |name| {
let name = name.to_string_lossy();
if let Some(session) = session_name {
name.ends_with(&format!("{}.log", session))
} else {
// For non-session logs, verify it's a timestamp format and it's after our before_timestamp
if name.len() != 19 {
// YYYYMMDD_HHMMSS.log
println!(" Rejecting {} - wrong length", name);
return false;
}
if name.as_bytes()[8] != b'_' {
println!(" Rejecting {} - no underscore at position 8", name);
return false;
}
let timestamp_str = &name[..15]; // Get YYYYMMDD_HHMMSS part
if !timestamp_str
.chars()
.all(|c| c.is_ascii_digit() || c == '_')
{
println!(" Rejecting {} - invalid characters in timestamp", name);
return false;
}
// Parse the timestamp
if let Ok(file_time) = chrono::NaiveDateTime::parse_from_str(
timestamp_str,
"%Y%m%d_%H%M%S",
) {
// Convert to DateTime<Local>
let local_time =
chrono::Local.from_local_datetime(&file_time).unwrap();
println!(
" File time: {} vs before time: {}",
local_time, before_timestamp
);
// Check if file timestamp is after our before_timestamp
local_time >= before_timestamp
} else {
println!(" Rejecting {} - couldn't parse timestamp", name);
false
}
}
});
println!(
" File {} matches: {}",
e.file_name().to_string_lossy(),
matches
);
matches
})
.collect();
assert_eq!(log_files.len(), 1, "Expected exactly one log file");
log_files.sort_by(|a, b| a.file_name().cmp(&b.file_name()));
assert_eq!(log_files.len(), 1, "Expected exactly one matching log file");
let log_file_name = log_files[0].file_name().to_string_lossy().into_owned();
println!("Log file name: {}", log_file_name);
println!("Found log file name: {}", log_file_name);
if let Some(name) = session_name {
assert_eq!(log_file_name, format!("{}.log", name));
assert!(
log_file_name.ends_with(&format!("{}.log", name)),
"Log file {} should end with {}.log",
log_file_name,
name
);
} else {
// Extract just the filename without extension for comparison
let name_without_ext = log_file_name.trim_end_matches(".log");
@@ -190,6 +419,14 @@ mod tests {
"Expected only digits and underscore"
);
}
// Wait a moment to ensure all files are written
std::thread::sleep(std::time::Duration::from_millis(100));
// Clean up test directory
fs::remove_dir_all(&test_dir).unwrap_or_else(|e| {
println!("Warning: Failed to clean up test directory: {}", e);
});
}
#[tokio::test]

View File

@@ -314,7 +314,10 @@ async fn main() -> Result<()> {
)
.await;
setup_logging(session.session_file().file_stem().and_then(|s| s.to_str()))?;
setup_logging(
session.session_file().file_stem().and_then(|s| s.to_str()),
None,
)?;
let _ = session.interactive(None).await;
return Ok(());
}
@@ -354,7 +357,10 @@ async fn main() -> Result<()> {
debug,
)
.await;
setup_logging(session.session_file().file_stem().and_then(|s| s.to_str()))?;
setup_logging(
session.session_file().file_stem().and_then(|s| s.to_str()),
None,
)?;
if interactive {
session.interactive(Some(contents)).await?;
@@ -431,7 +437,10 @@ async fn main() -> Result<()> {
} else {
// Run session command by default
let mut session = build_session(None, false, vec![], vec![], false).await;
setup_logging(session.session_file().file_stem().and_then(|s| s.to_str()))?;
setup_logging(
session.session_file().file_stem().and_then(|s| s.to_str()),
None,
)?;
let _ = session.interactive(None).await;
return Ok(());
}