feat: recipes can retry with success criteria (#3474)

2025-12-17 14:14:26 +01:00 · 2025-07-22 10:49:21 +10:00
parent 5f3c7d339c
commit 99cc0a9c81
17 changed files with 1078 additions and 82 deletions
--- a/crates/goose-cli/src/cli.rs
+++ b/crates/goose-cli/src/cli.rs
@@ -685,6 +685,14 @@ pub struct InputConfig {
    pub additional_system_prompt: Option<String>,
 }

+#[derive(Debug)]
+pub struct RecipeInfo { // recipe-derived values returned next to `InputConfig` for `--recipe` runs
+    pub session_settings: Option<SessionSettings>, // provider/model/temperature from the recipe
+    pub sub_recipes: Option<Vec<goose::recipe::SubRecipe>>, // forwarded to the session builder
+    pub final_output_response: Option<goose::recipe::Response>, // the recipe's `response`
+    pub retry_config: Option<goose::agents::types::RetryConfig>, // the recipe's `retry`
+}
+
 pub async fn cli() -> Result<()> {
    let cli = Cli::parse();

@@ -771,6 +779,7 @@ pub async fn cli() -> Result<()> {
                        quiet: false,
                        sub_recipes: None,
                        final_output_response: None,
+                        retry_config: None,
                    })
                    .await;
                    setup_logging(
@@ -828,27 +837,19 @@ pub async fn cli() -> Result<()> {
            provider,
            model,
        }) => {
-            let (input_config, session_settings, sub_recipes, final_output_response) = match (
-                instructions,
-                input_text,
-                recipe,
-            ) {
+            let (input_config, recipe_info) = match (instructions, input_text, recipe) {
                (Some(file), _, _) if file == "-" => {
                    let mut input = String::new();
                    std::io::stdin()
                        .read_to_string(&mut input)
                        .expect("Failed to read from stdin");

-                    (
-                        InputConfig {
+                    let input_config = InputConfig {
                        contents: Some(input),
                        extensions_override: None,
                        additional_system_prompt: system,
-                        },
-                        None,
-                        None,
-                        None,
-                    )
+                    };
+                    (input_config, None)
                }
                (Some(file), _, _) => {
                    let contents = std::fs::read_to_string(&file).unwrap_or_else(|err| {
@@ -858,27 +859,21 @@ pub async fn cli() -> Result<()> {
                        );
                        std::process::exit(1);
                    });
-                    (
-                        InputConfig {
+                    let input_config = InputConfig {
                        contents: Some(contents),
                        extensions_override: None,
                        additional_system_prompt: None,
-                        },
-                        None,
-                        None,
-                        None,
-                    )
+                    };
+                    (input_config, None)
                }
-                (_, Some(text), _) => (
-                    InputConfig {
+                (_, Some(text), _) => {
+                    let input_config = InputConfig {
                        contents: Some(text),
                        extensions_override: None,
                        additional_system_prompt: system,
-                    },
-                    None,
-                    None,
-                    None,
-                ),
+                    };
+                    (input_config, None)
+                }
                (_, _, Some(recipe_name)) => {
                    if explain {
                        explain_recipe(&recipe_name, params)?;
@@ -891,7 +886,9 @@ pub async fn cli() -> Result<()> {
                        }
                        return Ok(());
                    }
-                    extract_recipe_info_from_cli(recipe_name, params, additional_sub_recipes)?
+                    let (input_config, recipe_info) =
+                        extract_recipe_info_from_cli(recipe_name, params, additional_sub_recipes)?;
+                    (input_config, Some(recipe_info))
                }
                (None, None, None) => {
                    eprintln!("Error: Must provide either --instructions (-i), --text (-t), or --recipe. Use -i - for stdin.");
@@ -909,7 +906,9 @@ pub async fn cli() -> Result<()> {
                builtins,
                extensions_override: input_config.extensions_override,
                additional_system_prompt: input_config.additional_system_prompt,
-                settings: session_settings,
+                settings: recipe_info
+                    .as_ref()
+                    .and_then(|r| r.session_settings.clone()),
                provider,
                model,
                debug,
@@ -918,8 +917,11 @@ pub async fn cli() -> Result<()> {
                scheduled_job_id,
                interactive, // Use the interactive flag from the Run command
                quiet,
-                sub_recipes,
-                final_output_response,
+                sub_recipes: recipe_info.as_ref().and_then(|r| r.sub_recipes.clone()),
+                final_output_response: recipe_info
+                    .as_ref()
+                    .and_then(|r| r.final_output_response.clone()),
+                retry_config: recipe_info.as_ref().and_then(|r| r.retry_config.clone()),
            })
            .await;

@@ -1051,6 +1053,7 @@ pub async fn cli() -> Result<()> {
                    quiet: false,
                    sub_recipes: None,
                    final_output_response: None,
+                    retry_config: None,
                })
                .await;
                setup_logging(
--- a/crates/goose-cli/src/commands/bench.rs
+++ b/crates/goose-cli/src/commands/bench.rs
@@ -52,6 +52,7 @@ pub async fn agent_generator(
        quiet: false,
        sub_recipes: None,
        final_output_response: None,
+        retry_config: None,
    })
    .await;

--- a/crates/goose-cli/src/commands/web.rs
+++ b/crates/goose-cli/src/commands/web.rs
@@ -491,6 +491,7 @@ async fn process_message_streaming(
        schedule_id: None,
        execution_mode: None,
        max_turns: None,
+        retry_config: None,
    };

    // Get response from agent
--- a/crates/goose-cli/src/recipes/extract_from_cli.rs
+++ b/crates/goose-cli/src/recipes/extract_from_cli.rs
@@ -1,24 +1,21 @@
 use std::path::PathBuf;

 use anyhow::{anyhow, Result};
-use goose::recipe::{Response, SubRecipe};
+use goose::recipe::SubRecipe;

 use crate::recipes::print_recipe::print_recipe_info;
 use crate::recipes::recipe::load_recipe;
 use crate::recipes::search_recipe::retrieve_recipe_file;
-use crate::{cli::InputConfig, session::SessionSettings};
+use crate::{
+    cli::{InputConfig, RecipeInfo},
+    session::SessionSettings,
+};

-#[allow(clippy::type_complexity)]
 pub fn extract_recipe_info_from_cli(
    recipe_name: String,
    params: Vec<(String, String)>,
    additional_sub_recipes: Vec<String>,
-) -> Result<(
-    InputConfig,
-    Option<SessionSettings>,
-    Option<Vec<SubRecipe>>,
-    Option<Response>,
-)> {
+) -> Result<(InputConfig, RecipeInfo)> {
    let recipe = load_recipe(&recipe_name, params.clone()).unwrap_or_else(|err| {
        eprintln!("{}: {}", console::style("Error").red().bold(), err);
        std::process::exit(1);
@@ -49,20 +46,24 @@ pub fn extract_recipe_info_from_cli(
            }
        }
    }
-    Ok((
-        InputConfig {
+    let input_config = InputConfig {
        contents: recipe.prompt.filter(|s| !s.trim().is_empty()),
        extensions_override: recipe.extensions,
        additional_system_prompt: recipe.instructions,
-        },
-        recipe.settings.map(|s| SessionSettings {
+    };
+
+    let recipe_info = RecipeInfo {
+        session_settings: recipe.settings.map(|s| SessionSettings {
            goose_provider: s.goose_provider,
            goose_model: s.goose_model,
            temperature: s.temperature,
        }),
-        Some(all_sub_recipes),
-        recipe.response,
-    ))
+        sub_recipes: Some(all_sub_recipes),
+        final_output_response: recipe.response,
+        retry_config: recipe.retry,
+    };
+
+    Ok((input_config, recipe_info))
 }

 fn extract_recipe_name(recipe_identifier: &str) -> String {
@@ -93,8 +94,11 @@ mod tests {
        let params = vec![("name".to_string(), "my_value".to_string())];
        let recipe_name = recipe_path.to_str().unwrap().to_string();

-        let (input_config, settings, sub_recipes, response) =
+        let (input_config, recipe_info) =
            extract_recipe_info_from_cli(recipe_name, params, Vec::new()).unwrap();
+        let settings = recipe_info.session_settings;
+        let sub_recipes = recipe_info.sub_recipes;
+        let response = recipe_info.final_output_response;

        assert_eq!(input_config.contents, Some("test_prompt".to_string()));
        assert_eq!(
@@ -149,8 +153,11 @@ mod tests {
            sub_recipe2_path.to_string_lossy().to_string(),
        ];

-        let (input_config, settings, sub_recipes, response) =
+        let (input_config, recipe_info) =
            extract_recipe_info_from_cli(recipe_name, params, additional_sub_recipes).unwrap();
+        let settings = recipe_info.session_settings;
+        let sub_recipes = recipe_info.sub_recipes;
+        let response = recipe_info.final_output_response;

        assert_eq!(input_config.contents, Some("test_prompt".to_string()));
        assert_eq!(
--- a/crates/goose-cli/src/session/builder.rs
+++ b/crates/goose-cli/src/session/builder.rs
@@ -1,5 +1,6 @@
 use console::style;
 use goose::agents::extension::ExtensionError;
+use goose::agents::types::RetryConfig;
 use goose::agents::Agent;
 use goose::config::{Config, ExtensionConfig, ExtensionConfigManager};
 use goose::providers::create;
@@ -60,6 +61,8 @@ pub struct SessionBuilderConfig {
    pub sub_recipes: Option<Vec<SubRecipe>>,
    /// Final output expected response
    pub final_output_response: Option<Response>,
+    /// Retry configuration for automated validation and recovery
+    pub retry_config: Option<RetryConfig>,
 }

 /// Offers to help debug an extension failure by creating a minimal debugging session
@@ -138,6 +141,7 @@ async fn offer_extension_debugging_help(
        None,
        None,
        None,
+        None,
    );

    // Process the debugging request
@@ -407,6 +411,7 @@ pub async fn build_session(session_config: SessionBuilderConfig) -> Session {
        session_config.scheduled_job_id.clone(),
        session_config.max_turns,
        edit_mode,
+        session_config.retry_config.clone(),
    );

    // Add extensions if provided
@@ -602,6 +607,7 @@ mod tests {
            quiet: false,
            sub_recipes: None,
            final_output_response: None,
+            retry_config: None,
        };

        assert_eq!(config.extensions.len(), 1);
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -28,6 +28,7 @@ use anyhow::{Context, Result};
 use completion::GooseCompleter;
 use etcetera::{choose_app_strategy, AppStrategy};
 use goose::agents::extension::{Envs, ExtensionConfig};
+use goose::agents::types::RetryConfig;
 use goose::agents::{Agent, SessionConfig};
 use goose::config::Config;
 use goose::message::{Message, MessageContent};
@@ -64,6 +65,7 @@ pub struct Session {
    scheduled_job_id: Option<String>, // ID of the scheduled job that triggered this session
    max_turns: Option<u32>,
    edit_mode: Option<EditMode>,
+    retry_config: Option<RetryConfig>,
 }

 // Cache structure for completion data
@@ -127,6 +129,7 @@ impl Session {
        scheduled_job_id: Option<String>,
        max_turns: Option<u32>,
        edit_mode: Option<EditMode>,
+        retry_config: Option<RetryConfig>,
    ) -> Self {
        let messages = if let Some(session_file) = &session_file {
            match session::read_messages(session_file) {
@@ -151,6 +154,7 @@ impl Session {
            scheduled_job_id,
            max_turns,
            edit_mode,
+            retry_config,
        }
    }

@@ -879,6 +883,7 @@ impl Session {
                schedule_id: self.scheduled_job_id.clone(),
                execution_mode: None,
                max_turns: self.max_turns,
+                retry_config: self.retry_config.clone(),
            }
        });
        let mut stream = self
--- a/crates/goose-server/src/routes/reply.rs
+++ b/crates/goose-server/src/routes/reply.rs
@@ -186,6 +186,7 @@ async fn handler(
                    schedule_id: request.scheduled_job_id.clone(),
                    execution_mode: None,
                    max_turns: None,
+                    retry_config: None,
                }),
            )
            .await
@@ -368,6 +369,7 @@ async fn ask_handler(
                schedule_id: request.scheduled_job_id.clone(),
                execution_mode: None,
                max_turns: None,
+                retry_config: None,
            }),
        )
        .await
--- a/crates/goose/src/agents/agent.rs
+++ b/crates/goose/src/agents/agent.rs
@@ -29,7 +29,7 @@ use crate::tool_monitor::{ToolCall, ToolMonitor};
 use regex::Regex;
 use serde_json::Value;
 use tokio::sync::{mpsc, Mutex, RwLock};
-use tracing::{debug, error, instrument};
+use tracing::{debug, error, info, instrument};

 use crate::agents::extension::{ExtensionConfig, ExtensionError, ExtensionResult, ToolInfo};
 use crate::agents::extension_manager::{get_parameter_names, ExtensionManager};
@@ -39,6 +39,7 @@ use crate::agents::platform_tools::{
    PLATFORM_SEARCH_AVAILABLE_EXTENSIONS_TOOL_NAME,
 };
 use crate::agents::prompt_manager::PromptManager;
+use crate::agents::retry::{RetryManager, RetryResult};
 use crate::agents::router_tool_selector::{
    create_tool_selector, RouterToolSelectionStrategy, RouterToolSelector,
 };
@@ -64,7 +65,7 @@ pub struct Agent {
    pub(super) extension_manager: Arc<RwLock<ExtensionManager>>,
    pub(super) sub_recipe_manager: Mutex<SubRecipeManager>,
    pub(super) tasks_manager: TasksManager,
-    pub(super) final_output_tool: Mutex<Option<FinalOutputTool>>,
+    pub(super) final_output_tool: Arc<Mutex<Option<FinalOutputTool>>>,
    pub(super) frontend_tools: Mutex<HashMap<String, FrontendTool>>,
    pub(super) frontend_instructions: Mutex<Option<String>>,
    pub(super) prompt_manager: Mutex<PromptManager>,
@@ -72,11 +73,12 @@ pub struct Agent {
    pub(super) confirmation_rx: Mutex<mpsc::Receiver<(String, PermissionConfirmation)>>,
    pub(super) tool_result_tx: mpsc::Sender<(String, ToolResult<Vec<Content>>)>,
    pub(super) tool_result_rx: ToolResultReceiver,
-    pub(super) tool_monitor: Mutex<Option<ToolMonitor>>,
+    pub(super) tool_monitor: Arc<Mutex<Option<ToolMonitor>>>,
    pub(super) router_tool_selector: Mutex<Option<Arc<Box<dyn RouterToolSelector>>>>,
    pub(super) scheduler_service: Mutex<Option<Arc<dyn SchedulerTrait>>>,
    pub(super) mcp_tx: Mutex<mpsc::Sender<JsonRpcMessage>>,
    pub(super) mcp_notification_rx: Arc<Mutex<mpsc::Receiver<JsonRpcMessage>>>,
+    pub(super) retry_manager: RetryManager,
 }

 #[derive(Clone, Debug)]
@@ -134,12 +136,15 @@ impl Agent {
        // Add MCP notification channel
        let (mcp_tx, mcp_rx) = mpsc::channel(100);

+        let tool_monitor = Arc::new(Mutex::new(None));
+        let retry_manager = RetryManager::with_tool_monitor(tool_monitor.clone());
+
        Self {
            provider: Mutex::new(None),
            extension_manager: Arc::new(RwLock::new(ExtensionManager::new())),
            sub_recipe_manager: Mutex::new(SubRecipeManager::new()),
            tasks_manager: TasksManager::new(),
-            final_output_tool: Mutex::new(None),
+            final_output_tool: Arc::new(Mutex::new(None)),
            frontend_tools: Mutex::new(HashMap::new()),
            frontend_instructions: Mutex::new(None),
            prompt_manager: Mutex::new(PromptManager::new()),
@@ -147,12 +152,13 @@ impl Agent {
            confirmation_rx: Mutex::new(confirm_rx),
            tool_result_tx: tool_tx,
            tool_result_rx: Arc::new(Mutex::new(tool_rx)),
-            tool_monitor: Mutex::new(None),
+            tool_monitor,
            router_tool_selector: Mutex::new(None),
            scheduler_service: Mutex::new(None),
            // Initialize with MCP notification support
            mcp_tx: Mutex::new(mcp_tx),
            mcp_notification_rx: Arc::new(Mutex::new(mcp_rx)),
+            retry_manager,
        }
    }

@@ -172,6 +178,41 @@ impl Agent {
        }
    }

+    /// Reset the retry attempts counter to 0 (the retry manager also resets the tool monitor, if set)
+    pub async fn reset_retry_attempts(&self) {
+        self.retry_manager.reset_attempts().await;
+    }
+
+    /// Increment the retry attempts counter and return the new value (delegates to `retry_manager`)
+    pub async fn increment_retry_attempts(&self) -> u32 {
+        self.retry_manager.increment_attempts().await
+    }
+
+    /// Get the current retry attempts count, i.e. retries performed since the last reset
+    pub async fn get_retry_attempts(&self) -> u32 {
+        self.retry_manager.get_attempts().await
+    }
+
+    /// Ask the retry manager whether the reply loop must run again; `Ok(true)` only when it retried
+    async fn handle_retry_logic(
+        &self,
+        messages: &mut Vec<Message>,
+        session: &Option<SessionConfig>,
+        initial_messages: &[Message],
+    ) -> Result<bool> {
+        let result = self
+            .retry_manager
+            .handle_retry_logic(messages, session, initial_messages, &self.final_output_tool)
+            .await?; // errors from the success checks or the on_failure command propagate
+
+        match result {
+            RetryResult::Retried => Ok(true), // `messages` was reset to `initial_messages`
+            RetryResult::Skipped
+            | RetryResult::MaxAttemptsReached
+            | RetryResult::SuccessChecksPassed => Ok(false),
+        }
+    }
+
    /// Set the scheduler service for this agent
    pub async fn set_scheduler(&self, scheduler: Arc<dyn SchedulerTrait>) {
        let mut scheduler_service = self.scheduler_service.lock().await;
@@ -680,8 +721,11 @@ impl Agent {
        session: Option<SessionConfig>,
    ) -> anyhow::Result<BoxStream<'_, anyhow::Result<AgentEvent>>> {
        let mut messages = messages.to_vec();
+        let initial_messages = messages.clone();
        let reply_span = tracing::Span::current();

+        self.reset_retry_attempts().await;
+
        // Load settings from config
        let config = Config::global();

@@ -1040,6 +1084,22 @@ impl Agent {
                            yield AgentEvent::Message(message);
                        }
                    }
+
+                    match self.handle_retry_logic(&mut messages, &session, &initial_messages).await {
+                        Ok(should_retry) => {
+                            if should_retry {
+                                info!("Retry logic triggered, restarting agent loop");
+                                continue;
+                            }
+                        }
+                        Err(e) => {
+                            error!("Retry logic failed: {}", e);
+                            yield AgentEvent::Message(Message::assistant().with_text(
+                                format!("Retry logic encountered an error: {}", e)
+                            ));
+                        }
+                    }
+
                    break;
                }

--- a/crates/goose/src/agents/mod.rs
+++ b/crates/goose/src/agents/mod.rs
@@ -8,6 +8,7 @@ pub mod platform_tools;
 pub mod prompt_manager;
 mod recipe_tools;
 mod reply_parts;
+pub mod retry;
 mod router_tool_selector;
 mod router_tools;
 mod schedule_tool;
@@ -19,7 +20,7 @@ mod subagent_task_config;
 mod tool_execution;
 mod tool_router_index_manager;
 pub(crate) mod tool_vectordb;
-mod types;
+pub mod types;

 pub use agent::{Agent, AgentEvent};
 pub use extension::ExtensionConfig;
@@ -27,4 +28,4 @@ pub use extension_manager::ExtensionManager;
 pub use prompt_manager::PromptManager;
 pub use subagent::{SubAgent, SubAgentProgress, SubAgentStatus};
 pub use subagent_task_config::TaskConfig;
-pub use types::{FrontendTool, SessionConfig};
+pub use types::{FrontendTool, RetryConfig, SessionConfig, SuccessCheck};
--- a/crates/goose/src/agents/retry.rs
+++ b/crates/goose/src/agents/retry.rs
@@ -0,0 +1,498 @@
+use anyhow::Result;
+use std::process::Stdio;
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::process::Command;
+use tokio::sync::Mutex;
+use tracing::{debug, info, warn};
+
+use crate::agents::types::SessionConfig;
+use crate::agents::types::{
+    RetryConfig, SuccessCheck, DEFAULT_ON_FAILURE_TIMEOUT_SECONDS, DEFAULT_RETRY_TIMEOUT_SECONDS,
+};
+use crate::config::Config;
+use crate::message::Message;
+use crate::tool_monitor::ToolMonitor;
+
+/// Outcome of one `RetryManager::handle_retry_logic` evaluation
+#[derive(Debug, Clone, PartialEq)]
+pub enum RetryResult {
+    /// The session or its `retry_config` is `None`, so no checks were run
+    Skipped,
+    /// Checks failed, but the attempt count already equals or exceeds `max_retries`
+    MaxAttemptsReached,
+    /// Every success check passed (also the case when there are no checks); no retry needed
+    SuccessChecksPassed,
+    /// Checks failed; state was reset and the attempt counter incremented for another run
+    Retried,
+}
+
+/// Environment variable for the global success-check timeout in seconds (fallback for `timeout_seconds`)
+const GOOSE_RECIPE_RETRY_TIMEOUT_SECONDS: &str = "GOOSE_RECIPE_RETRY_TIMEOUT_SECONDS";
+
+/// Environment variable for the global on_failure timeout in seconds (fallback for `on_failure_timeout_seconds`)
+const GOOSE_RECIPE_ON_FAILURE_TIMEOUT_SECONDS: &str = "GOOSE_RECIPE_ON_FAILURE_TIMEOUT_SECONDS";
+
+/// Tracks retry attempts for agent execution and drives the check / on_failure / reset cycle
+#[derive(Debug)]
+pub struct RetryManager {
+    /// Number of retries performed so far; compared against `RetryConfig::max_retries`
+    attempts: Arc<Mutex<u32>>,
+    /// Optional shared tool monitor slot; when present it is reset by `reset_attempts`
+    tool_monitor: Option<Arc<Mutex<Option<ToolMonitor>>>>,
+}
+
+impl Default for RetryManager {
+    fn default() -> Self {
+        Self::new() // zero attempts, no tool monitor
+    }
+}
+
+impl RetryManager {
+    /// Create a retry manager with zero attempts and no tool monitor attached
+    pub fn new() -> Self {
+        Self {
+            attempts: Arc::new(Mutex::new(0)),
+            tool_monitor: None,
+        }
+    }
+
+    /// Create a retry manager with zero attempts that resets `tool_monitor` in `reset_attempts`
+    pub fn with_tool_monitor(tool_monitor: Arc<Mutex<Option<ToolMonitor>>>) -> Self {
+        Self {
+            attempts: Arc::new(Mutex::new(0)),
+            tool_monitor: Some(tool_monitor),
+        }
+    }
+
+    /// Reset the retry attempts counter to 0 and reset the tool monitor, if one is present
+    pub async fn reset_attempts(&self) {
+        // Temporary guard: the counter lock is released before the tool monitor lock is awaited
+        *self.attempts.lock().await = 0;
+
+        // Reset tool monitor if available
+        if let Some(monitor) = &self.tool_monitor {
+            if let Some(monitor) = monitor.lock().await.as_mut() {
+                monitor.reset();
+            }
+        }
+    }
+
+    /// Increment the retry attempts counter and return the new value
+    pub async fn increment_attempts(&self) -> u32 {
+        let mut attempts = self.attempts.lock().await;
+        *attempts += 1;
+        *attempts
+    }
+
+    /// Get the current retry attempts count
+    pub async fn get_attempts(&self) -> u32 {
+        *self.attempts.lock().await
+    }
+
+    /// Reset status for retry: restore `messages` to `initial_messages` and clear any final output
+    async fn reset_status_for_retry(
+        messages: &mut Vec<Message>,
+        initial_messages: &[Message],
+        final_output_tool: &Arc<Mutex<Option<crate::agents::final_output_tool::FinalOutputTool>>>,
+    ) {
+        messages.clear();
+        messages.extend_from_slice(initial_messages);
+        info!("Reset message history to initial state for retry");
+
+        if let Some(final_output_tool) = final_output_tool.lock().await.as_mut() {
+            final_output_tool.final_output = None;
+            info!("Cleared final output tool state for retry");
+        }
+    }
+
+    /// Decide whether the reply loop must run again (see `RetryResult` for the four outcomes)
+    pub async fn handle_retry_logic(
+        &self,
+        messages: &mut Vec<Message>,
+        session: &Option<SessionConfig>,
+        initial_messages: &[Message],
+        final_output_tool: &Arc<Mutex<Option<crate::agents::final_output_tool::FinalOutputTool>>>,
+    ) -> Result<RetryResult> {
+        let Some(session_config) = session else {
+            return Ok(RetryResult::Skipped);
+        };
+
+        let Some(retry_config) = &session_config.retry_config else {
+            return Ok(RetryResult::Skipped);
+        };
+        // A check that cannot run or times out is returned as an error, not as a failed check
+        let success = execute_success_checks(&retry_config.checks, retry_config).await?;
+
+        if success {
+            info!("All success checks passed, no retry needed");
+            return Ok(RetryResult::SuccessChecksPassed);
+        }
+
+        let current_attempts = self.get_attempts().await;
+        if current_attempts >= retry_config.max_retries {
+            let error_msg = Message::assistant().with_text(format!(
+                "Maximum retry attempts ({}) exceeded. Unable to complete the task successfully.",
+                retry_config.max_retries
+            ));
+            messages.push(error_msg);
+            warn!(
+                "Maximum retry attempts ({}) exceeded",
+                retry_config.max_retries
+            );
+            return Ok(RetryResult::MaxAttemptsReached);
+        }
+        // An on_failure error aborts the retry: nothing is reset and the counter is not incremented
+        if let Some(on_failure_cmd) = &retry_config.on_failure {
+            info!("Executing on_failure command: {}", on_failure_cmd);
+            execute_on_failure_command(on_failure_cmd, retry_config).await?;
+        }
+
+        Self::reset_status_for_retry(messages, initial_messages, final_output_tool).await;
+
+        let new_attempts = self.increment_attempts().await;
+        info!("Incrementing retry attempts to {}", new_attempts);
+
+        Ok(RetryResult::Retried)
+    }
+}
+
+/// Resolve how long a single success-check command may run
+/// Precedence: retry_config.timeout_seconds -> env var -> default
+fn get_retry_timeout(retry_config: &RetryConfig) -> Duration {
+    let secs = match retry_config.timeout_seconds {
+        // An explicit per-recipe value wins and skips the config lookup entirely
+        Some(explicit) => explicit,
+        None => Config::global()
+            .get_param(GOOSE_RECIPE_RETRY_TIMEOUT_SECONDS)
+            .unwrap_or(DEFAULT_RETRY_TIMEOUT_SECONDS),
+    };
+
+    Duration::from_secs(secs)
+}
+
+/// Resolve how long the on_failure command may run
+/// Precedence: retry_config.on_failure_timeout_seconds -> env var -> default
+fn get_on_failure_timeout(retry_config: &RetryConfig) -> Duration {
+    // An explicit per-recipe value wins and skips the config lookup entirely
+    if let Some(explicit) = retry_config.on_failure_timeout_seconds {
+        return Duration::from_secs(explicit);
+    }
+
+    // Any error from the lookup falls back to the built-in default
+    let fallback = Config::global()
+        .get_param(GOOSE_RECIPE_ON_FAILURE_TIMEOUT_SECONDS)
+        .unwrap_or(DEFAULT_ON_FAILURE_TIMEOUT_SECONDS);
+
+    Duration::from_secs(fallback)
+}
+
+/// Run `checks` in order: `Ok(false)` at the first failing command, `Err` if one cannot run or times out
+pub async fn execute_success_checks(
+    checks: &[SuccessCheck],
+    retry_config: &RetryConfig,
+) -> Result<bool> {
+    let timeout = get_retry_timeout(retry_config);
+    // The timeout applies to each check separately, not to the sequence as a whole
+    for check in checks {
+        match check {
+            SuccessCheck::Shell { command } => {
+                let result = execute_shell_command(command, timeout).await?;
+                if !result.status.success() {
+                    warn!(
+                        "Success check failed: command '{}' exited with status {}, stderr: {}",
+                        command,
+                        result.status,
+                        String::from_utf8_lossy(&result.stderr)
+                    );
+                    return Ok(false); // short-circuit: later checks are not run
+                }
+                info!(
+                    "Success check passed: command '{}' completed successfully",
+                    command
+                );
+            }
+        }
+    }
+    Ok(true) // also the result when `checks` is empty
+}
+
+/// Execute a shell command (`cmd /C` on Windows, `sh -c` elsewhere) under a mandatory timeout
+pub async fn execute_shell_command(
+    command: &str,
+    timeout: std::time::Duration,
+) -> Result<std::process::Output> {
+    debug!(
+        "Executing shell command with timeout {:?}: {}",
+        timeout, command
+    );
+    // A non-zero exit is still `Ok(output)`; `Err` means spawn/wait failed or the timeout elapsed
+    let future = async {
+        let mut cmd = if cfg!(target_os = "windows") {
+            let mut cmd = Command::new("cmd");
+            cmd.args(["/C", command]);
+            cmd
+        } else {
+            let mut cmd = Command::new("sh");
+            cmd.args(["-c", command]);
+            cmd
+        };
+
+        let output = cmd
+            .stdout(Stdio::piped())
+            .stderr(Stdio::piped())
+            .stdin(Stdio::null())
+            .kill_on_drop(true) // dropping this future (e.g. on timeout) kills the child
+            .output()
+            .await?;
+
+        debug!(
+            "Shell command completed with status: {}, stdout: {}, stderr: {}",
+            output.status,
+            String::from_utf8_lossy(&output.stdout),
+            String::from_utf8_lossy(&output.stderr)
+        );
+
+        Ok(output)
+    };
+    // On timeout, `Elapsed` is kept as the error source so callers can detect it by type
+    match tokio::time::timeout(timeout, future).await {
+        Ok(result) => result,
+        Err(elapsed) => {
+            let error_msg = format!("Shell command timed out after {:?}: {}", timeout, command);
+            warn!("{}", error_msg);
+            Err(anyhow::Error::new(elapsed).context(error_msg))
+        }
+    }
+}
+
+/// Execute an on_failure command; `Err` if it cannot run, times out, or exits with non-zero status
+pub async fn execute_on_failure_command(command: &str, retry_config: &RetryConfig) -> Result<()> {
+    let timeout = get_on_failure_timeout(retry_config);
+    info!(
+        "Executing on_failure command with timeout {:?}: {}",
+        timeout, command
+    );
+
+    let output = match execute_shell_command(command, timeout).await {
+        Ok(output) => output,
+        Err(e) => {
+            if e.is::<tokio::time::error::Elapsed>() {
+                let error_msg = format!(
+                    "On_failure command timed out after {:?}: {}",
+                    timeout, command
+                );
+                warn!("{}", error_msg);
+                return Err(anyhow::anyhow!(error_msg));
+            } else {
+                warn!("On_failure command execution error: {}", e);
+                return Err(e);
+            }
+        }
+    };
+
+    if !output.status.success() {
+        let error_msg = format!(
+            "On_failure command failed: command '{}' exited with status {}, stderr: {}",
+            command,
+            output.status,
+            String::from_utf8_lossy(&output.stderr)
+        );
+        warn!("{}", error_msg);
+        return Err(anyhow::anyhow!(error_msg));
+    } else {
+        info!("On_failure command completed successfully: {}", command);
+    }
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::agents::types::SuccessCheck;
+
+    /// Config with no checks and no on_failure command; only the two timeout overrides
+    /// vary, so each timeout test spells out just the values it depends on.
+    fn retry_config_with_timeouts(
+        timeout_seconds: Option<u64>,
+        on_failure_timeout_seconds: Option<u64>,
+    ) -> RetryConfig {
+        RetryConfig {
+            max_retries: 1,
+            checks: vec![],
+            on_failure: None,
+            timeout_seconds,
+            on_failure_timeout_seconds,
+        }
+    }
+
+    /// Config shared by the command-execution tests.
+    fn create_test_retry_config() -> RetryConfig {
+        RetryConfig {
+            max_retries: 3,
+            checks: vec![],
+            on_failure: None,
+            timeout_seconds: Some(60),
+            on_failure_timeout_seconds: Some(120),
+        }
+    }
+
+    /// Shorthand for a shell success check that runs `command`.
+    fn shell_check(command: &str) -> SuccessCheck {
+        SuccessCheck::Shell {
+            command: command.to_string(),
+        }
+    }
+
+    #[test]
+    fn test_retry_result_enum() {
+        // Every pair of distinct variants must compare unequal.
+        let variants = [
+            RetryResult::Skipped,
+            RetryResult::MaxAttemptsReached,
+            RetryResult::SuccessChecksPassed,
+            RetryResult::Retried,
+        ];
+        for (index, left) in variants.iter().enumerate() {
+            for right in &variants[index + 1..] {
+                assert_ne!(left, right);
+            }
+        }
+
+        let result = RetryResult::Retried;
+        let cloned = result.clone();
+        assert_eq!(result, cloned);
+
+        let debug_str = format!("{:?}", RetryResult::MaxAttemptsReached);
+        assert!(debug_str.contains("MaxAttemptsReached"));
+    }
+
+    #[tokio::test]
+    async fn test_execute_success_checks_all_pass() {
+        let checks = vec![shell_check("echo 'test'"), shell_check("true")];
+        let retry_config = create_test_retry_config();
+
+        let result = execute_success_checks(&checks, &retry_config).await;
+        assert!(result.is_ok());
+        assert!(result.unwrap());
+    }
+
+    #[tokio::test]
+    async fn test_execute_success_checks_one_fails() {
+        let checks = vec![shell_check("echo 'test'"), shell_check("false")];
+        let retry_config = create_test_retry_config();
+
+        // A failing check is reported as Ok(false), not as an execution error.
+        let result = execute_success_checks(&checks, &retry_config).await;
+        assert!(result.is_ok());
+        assert!(!result.unwrap());
+    }
+
+    #[tokio::test]
+    async fn test_execute_shell_command_success() {
+        let result = execute_shell_command("echo 'hello world'", Duration::from_secs(30)).await;
+        assert!(result.is_ok());
+        let output = result.unwrap();
+        assert!(output.status.success());
+        assert!(String::from_utf8_lossy(&output.stdout).contains("hello world"));
+    }
+
+    #[tokio::test]
+    async fn test_execute_shell_command_failure() {
+        // A non-zero exit status is still a successfully executed command.
+        let result = execute_shell_command("false", Duration::from_secs(30)).await;
+        assert!(result.is_ok());
+        let output = result.unwrap();
+        assert!(!output.status.success());
+    }
+
+    #[tokio::test]
+    async fn test_execute_on_failure_command_success() {
+        let retry_config = create_test_retry_config();
+        let result = execute_on_failure_command("echo 'cleanup'", &retry_config).await;
+        assert!(result.is_ok());
+    }
+
+    #[tokio::test]
+    async fn test_execute_on_failure_command_failure() {
+        let retry_config = create_test_retry_config();
+        let result = execute_on_failure_command("false", &retry_config).await;
+        assert!(result.is_err());
+    }
+
+    #[tokio::test]
+    async fn test_shell_command_timeout() {
+        // The command runs for about a second while the timeout is 100 ms.
+        let timeout = Duration::from_millis(100);
+        let command = if cfg!(target_os = "windows") {
+            "timeout /t 1"
+        } else {
+            "sleep 1"
+        };
+
+        let result = execute_shell_command(command, timeout).await;
+        assert!(result.is_err());
+    }
+
+    // The timeout getters are called synchronously, so the tests below are plain
+    // `#[test]`s and do not start an async runtime.
+
+    #[test]
+    fn test_get_retry_timeout_uses_config_default() {
+        let retry_config = retry_config_with_timeouts(None, None);
+
+        let timeout = get_retry_timeout(&retry_config);
+        assert_eq!(timeout, Duration::from_secs(DEFAULT_RETRY_TIMEOUT_SECONDS));
+    }
+
+    #[test]
+    fn test_get_retry_timeout_uses_retry_config() {
+        let retry_config = retry_config_with_timeouts(Some(120), None);
+
+        let timeout = get_retry_timeout(&retry_config);
+        assert_eq!(timeout, Duration::from_secs(120));
+    }
+
+    #[test]
+    fn test_get_on_failure_timeout_uses_config_default() {
+        let retry_config = retry_config_with_timeouts(None, None);
+
+        let timeout = get_on_failure_timeout(&retry_config);
+        assert_eq!(
+            timeout,
+            Duration::from_secs(DEFAULT_ON_FAILURE_TIMEOUT_SECONDS)
+        );
+    }
+
+    #[test]
+    fn test_get_on_failure_timeout_uses_retry_config() {
+        let retry_config = retry_config_with_timeouts(None, Some(900));
+
+        let timeout = get_on_failure_timeout(&retry_config);
+        assert_eq!(timeout, Duration::from_secs(900));
+    }
+
+    #[test]
+    fn test_on_failure_timeout_different_from_retry_timeout() {
+        let retry_config = retry_config_with_timeouts(Some(60), Some(300));
+
+        let retry_timeout = get_retry_timeout(&retry_config);
+        let on_failure_timeout = get_on_failure_timeout(&retry_config);
+
+        assert_eq!(retry_timeout, Duration::from_secs(60));
+        assert_eq!(on_failure_timeout, Duration::from_secs(300));
+        assert_ne!(retry_timeout, on_failure_timeout);
+    }
+}
--- a/crates/goose/src/agents/types.rs
+++ b/crates/goose/src/agents/types.rs
@@ -9,6 +9,67 @@ use tokio::sync::{mpsc, Mutex};
 /// Type alias for the tool result channel receiver
 pub type ToolResultReceiver = Arc<Mutex<mpsc::Receiver<(String, ToolResult<Vec<Content>>)>>>;

+/// Default timeout for retry operations, in seconds (300 s = 5 minutes).
+/// This is the documented default for `RetryConfig::timeout_seconds`.
+pub const DEFAULT_RETRY_TIMEOUT_SECONDS: u64 = 300;
+
+/// Default timeout for on_failure operations, in seconds (600 s = 10 minutes - longer
+/// for on_failure tasks).
+/// This is the documented default for `RetryConfig::on_failure_timeout_seconds`.
+pub const DEFAULT_ON_FAILURE_TIMEOUT_SECONDS: u64 = 600;
+
+/// Configuration for retry logic in recipe execution
+///
+/// `max_retries` and `checks` have no serde default, so both must be present when this
+/// struct is deserialized. The three optional fields are left out of serialized output
+/// when they are `None`. Use [`RetryConfig::validate`] to reject zero values.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RetryConfig {
+    /// Maximum number of retry attempts before giving up (validation rejects 0)
+    pub max_retries: u32,
+    /// List of success checks to validate recipe completion
+    pub checks: Vec<SuccessCheck>,
+    /// Optional shell command to run on failure for cleanup
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub on_failure: Option<String>,
+    /// Timeout in seconds for individual shell commands (default: 300 seconds);
+    /// validation rejects `Some(0)`
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub timeout_seconds: Option<u64>,
+    /// Timeout in seconds for on_failure commands (default: 600 seconds);
+    /// validation rejects `Some(0)`
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub on_failure_timeout_seconds: Option<u64>,
+}
+
+impl RetryConfig {
+    /// Checks that every numeric setting holds a usable value.
+    ///
+    /// # Errors
+    ///
+    /// Returns a message for the first rule that is violated, tested in this order:
+    /// `max_retries` is 0, `timeout_seconds` is `Some(0)`,
+    /// `on_failure_timeout_seconds` is `Some(0)`. Timeouts left as `None` are accepted.
+    pub fn validate(&self) -> Result<(), String> {
+        let problem = if self.max_retries == 0 {
+            Some("max_retries must be greater than 0")
+        } else if self.timeout_seconds == Some(0) {
+            Some("timeout_seconds must be greater than 0 if specified")
+        } else if self.on_failure_timeout_seconds == Some(0) {
+            Some("on_failure_timeout_seconds must be greater than 0 if specified")
+        } else {
+            None
+        };
+
+        match problem {
+            Some(message) => Err(message.to_string()),
+            None => Ok(()),
+        }
+    }
+}
+
+/// A single success check to validate recipe completion
+///
+/// Internally tagged: the `type` key selects the variant and the variant's fields sit
+/// next to it in the same map, e.g. `type: shell` plus `command: "true"`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type")]
+pub enum SuccessCheck {
+    /// Execute a shell command and check its exit status
+    // `alias` only widens deserialization: both "Shell" and "shell" are accepted as
+    // the tag value, while serialization still writes the variant name "Shell".
+    #[serde(alias = "shell")]
+    Shell {
+        /// The shell command to execute
+        command: String,
+    },
+}
+
 /// A frontend tool that will be executed by the frontend rather than an extension
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct FrontendTool {
@@ -29,4 +90,7 @@ pub struct SessionConfig {
    pub execution_mode: Option<String>,
    /// Maximum number of turns (iterations) allowed without user input
    pub max_turns: Option<u32>,
+    /// Retry configuration for automated validation and recovery
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub retry_config: Option<RetryConfig>,
 }
--- a/crates/goose/src/providers/pricing.rs
+++ b/crates/goose/src/providers/pricing.rs
@@ -231,12 +231,16 @@ impl Default for PricingCache {
 // Global cache instance
 lazy_static::lazy_static! {
    static ref PRICING_CACHE: PricingCache = PricingCache::new();
-    static ref HTTP_CLIENT: Client = Client::builder()
+}
+
+/// Create a properly configured HTTP client for the current runtime
+///
+/// Builds a new `Client` on every call, configured with a 30 second timeout, a
+/// 90 second pool idle timeout and at most 10 idle pooled connections per host.
+///
+/// NOTE(review): a client built per call cannot reuse connections pooled by an earlier
+/// call; presumably this replaced the process-wide static so that the client is not
+/// tied to a runtime that has already shut down -- confirm.
+///
+/// # Panics
+///
+/// Panics with "Failed to create HTTP client" if the builder returns an error.
+fn create_http_client() -> Client {
+    Client::builder()
        .timeout(Duration::from_secs(30))
        .pool_idle_timeout(Duration::from_secs(90))
        .pool_max_idle_per_host(10)
        .build()
-        .unwrap();
+        .expect("Failed to create HTTP client")
 }

 /// OpenRouter model pricing information
@@ -270,7 +274,8 @@ pub struct OpenRouterModelsResponse {

 /// Internal function to fetch pricing data
 async fn fetch_openrouter_pricing_internal() -> Result<HashMap<String, OpenRouterModel>> {
-    let response = HTTP_CLIENT
+    let client = create_http_client();
+    let response = client
        .get("https://openrouter.ai/api/v1/models")
        .send()
        .await?;
--- a/crates/goose/src/recipe/mod.rs
+++ b/crates/goose/src/recipe/mod.rs
@@ -4,6 +4,7 @@ use std::collections::HashMap;
 use std::fmt;

 use crate::agents::extension::ExtensionConfig;
+use crate::agents::types::RetryConfig;
 use serde::de::Deserializer;
 use serde::{Deserialize, Serialize};
 use utoipa::ToSchema;
@@ -37,7 +38,7 @@ fn default_version() -> String {
 /// * `author` - Information about the Recipe's creator and metadata
 /// * `parameters` - Additional parameters for the Recipe
 /// * `response` - Response configuration including JSON schema validation
-///
+/// * `retry` - Retry configuration for automated validation and recovery
 /// # Example
 ///
 ///
@@ -66,6 +67,7 @@ fn default_version() -> String {
 ///     parameters: None,
 ///     response: None,
 ///     sub_recipes: None,
+///     retry: None,
 /// };
 ///
 #[derive(Serialize, Deserialize, Debug, Clone, ToSchema)]
@@ -109,6 +111,9 @@ pub struct Recipe {

    #[serde(skip_serializing_if = "Option::is_none")]
    pub sub_recipes: Option<Vec<SubRecipe>>, // sub-recipes for the recipe
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub retry: Option<RetryConfig>,
 }

 #[derive(Serialize, Deserialize, Debug, Clone, ToSchema)]
@@ -239,6 +244,7 @@ pub struct RecipeBuilder {
    parameters: Option<Vec<RecipeParameter>>,
    response: Option<Response>,
    sub_recipes: Option<Vec<SubRecipe>>,
+    retry: Option<RetryConfig>,
 }

 impl Recipe {
@@ -271,27 +277,40 @@ impl Recipe {
            parameters: None,
            response: None,
            sub_recipes: None,
+            retry: None,
        }
    }
+    /// Parses a recipe from JSON or YAML text.
+    ///
+    /// `content` is tried as JSON first and as YAML second. In either format, when the
+    /// top-level value has a `recipe` key, that nested value is deserialized as the
+    /// recipe; otherwise the whole document is.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when `content` parses as neither JSON nor YAML, when
+    /// deserializing into `Recipe` fails, or when a `retry` section is present and
+    /// `RetryConfig::validate` rejects it.
    pub fn from_content(content: &str) -> Result<Self> {
+        let recipe: Recipe =
            if let Ok(json_value) = serde_json::from_str::<serde_json::Value>(content) {
                if let Some(nested_recipe) = json_value.get("recipe") {
-                Ok(serde_json::from_value(nested_recipe.clone())?)
+                    serde_json::from_value(nested_recipe.clone())?
                } else {
-                Ok(serde_json::from_str(content)?)
+                    serde_json::from_str(content)?
                }
            } else if let Ok(yaml_value) = serde_yaml::from_str::<serde_yaml::Value>(content) {
                if let Some(nested_recipe) = yaml_value.get("recipe") {
-                Ok(serde_yaml::from_value(nested_recipe.clone())?)
+                    serde_yaml::from_value(nested_recipe.clone())?
                } else {
-                Ok(serde_yaml::from_str(content)?)
+                    serde_yaml::from_str(content)?
                }
            } else {
-            Err(anyhow::anyhow!(
+                return Err(anyhow::anyhow!(
                    "Unsupported format. Expected JSON or YAML."
-            ))
+                ));
+            };
+
+        if let Some(ref retry_config) = recipe.retry {
+            if let Err(validation_error) = retry_config.validate() {
+                return Err(anyhow::anyhow!(
+                    "Invalid retry configuration: {}",
+                    validation_error
+                ));
            }
        }
+
+        Ok(recipe)
+    }
 }

 impl RecipeBuilder {
@@ -369,6 +388,12 @@ impl RecipeBuilder {
        self
    }

+    /// Sets the retry configuration for the Recipe
+    ///
+    /// The value is stored as given and handed to the built `Recipe` as its `retry`
+    /// field.
+    ///
+    /// NOTE(review): no call to `RetryConfig::validate` is visible on this path (it is
+    /// invoked in `Recipe::from_content`), so builder callers may need to validate the
+    /// config themselves -- confirm against `build`.
+    pub fn retry(mut self, retry: RetryConfig) -> Self {
+        self.retry = Some(retry);
+        self
+    }
+
    /// Builds the Recipe instance
    ///
    /// Returns an error if any required fields are missing
@@ -394,6 +419,7 @@ impl RecipeBuilder {
            parameters: self.parameters,
            response: self.response,
            sub_recipes: self.sub_recipes,
+            retry: self.retry,
        })
    }
 }
--- a/crates/goose/src/scheduler.rs
+++ b/crates/goose/src/scheduler.rs
@@ -1204,6 +1204,7 @@ async fn run_scheduled_job_internal(
            schedule_id: Some(job.id.clone()),
            execution_mode: job.execution_mode.clone(),
            max_turns: None,
+            retry_config: None,
        };

        match agent
@@ -1424,6 +1425,7 @@ mod tests {
            settings: None,
            response: None,
            sub_recipes: None,
+            retry: None,
        };
        let mut recipe_file = File::create(&recipe_filename)?;
        writeln!(
--- a/crates/goose/tests/agent.rs
+++ b/crates/goose/tests/agent.rs
@@ -761,6 +761,184 @@ mod final_output_tool_tests {
    }
 }

+#[cfg(test)]
+mod retry_tests {
+    use super::*;
+    use async_trait::async_trait;
+    use goose::agents::types::{RetryConfig, SessionConfig, SuccessCheck};
+    use goose::model::ModelConfig;
+    use goose::providers::base::{Provider, ProviderUsage, Usage};
+    use goose::providers::errors::ProviderError;
+    use mcp_core::tool::Tool;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+    use std::sync::Arc;
+
+    /// Provider stub whose reply text depends on how many times it has been called:
+    /// the first `fail_until` calls answer with a "failed" message, later calls with a
+    /// "completed" one. Every call returns `Ok`.
+    #[derive(Clone)]
+    struct MockRetryProvider {
+        model_config: ModelConfig,
+        call_count: Arc<AtomicUsize>,
+        fail_until: usize,
+    }
+
+    #[async_trait]
+    impl Provider for MockRetryProvider {
+        fn metadata() -> goose::providers::base::ProviderMetadata {
+            goose::providers::base::ProviderMetadata::empty()
+        }
+
+        fn get_model_config(&self) -> ModelConfig {
+            self.model_config.clone()
+        }
+
+        async fn complete(
+            &self,
+            _system: &str,
+            _messages: &[Message],
+            _tools: &[Tool],
+        ) -> anyhow::Result<(Message, ProviderUsage), ProviderError> {
+            // fetch_add yields the number of calls made before this one.
+            let earlier_calls = self.call_count.fetch_add(1, Ordering::SeqCst);
+            let reply_text = if earlier_calls < self.fail_until {
+                "Task failed - will retry."
+            } else {
+                "Task completed successfully."
+            };
+
+            Ok((
+                Message::assistant().with_text(reply_text),
+                ProviderUsage::new("mock".to_string(), Usage::default()),
+            ))
+        }
+    }
+
+    /// Runs one agent reply with a valid retry config attached to the session and
+    /// expects at least one message event back.
+    #[tokio::test]
+    async fn test_retry_config_validation_integration() -> Result<()> {
+        let agent = Agent::new();
+
+        // fail_until = 0: the mock answers with the "completed" text from its first call.
+        let provider = Arc::new(MockRetryProvider {
+            model_config: ModelConfig::new("test-model".to_string()),
+            call_count: Arc::new(AtomicUsize::new(0)),
+            fail_until: 0,
+        });
+        agent.update_provider(provider.clone()).await?;
+
+        let retry = RetryConfig {
+            max_retries: 3,
+            checks: vec![SuccessCheck::Shell {
+                command: "echo 'success check'".to_string(),
+            }],
+            on_failure: Some("echo 'cleanup executed'".to_string()),
+            timeout_seconds: Some(30),
+            on_failure_timeout_seconds: Some(60),
+        };
+        assert!(
+            retry.validate().is_ok(),
+            "Valid config should pass validation"
+        );
+
+        let session = SessionConfig {
+            id: goose::session::Identifier::Name("test-retry".to_string()),
+            working_dir: std::env::current_dir()?,
+            schedule_id: None,
+            execution_mode: None,
+            max_turns: None,
+            retry_config: Some(retry),
+        };
+
+        let conversation = vec![Message::user().with_text("Complete this task")];
+        let reply_stream = agent.reply(&conversation, Some(session)).await?;
+        tokio::pin!(reply_stream);
+
+        // Keep only message events; a stream error aborts the test.
+        let mut messages = Vec::new();
+        while let Some(event) = reply_stream.next().await {
+            if let AgentEvent::Message(message) = event? {
+                messages.push(message);
+            }
+        }
+
+        assert!(!messages.is_empty(), "Should have received responses");
+
+        Ok(())
+    }
+
+    /// A passing shell check yields `Ok(true)`; a failing one yields `Ok(false)`.
+    #[tokio::test]
+    async fn test_retry_success_check_execution() -> Result<()> {
+        use goose::agents::retry::execute_success_checks;
+
+        let retry = RetryConfig {
+            max_retries: 3,
+            checks: vec![],
+            on_failure: None,
+            timeout_seconds: Some(30),
+            on_failure_timeout_seconds: Some(60),
+        };
+
+        let passing = vec![SuccessCheck::Shell {
+            command: "echo 'test'".to_string(),
+        }];
+        let outcome = execute_success_checks(&passing, &retry).await;
+        assert!(outcome.is_ok(), "Success check should pass");
+        assert!(outcome.unwrap(), "Command should succeed");
+
+        let failing = vec![SuccessCheck::Shell {
+            command: "false".to_string(),
+        }];
+        let outcome = execute_success_checks(&failing, &retry).await;
+        assert!(outcome.is_ok(), "Success check execution should not error");
+        assert!(!outcome.unwrap(), "Command should fail");
+
+        Ok(())
+    }
+
+    /// With both `max_retries` and `timeout_seconds` at zero, the reported error is
+    /// the `max_retries` one.
+    #[tokio::test]
+    async fn test_retry_logic_with_validation_errors() -> Result<()> {
+        let invalid = RetryConfig {
+            max_retries: 0,
+            checks: vec![],
+            on_failure: None,
+            timeout_seconds: Some(0),
+            on_failure_timeout_seconds: None,
+        };
+
+        let rejection = invalid.validate();
+        assert!(rejection.is_err(), "Should validate max_retries > 0");
+        assert!(rejection
+            .unwrap_err()
+            .contains("max_retries must be greater than 0"));
+
+        Ok(())
+    }
+
+    /// The retry counter reads 0 after a reset, 1 after one increment, and 0 again
+    /// after another reset.
+    #[tokio::test]
+    async fn test_retry_attempts_counter_reset() -> Result<()> {
+        let agent = Agent::new();
+
+        agent.reset_retry_attempts().await;
+        assert_eq!(agent.get_retry_attempts().await, 0);
+
+        assert_eq!(agent.increment_retry_attempts().await, 1);
+
+        agent.reset_retry_attempts().await;
+        assert_eq!(agent.get_retry_attempts().await, 0);
+
+        Ok(())
+    }
+}
+
 #[cfg(test)]
 mod max_turns_tests {
    use super::*;
@@ -831,6 +1009,7 @@ mod max_turns_tests {
            schedule_id: None,
            execution_mode: None,
            max_turns: Some(1),
+            retry_config: None,
        };
        let messages = vec![Message::user().with_text("Hello")];

--- a/documentation/docs/guides/recipes/recipe-reference.md
+++ b/documentation/docs/guides/recipes/recipe-reference.md
@@ -38,6 +38,7 @@ After creating recipe files, you can use [`goose` CLI commands](/docs/guides/goo
 | `extensions` | Array | List of extension configurations |
 | `sub_recipes` | Array | List of sub-recipes |
 | `response` | Object | Configuration for structured output validation |
+| `retry` | Object | Configuration for automated retry logic with success validation |

 ## Parameters

@@ -136,6 +137,87 @@ sub_recipes:
    path: "./sub-recipes/quality-analysis.yaml"
 ```

+## Automated Retry with Success Validation
+
+The `retry` field enables recipes to automatically retry execution if success criteria are not met. This is useful for recipes that might need multiple attempts to achieve their goal, or for implementing automated validation and recovery workflows.
+
+### Retry Configuration Fields
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `max_retries` | Number | Maximum number of retry attempts (required; must be greater than 0) |
+| `timeout_seconds` | Number | (Optional) Timeout for success check commands (default: 300 seconds; must be greater than 0 if specified) |
+| `on_failure_timeout_seconds` | Number | (Optional) Timeout for on_failure commands (default: 600 seconds; must be greater than 0 if specified) |
+| `checks` | Array | List of success check configurations (required) |
+| `on_failure` | String | (Optional) Shell command to run when a retry attempt fails |
+
+### Success Check Configuration
+
+Each success check in the `checks` array has the following structure:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `type` | String | Type of check - currently only "shell" is supported |
+| `command` | String | Shell command to execute for validation (must exit with code 0 for success) |
+
+### How Retry Logic Works
+
+1. **Recipe Execution**: The recipe runs normally with the provided instructions
+2. **Success Validation**: After completion, all success checks are executed in order
+3. **Retry Decision**: If any success check fails and retry attempts remain:
+   - Execute the on_failure command (if configured)
+   - Reset the agent's message history to initial state
+   - Increment retry counter and restart execution
+4. **Completion**: Process stops when either:
+   - All success checks pass (success)
+   - Maximum retry attempts are reached (failure)
+
+### Basic Retry Example
+
+```yaml
+version: "1.0.0"
+title: "Counter Increment Task"
+description: "Increment a counter until it reaches target value"
+prompt: "Increment the counter value in /tmp/counter.txt by 1."
+
+retry:
+  max_retries: 5
+  timeout_seconds: 10
+  checks:
+    - type: shell
+      command: "test $(cat /tmp/counter.txt 2>/dev/null || echo 0) -ge 3"
+  on_failure: "echo 'Counter is at:' $(cat /tmp/counter.txt 2>/dev/null || echo 0) '(need 3 to succeed)'"
+```
+
+### Advanced Retry Example
+
+```yaml
+version: "1.0.0"
+title: "Service Health Check"
+description: "Start service and verify it's running properly"
+prompt: "Start the web service and verify it responds to health checks"
+
+retry:
+  max_retries: 3
+  timeout_seconds: 30
+  on_failure_timeout_seconds: 60
+  checks:
+    - type: shell
+      command: "curl -f http://localhost:8080/health"
+    - type: shell
+      command: "pgrep -f 'web-service' > /dev/null"
+  on_failure: "systemctl stop web-service || killall web-service"
+```
+
+### Environment Variables
+
+You can configure retry behavior globally using environment variables:
+
+- `GOOSE_RECIPE_RETRY_TIMEOUT_SECONDS`: Global timeout for success check commands
+- `GOOSE_RECIPE_ON_FAILURE_TIMEOUT_SECONDS`: Global timeout for on_failure commands
+
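+A recipe's own `timeout_seconds` and `on_failure_timeout_seconds` settings take precedence over these environment variables. When neither the recipe nor the environment sets a timeout, the defaults apply (300 seconds for success checks, 600 seconds for on_failure commands).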
+
 ## Structured Output with `response`

 The `response` field enables recipes to enforce a final structured JSON output from Goose. When you specify a `json_schema`, Goose will:
@@ -243,6 +325,14 @@ extensions:
    bundled: true
    description: "Query codesearch directly from goose"

+retry:
+  max_retries: 3
+  timeout_seconds: 30
+  checks:
+    - type: shell
+      command: "echo 'Task validation check passed'"
+  on_failure: "echo 'Retry attempt failed, cleaning up...'"
+
 response:
  json_schema:
    type: object
@@ -313,8 +403,16 @@ Common errors to watch for:
 - Invalid YAML/JSON syntax
 - Missing required fields
 - Invalid extension configurations
+- Invalid retry configuration (missing required fields, invalid shell commands)

 When these occur, Goose will provide helpful error messages indicating what needs to be fixed.

+### Retry-Specific Errors
+
+- **Invalid success checks**: Shell commands that cannot be executed or have syntax errors
+- **Timeout errors**: Success checks or on_failure commands that exceed their timeout limits
+- **Max retries exceeded**: When all retry attempts are exhausted without success
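+- **Missing required retry fields**: When `max_retries` or `checks` are not specified
+- **Invalid retry values**: When `max_retries`, `timeout_seconds`, or `on_failure_timeout_seconds` is set to 0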
+
 ## Learn More
 Check out the [Goose Recipes](/docs/guides/recipes) guide for more docs, tools, and resources to help you master Goose recipes.
--- a/documentation/docs/guides/recipes/session-recipes.md
+++ b/documentation/docs/guides/recipes/session-recipes.md
@@ -87,6 +87,12 @@ You can turn your current Goose session into a reusable recipe that includes the
     goose_provider: $provider    # Provider to use for this recipe
     goose_model: $model          # Specific model to use for this recipe
     temperature: $temperature    # Model temperature setting for this recipe (0.0 to 1.0)
+   retry:                         # Automated retry logic with success validation
+     max_retries: $max_retries    # Maximum number of retry attempts
+     checks:                      # Success validation checks
+     - type: shell
+       command: $validation_command
+     on_failure: $cleanup_command # Optional cleanup command on failure
   ```
   </details>

@@ -529,6 +535,37 @@ When scheduling Goose recipes with the CLI, you can use Goose's built-in cron sc
      - Help users understand what the recipe can do
      - Make it easy to get started

+## Advanced Features
+
+### Automated Retry Logic
+
+Recipes can include retry logic to automatically attempt task completion multiple times until success criteria are met. This is particularly useful for:
+
+- **Automation workflows** that need to ensure successful completion
+- **Development tasks** like running tests that may need multiple attempts  
+- **System operations** that require validation and cleanup
+
+**Basic retry configuration:**
+```yaml
+retry:
+  max_retries: 3
+  checks:
+    - type: shell
+      command: "test -f output.txt"  # Check if output file exists
+  on_failure: "rm -f temp_files*"   # Cleanup on failure
+```
+
+**How it works:**
+1. Recipe executes normally with provided instructions
+2. After completion, success checks validate the results
+3. If validation fails and retries remain:
+   - Optional cleanup command runs
+   - Agent state resets to initial conditions
+   - Recipe execution starts over
+4. Process continues until either success or max retries reached
+
+See the [Recipe Reference Guide](/docs/guides/recipes/recipe-reference#automated-retry-with-success-validation) for complete retry configuration options and examples.
+
 ## What's Included

 A recipe captures:
@@ -539,6 +576,7 @@ A recipe captures:
 - Project folder or file context  
 - Initial setup (but not full conversation history)
 - The model and provider to use when running the recipe (optional)
+- Retry logic and success validation configuration (if configured)


 To protect your privacy and system integrity, Goose excludes: