feat(gcpvertexai): retry Anthropic API HTTP 529 (overloaded) responses the same way as HTTP 429 rate limits (#3026)
Co-authored-by: Michael Neale <michael.neale@gmail.com>
@@ -20,7 +20,7 @@
   "gcp_vertex_ai": {
     "name": "GCP Vertex AI",
     "description": "Use Vertex AI platform models",
-    "models": ["claude-3-5-haiku@20241022", "claude-3-5-sonnet@20240620", "claude-3-5-sonnet-v2@20241022", "claude-3-7-sonnet@20250219", "gemini-1.5-pro-002", "gemini-2.0-flash-001", "gemini-2.0-pro-exp-02-05", "gemini-2.5-pro-exp-03-25", "gemini-2.5-flash-preview-05-20", "gemini-2.5-pro-preview-05-06"],
+    "models": ["claude-3-5-haiku@20241022", "claude-3-5-sonnet@20240620", "claude-3-5-sonnet-v2@20241022", "claude-3-7-sonnet@20250219", "claude-sonnet-4@20250514", "claude-opus-4@20250514", "gemini-1.5-pro-002", "gemini-2.0-flash-001", "gemini-2.0-pro-exp-02-05", "gemini-2.5-pro-exp-03-25", "gemini-2.5-flash-preview-05-20", "gemini-2.5-pro-preview-05-06", "gemini-2.5-flash", "gemini-2.5-pro"],
     "required_keys": ["GCP_PROJECT_ID", "GCP_LOCATION"]
   },
   "google": {
@@ -83,6 +83,8 @@ pub enum ClaudeVersion {
     Haiku35,
     /// Claude Sonnet 4
     Sonnet4,
+    /// Claude Opus 4
+    Opus4,
     /// Generic Claude model for custom or new versions
     Generic(String),
 }
@@ -102,6 +104,10 @@ pub enum GeminiVersion {
     Flash25Preview,
     /// Gemini 2.5 Pro Preview version
     Pro25Preview,
+    /// Gemini 2.5 Flash version
+    Flash25,
+    /// Gemini 2.5 Pro version
+    Pro25,
     /// Generic Gemini model for custom or new versions
     Generic(String),
 }
@@ -115,6 +121,7 @@ impl fmt::Display for GcpVertexAIModel {
                 ClaudeVersion::Sonnet37 => "claude-3-7-sonnet@20250219",
                 ClaudeVersion::Haiku35 => "claude-3-5-haiku@20241022",
                 ClaudeVersion::Sonnet4 => "claude-sonnet-4@20250514",
+                ClaudeVersion::Opus4 => "claude-opus-4@20250514",
                 ClaudeVersion::Generic(name) => name,
             },
             Self::Gemini(version) => match version {
@@ -124,6 +131,8 @@ impl fmt::Display for GcpVertexAIModel {
                 GeminiVersion::Pro25Exp => "gemini-2.5-pro-exp-03-25",
                 GeminiVersion::Flash25Preview => "gemini-2.5-flash-preview-05-20",
                 GeminiVersion::Pro25Preview => "gemini-2.5-pro-preview-05-06",
+                GeminiVersion::Flash25 => "gemini-2.5-flash",
+                GeminiVersion::Pro25 => "gemini-2.5-pro",
                 GeminiVersion::Generic(name) => name,
             },
         };
@@ -156,12 +165,15 @@ impl TryFrom<&str> for GcpVertexAIModel {
             "claude-3-7-sonnet@20250219" => Ok(Self::Claude(ClaudeVersion::Sonnet37)),
             "claude-3-5-haiku@20241022" => Ok(Self::Claude(ClaudeVersion::Haiku35)),
             "claude-sonnet-4@20250514" => Ok(Self::Claude(ClaudeVersion::Sonnet4)),
+            "claude-opus-4@20250514" => Ok(Self::Claude(ClaudeVersion::Opus4)),
             "gemini-1.5-pro-002" => Ok(Self::Gemini(GeminiVersion::Pro15)),
             "gemini-2.0-flash-001" => Ok(Self::Gemini(GeminiVersion::Flash20)),
             "gemini-2.0-pro-exp-02-05" => Ok(Self::Gemini(GeminiVersion::Pro20Exp)),
             "gemini-2.5-pro-exp-03-25" => Ok(Self::Gemini(GeminiVersion::Pro25Exp)),
             "gemini-2.5-flash-preview-05-20" => Ok(Self::Gemini(GeminiVersion::Flash25Preview)),
             "gemini-2.5-pro-preview-05-06" => Ok(Self::Gemini(GeminiVersion::Pro25Preview)),
+            "gemini-2.5-flash" => Ok(Self::Gemini(GeminiVersion::Flash25)),
+            "gemini-2.5-pro" => Ok(Self::Gemini(GeminiVersion::Pro25)),
             // Generic models based on prefix matching
             _ if s.starts_with("claude-") => {
                 Ok(Self::Claude(ClaudeVersion::Generic(s.to_string())))
@@ -427,7 +439,7 @@ mod tests {
         }

         // Test generic Gemini models
-        let gemini_models = ["gemini-3-pro", "gemini-2.5-flash", "gemini-experimental"];
+        let gemini_models = ["gemini-3-pro", "gemini-2.0-flash", "gemini-experimental"];

         for model_id in gemini_models {
             let model = GcpVertexAIModel::try_from(model_id)?;
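Note on the test change above: `gemini-2.5-flash` now parses to the concrete `Flash25` variant, so it can no longer stand in for the `Generic` fallback; `gemini-2.0-flash`, which has no exact mapping, takes its place. A sketch of the distinction, assuming the enums from the hunks above are in scope:

```rust
#[test]
fn generic_vs_known_gemini() {
    // "gemini-2.5-flash" has an exact mapping after this commit, so it
    // parses to the concrete Flash25 variant...
    let known = GcpVertexAIModel::try_from("gemini-2.5-flash").unwrap();
    assert!(matches!(
        known,
        GcpVertexAIModel::Gemini(GeminiVersion::Flash25)
    ));

    // ...while "gemini-2.0-flash" has no exact match and falls through to
    // the prefix-based Generic arm, which is what the test wants to cover.
    let generic = GcpVertexAIModel::try_from("gemini-2.0-flash").unwrap();
    assert!(matches!(
        generic,
        GcpVertexAIModel::Gemini(GeminiVersion::Generic(_))
    ));
}
```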
@@ -2,6 +2,7 @@ use std::time::Duration;

 use anyhow::Result;
 use async_trait::async_trait;
+use once_cell::sync::Lazy;
 use reqwest::{Client, StatusCode};
 use serde_json::Value;
 use tokio::time::sleep;
@@ -34,6 +35,9 @@ const DEFAULT_MAX_RETRIES: usize = 6;
 const DEFAULT_BACKOFF_MULTIPLIER: f64 = 2.0;
 /// Default maximum interval for retry (in milliseconds)
 const DEFAULT_MAX_RETRY_INTERVAL_MS: u64 = 320_000;
+/// Status code for Anthropic's API overloaded error (529)
+static STATUS_API_OVERLOADED: Lazy<StatusCode> =
+    Lazy::new(|| StatusCode::from_u16(529).expect("Valid status code 529 for API_OVERLOADED"));

 /// Represents errors specific to GCP Vertex AI operations.
 #[derive(Debug, thiserror::Error)]
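Neither `http` nor `reqwest` defines a named constant for 529, and `StatusCode::from_u16` is fallible and not `const`, which is why the new code builds the status once behind `once_cell::sync::Lazy`. A self-contained sketch of the same construction (crate versions assumed, not pinned by this diff):

```rust
use once_cell::sync::Lazy;
use reqwest::StatusCode;

// 529 sits in the 5xx range but has no named constant, so it is built
// once, at first use, from the raw u16.
static STATUS_API_OVERLOADED: Lazy<StatusCode> =
    Lazy::new(|| StatusCode::from_u16(529).expect("529 is a valid status code"));

fn main() {
    assert_eq!(STATUS_API_OVERLOADED.as_u16(), 529);
    assert!(STATUS_API_OVERLOADED.is_server_error()); // classified as 5xx
}
```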
@@ -50,8 +54,10 @@ enum GcpVertexAIError {
 /// Retry configuration for handling rate limit errors
 #[derive(Debug, Clone)]
 struct RetryConfig {
-    /// Maximum number of retry attempts
-    max_retries: usize,
+    /// Maximum number of retry attempts for 429 errors
+    max_rate_limit_retries: usize,
+    /// Maximum number of retry attempts for 529 errors
+    max_overloaded_retries: usize,
     /// Initial interval between retries in milliseconds
     initial_interval_ms: u64,
     /// Multiplier for backoff (exponential)
@@ -63,7 +69,8 @@ struct RetryConfig {
 impl Default for RetryConfig {
     fn default() -> Self {
         Self {
-            max_retries: DEFAULT_MAX_RETRIES,
+            max_rate_limit_retries: DEFAULT_MAX_RETRIES,
+            max_overloaded_retries: DEFAULT_MAX_RETRIES,
             initial_interval_ms: DEFAULT_INITIAL_RETRY_INTERVAL_MS,
             backoff_multiplier: DEFAULT_BACKOFF_MULTIPLIER,
             max_interval_ms: DEFAULT_MAX_RETRY_INTERVAL_MS,
@@ -92,6 +99,19 @@ impl RetryConfig {

         Duration::from_millis(jittered_delay_ms)
     }
+
+    /// Get max retries for a specific error type
+    #[allow(dead_code)] // Used in tests
+    fn max_retries_for_status(&self, status: StatusCode) -> usize {
+        if status == StatusCode::TOO_MANY_REQUESTS {
+            self.max_rate_limit_retries
+        } else if status == *STATUS_API_OVERLOADED {
+            self.max_overloaded_retries
+        } else {
+            // Default to rate limit retries for any other status code
+            self.max_rate_limit_retries
+        }
+    }
 }

 /// Provider implementation for Google Cloud Platform's Vertex AI service.
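Only the tail of `delay_for_attempt` is visible in this hunk. For orientation, a minimal sketch of an exponential backoff with jitter that would end in those lines; the growth formula and the jitter range are assumptions, with the 1.2 ceiling inferred from the test comment `max_interval_ms * 1.2 (max jitter)` further down, not the provider's exact code:

```rust
use rand::Rng;
use std::time::Duration;

// Hypothetical reconstruction: grow the delay exponentially, cap it at the
// configured maximum, then jitter it to avoid synchronized retries.
fn delay_for_attempt(
    initial_interval_ms: u64,
    backoff_multiplier: f64,
    max_interval_ms: u64,
    attempt: usize,
) -> Duration {
    // initial * multiplier^(attempt - 1), capped at the configured maximum
    let exp_ms = initial_interval_ms as f64
        * backoff_multiplier.powi(attempt.saturating_sub(1) as i32);
    let capped_ms = exp_ms.min(max_interval_ms as f64);
    // Random factor in [0.8, 1.2]; the upper bound is assumed from the test
    let jitter = rand::thread_rng().gen_range(0.8..=1.2);
    let jittered_delay_ms = (capped_ms * jitter) as u64;
    Duration::from_millis(jittered_delay_ms)
}
```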
@@ -172,10 +192,32 @@ impl GcpVertexAIProvider {

     /// Loads retry configuration from environment variables or uses defaults.
     fn load_retry_config(config: &crate::config::Config) -> RetryConfig {
-        let max_retries = config
+        // Load max retries for 429 rate limit errors
+        let max_rate_limit_retries = config
+            .get_param("GCP_MAX_RATE_LIMIT_RETRIES")
+            .ok()
+            .and_then(|v: String| v.parse::<usize>().ok())
+            .or_else(|| {
+                // Fall back to generic GCP_MAX_RETRIES if specific one isn't set
+                config
                    .get_param("GCP_MAX_RETRIES")
                    .ok()
                    .and_then(|v: String| v.parse::<usize>().ok())
+            })
+            .unwrap_or(DEFAULT_MAX_RETRIES);
+
+        // Load max retries for 529 API overloaded errors
+        let max_overloaded_retries = config
+            .get_param("GCP_MAX_OVERLOADED_RETRIES")
+            .ok()
+            .and_then(|v: String| v.parse::<usize>().ok())
+            .or_else(|| {
+                // Fall back to generic GCP_MAX_RETRIES if specific one isn't set
+                config
+                    .get_param("GCP_MAX_RETRIES")
+                    .ok()
+                    .and_then(|v: String| v.parse::<usize>().ok())
+            })
             .unwrap_or(DEFAULT_MAX_RETRIES);

         let initial_interval_ms = config
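The two lookup chains above differ only in their primary key; a small helper would remove the duplication. A sketch of that refactor (not part of this commit; `parse_retry_param` is hypothetical, while `get_param` is the crate's own config API as used above):

```rust
// Hypothetical helper: read a config param and parse it as usize.
fn parse_retry_param(config: &crate::config::Config, key: &str) -> Option<usize> {
    config
        .get_param(key)
        .ok()
        .and_then(|v: String| v.parse::<usize>().ok())
}

// Usage, preserving the diff's fallback order and default:
// let max_rate_limit_retries = parse_retry_param(config, "GCP_MAX_RATE_LIMIT_RETRIES")
//     .or_else(|| parse_retry_param(config, "GCP_MAX_RETRIES"))
//     .unwrap_or(DEFAULT_MAX_RETRIES);
```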
@@ -197,7 +239,8 @@ impl GcpVertexAIProvider {
             .unwrap_or(DEFAULT_MAX_RETRY_INTERVAL_MS);

         RetryConfig {
-            max_retries,
+            max_rate_limit_retries,
+            max_overloaded_retries,
             initial_interval_ms,
             backoff_multiplier,
             max_interval_ms,
@@ -269,7 +312,7 @@ impl GcpVertexAIProvider {
     }

     /// Makes an authenticated POST request to the Vertex AI API at a specific location.
-    /// Includes retry logic for 429 Too Many Requests errors.
+    /// Includes retry logic for 429 (Too Many Requests) and 529 (API Overloaded) errors.
     ///
     /// # Arguments
     /// * `payload` - The request payload to send
@@ -285,21 +328,12 @@ impl GcpVertexAIProvider {
             .build_request_url(context.provider(), location)
             .map_err(|e| ProviderError::RequestFailed(e.to_string()))?;

-        // Initialize retry counter
-        let mut attempts = 0;
+        // Initialize separate counters for different error types
+        let mut rate_limit_attempts = 0;
+        let mut overloaded_attempts = 0;
         let mut last_error = None;

         loop {
-            // Check if we've exceeded max retries
-            if attempts > 0 && attempts > self.retry_config.max_retries {
-                let error_msg = format!(
-                    "Exceeded maximum retry attempts ({}) for rate limiting (429)",
-                    self.retry_config.max_retries
-                );
-                tracing::error!("{}", error_msg);
-                return Err(last_error.unwrap_or(ProviderError::RateLimitExceeded(error_msg)));
-            }
-
             // Get a fresh auth token for each attempt
             let auth_header = self
                 .get_auth_header()
@@ -318,8 +352,91 @@ impl GcpVertexAIProvider {

             let status = response.status();

-            // If not a 429, process normally
-            if status != StatusCode::TOO_MANY_REQUESTS {
+            // Handle 429 Too Many Requests and 529 API Overloaded errors
+            match status {
+                status if status == StatusCode::TOO_MANY_REQUESTS => {
+                    rate_limit_attempts += 1;
+
+                    if rate_limit_attempts > self.retry_config.max_rate_limit_retries {
+                        let error_msg = format!(
+                            "Exceeded maximum retry attempts ({}) for rate limiting (429) errors",
+                            self.retry_config.max_rate_limit_retries
+                        );
+                        tracing::error!("{}", error_msg);
+                        return Err(
+                            last_error.unwrap_or(ProviderError::RateLimitExceeded(error_msg))
+                        );
+                    }
+
+                    // Try to parse response for more detailed error info
+                    let cite_gcp_vertex_429 =
+                        "See https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429";
+                    let response_text = response.text().await.unwrap_or_default();
+
+                    let error_message =
+                        if response_text.contains("Exceeded the Provisioned Throughput") {
+                            // Handle 429 rate limit due to throughput limits
+                            format!("Exceeded the Provisioned Throughput: {cite_gcp_vertex_429}")
+                        } else {
+                            // Handle generic 429 rate limit
+                            format!("Pay-as-you-go resource exhausted: {cite_gcp_vertex_429}")
+                        };
+
+                    tracing::warn!(
+                        "Rate limit exceeded error (429) (attempt {}/{}): {}. Retrying after backoff...",
+                        rate_limit_attempts,
+                        self.retry_config.max_rate_limit_retries,
+                        error_message
+                    );
+
+                    // Store the error in case we need to return it after max retries
+                    last_error = Some(ProviderError::RateLimitExceeded(error_message));
+
+                    // Calculate and apply the backoff delay
+                    let delay = self.retry_config.delay_for_attempt(rate_limit_attempts);
+                    tracing::info!("Backing off for {:?} before retry (rate limit 429)", delay);
+                    sleep(delay).await;
+                }
+                status if status == *STATUS_API_OVERLOADED => {
+                    overloaded_attempts += 1;
+
+                    if overloaded_attempts > self.retry_config.max_overloaded_retries {
+                        let error_msg = format!(
+                            "Exceeded maximum retry attempts ({}) for API overloaded (529) errors",
+                            self.retry_config.max_overloaded_retries
+                        );
+                        tracing::error!("{}", error_msg);
+                        return Err(
+                            last_error.unwrap_or(ProviderError::RateLimitExceeded(error_msg))
+                        );
+                    }
+
+                    // Handle 529 Overloaded error (https://docs.anthropic.com/en/api/errors)
+                    let error_message =
+                        "Vertex AI Provider API is temporarily overloaded. This is similar to a rate limit \
+                         error but indicates backend processing capacity issues."
+                            .to_string();
+
+                    tracing::warn!(
+                        "API overloaded error (529) (attempt {}/{}): {}. Retrying after backoff...",
+                        overloaded_attempts,
+                        self.retry_config.max_overloaded_retries,
+                        error_message
+                    );
+
+                    // Store the error in case we need to return it after max retries
+                    last_error = Some(ProviderError::RateLimitExceeded(error_message));
+
+                    // Calculate and apply the backoff delay
+                    let delay = self.retry_config.delay_for_attempt(overloaded_attempts);
+                    tracing::info!(
+                        "Backing off for {:?} before retry (API overloaded 529)",
+                        delay
+                    );
+                    sleep(delay).await;
+                }
+                // For any other status codes, process normally
+                _ => {
                 let response_json = response.json::<Value>().await.map_err(|e| {
                     ProviderError::RequestFailed(format!("Failed to parse response: {e}"))
                 })?;
@@ -344,34 +461,7 @@ impl GcpVertexAIProvider {
                     }
                 };
             }
+            }
-
-            // Handle 429 Too Many Requests
-            attempts += 1;
-
-            // Try to parse response for more detailed error info
-            let cite_gcp_vertex_429 =
-                "See https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429";
-            let response_text = response.text().await.unwrap_or_default();
-            let quota_error = if response_text.contains("Exceeded the Provisioned Throughput") {
-                format!("Exceeded the Provisioned Throughput: {cite_gcp_vertex_429}.")
-            } else {
-                format!("Pay-as-you-go resource exhausted: {cite_gcp_vertex_429}.")
-            };
-
-            tracing::warn!(
-                "Rate limit exceeded (attempt {}/{}): {}. Retrying after backoff...",
-                attempts,
-                self.retry_config.max_retries,
-                quota_error
-            );
-
-            // Store the error in case we need to return it after max retries
-            last_error = Some(ProviderError::RateLimitExceeded(quota_error));
-
-            // Calculate and apply the backoff delay
-            let delay = self.retry_config.delay_for_attempt(attempts);
-            tracing::info!("Backing off for {:?} before retry", delay);
-            sleep(delay).await;
         }
     }

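Taken together, the two hunks above replace the single `attempts` counter and the inline 429 block with a `match` that gives each retryable status its own counter and budget. Distilled control flow, as a standalone sketch (`fetch_status` stands in for the real request; everything else is simplified):

```rust
use std::time::Duration;

// Simplified shape of the new retry loop: 429 and 529 each get their own
// attempt counter and cap; any other status exits to normal processing.
async fn post_with_retries(
    max_rate_limit_retries: usize,
    max_overloaded_retries: usize,
    mut fetch_status: impl FnMut() -> u16,
) -> Result<(), String> {
    let mut rate_limit_attempts = 0;
    let mut overloaded_attempts = 0;
    loop {
        match fetch_status() {
            429 => {
                rate_limit_attempts += 1;
                if rate_limit_attempts > max_rate_limit_retries {
                    return Err("exceeded 429 retry budget".to_string());
                }
            }
            529 => {
                overloaded_attempts += 1;
                if overloaded_attempts > max_overloaded_retries {
                    return Err("exceeded 529 retry budget".to_string());
                }
            }
            // Any other status is handed to normal response processing.
            _ => return Ok(()),
        }
        // The real code sleeps for delay_for_attempt(n); a fixed placeholder
        // keeps this sketch self-contained.
        tokio::time::sleep(Duration::from_millis(100)).await;
    }
}
```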
@@ -431,12 +521,15 @@ impl Provider for GcpVertexAIProvider {
             GcpVertexAIModel::Claude(ClaudeVersion::Sonnet37),
             GcpVertexAIModel::Claude(ClaudeVersion::Haiku35),
             GcpVertexAIModel::Claude(ClaudeVersion::Sonnet4),
+            GcpVertexAIModel::Claude(ClaudeVersion::Opus4),
             GcpVertexAIModel::Gemini(GeminiVersion::Pro15),
             GcpVertexAIModel::Gemini(GeminiVersion::Flash20),
             GcpVertexAIModel::Gemini(GeminiVersion::Pro20Exp),
             GcpVertexAIModel::Gemini(GeminiVersion::Pro25Exp),
             GcpVertexAIModel::Gemini(GeminiVersion::Flash25Preview),
             GcpVertexAIModel::Gemini(GeminiVersion::Pro25Preview),
+            GcpVertexAIModel::Gemini(GeminiVersion::Flash25),
+            GcpVertexAIModel::Gemini(GeminiVersion::Pro25),
         ]
         .iter()
         .map(|model| model.to_string())
@@ -448,7 +541,7 @@
             "gcp_vertex_ai",
             "GCP Vertex AI",
             "Access variety of AI models such as Claude, Gemini through Vertex AI",
-            GcpVertexAIModel::Gemini(GeminiVersion::Flash20)
+            GcpVertexAIModel::Gemini(GeminiVersion::Flash25)
                 .to_string()
                 .as_str(),
             known_models,
@@ -456,6 +549,18 @@
             vec![
                 ConfigKey::new("GCP_PROJECT_ID", true, false, None),
                 ConfigKey::new("GCP_LOCATION", true, false, Some(Iowa.to_string().as_str())),
+                ConfigKey::new(
+                    "GCP_MAX_RATE_LIMIT_RETRIES",
+                    false,
+                    false,
+                    Some(&DEFAULT_MAX_RETRIES.to_string()),
+                ),
+                ConfigKey::new(
+                    "GCP_MAX_OVERLOADED_RETRIES",
+                    false,
+                    false,
+                    Some(&DEFAULT_MAX_RETRIES.to_string()),
+                ),
                 ConfigKey::new(
                     "GCP_MAX_RETRIES",
                     false,
@@ -525,11 +630,13 @@
 #[cfg(test)]
 mod tests {
     use super::*;
+    use reqwest::StatusCode;

     #[test]
     fn test_retry_config_delay_calculation() {
         let config = RetryConfig {
-            max_retries: 5,
+            max_rate_limit_retries: 5,
+            max_overloaded_retries: 5,
             initial_interval_ms: 1000,
             backoff_multiplier: 2.0,
             max_interval_ms: 32000,
@@ -552,6 +659,44 @@ mod tests {
         assert!(delay10.as_millis() <= 38400); // max_interval_ms * 1.2 (max jitter)
     }

+    #[test]
+    fn test_max_retries_for_status() {
+        let config = RetryConfig {
+            max_rate_limit_retries: 5,
+            max_overloaded_retries: 10,
+            initial_interval_ms: 1000,
+            backoff_multiplier: 2.0,
+            max_interval_ms: 32000,
+        };
+
+        // Check that we get the right max retries for each error type
+        assert_eq!(
+            config.max_retries_for_status(StatusCode::TOO_MANY_REQUESTS),
+            5
+        );
+        assert_eq!(config.max_retries_for_status(*STATUS_API_OVERLOADED), 10);
+
+        // For any other status code, we should get the rate limit retries
+        assert_eq!(config.max_retries_for_status(StatusCode::BAD_REQUEST), 5);
+    }
+
+    #[test]
+    fn test_status_overloaded_code() {
+        // Test that we correctly handle the 529 status code
+
+        // Verify the custom status code is created correctly
+        assert_eq!(STATUS_API_OVERLOADED.as_u16(), 529);
+
+        // This is not a standard HTTP status code, so it's classified as server error
+        assert!(STATUS_API_OVERLOADED.is_server_error());
+
+        // Should be different from TOO_MANY_REQUESTS (429)
+        assert_ne!(*STATUS_API_OVERLOADED, StatusCode::TOO_MANY_REQUESTS);
+
+        // Should be different from SERVICE_UNAVAILABLE (503)
+        assert_ne!(*STATUS_API_OVERLOADED, StatusCode::SERVICE_UNAVAILABLE);
+    }
+
     #[test]
     fn test_model_provider_conversion() {
         assert_eq!(ModelProvider::Anthropic.as_str(), "anthropic");
@@ -596,7 +741,8 @@ mod tests {
             .collect();
         assert!(model_names.contains(&"claude-3-5-sonnet-v2@20241022".to_string()));
         assert!(model_names.contains(&"gemini-1.5-pro-002".to_string()));
-        // Should contain the original 2 config keys plus 4 new retry-related ones
-        assert_eq!(metadata.config_keys.len(), 6);
+        assert!(model_names.contains(&"gemini-2.5-pro".to_string()));
+        // Should contain the original 2 config keys plus 6 new retry-related ones
+        assert_eq!(metadata.config_keys.len(), 8);
     }
 }
@@ -25,7 +25,8 @@ Goose relies heavily on tool calling capabilities and currently works best with
 | [Azure OpenAI](https://learn.microsoft.com/en-us/azure/ai-services/openai/) | Access Azure-hosted OpenAI models, including GPT-4 and GPT-3.5. Supports both API key and Azure credential chain authentication. | `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_DEPLOYMENT_NAME`, `AZURE_OPENAI_API_KEY` (optional) |
 | [Databricks](https://www.databricks.com/) | Unified data analytics and AI platform for building and deploying models. | `DATABRICKS_HOST`, `DATABRICKS_TOKEN` |
 | [Gemini](https://ai.google.dev/gemini-api/docs) | Advanced LLMs by Google with multimodal capabilities (text, images). | `GOOGLE_API_KEY` |
-| [GCP Vertex AI](https://cloud.google.com/vertex-ai) | Google Cloud's Vertex AI platform, supporting Gemini and Claude models. **Credentials must be [configured in advance](https://cloud.google.com/vertex-ai/docs/authentication).** | `GCP_PROJECT_ID`, `GCP_LOCATION` and optional `GCP_MAX_RETRIES` (6), `GCP_INITIAL_RETRY_INTERVAL_MS` (5000), `GCP_BACKOFF_MULTIPLIER` (2.0), `GCP_MAX_RETRY_INTERVAL_MS` (320_000). |
+| [GCP Vertex AI](https://cloud.google.com/vertex-ai) | Google Cloud's Vertex AI platform, supporting Gemini and Claude models. **Credentials must be [configured in advance](https://cloud.google.com/vertex-ai/docs/authentication).** | `GCP_PROJECT_ID`, `GCP_LOCATION` and optionally `GCP_MAX_RATE_LIMIT_RETRIES` (6), `GCP_MAX_OVERLOADED_RETRIES` (6), `GCP_INITIAL_RETRY_INTERVAL_MS` (5000), `GCP_BACKOFF_MULTIPLIER` (2.0), `GCP_MAX_RETRY_INTERVAL_MS` (320_000). |
+| [GitHub Copilot](https://docs.github.com/en/copilot/using-github-copilot/ai-models) | Access to GitHub Copilot's chat models, including gpt-4o, o1, o3-mini, and Claude models. Uses the device code authentication flow for secure access. | Uses GitHub device code authentication flow (no API key needed) |
 | [Groq](https://groq.com/) | High-performance inference hardware and tools for LLMs. | `GROQ_API_KEY` |
 | [Ollama](https://ollama.com/) | Local model runner supporting Qwen, Llama, DeepSeek, and other open-source models. **Because this provider runs locally, you must first [download and run a model](/docs/getting-started/providers#local-llms).** | `OLLAMA_HOST` |
 | [Ramalama](https://ramalama.ai/) | Local model runner using native [OCI](https://opencontainers.org/) container runtimes, [CNCF](https://www.cncf.io/) tools, and supporting models as OCI artifacts. The Ramalama API is a compatible alternative to Ollama's and can be used with the Goose Ollama provider. Supports Qwen, Llama, DeepSeek, and other open-source models. **Because this provider runs locally, you must first [download and run a model](/docs/getting-started/providers#local-llms).** | `OLLAMA_HOST` |