feat(gcpvertexai): retry Anthropic API HTTP 529 (overloaded) responses the same way as HTTP 429 rate limits (#3026)
Co-authored-by: Michael Neale <michael.neale@gmail.com>
@@ -20,7 +20,7 @@
   "gcp_vertex_ai": {
     "name": "GCP Vertex AI",
     "description": "Use Vertex AI platform models",
-    "models": ["claude-3-5-haiku@20241022", "claude-3-5-sonnet@20240620", "claude-3-5-sonnet-v2@20241022", "claude-3-7-sonnet@20250219", "gemini-1.5-pro-002", "gemini-2.0-flash-001", "gemini-2.0-pro-exp-02-05", "gemini-2.5-pro-exp-03-25", "gemini-2.5-flash-preview-05-20", "gemini-2.5-pro-preview-05-06"],
+    "models": ["claude-3-5-haiku@20241022", "claude-3-5-sonnet@20240620", "claude-3-5-sonnet-v2@20241022", "claude-3-7-sonnet@20250219", "claude-sonnet-4@20250514", "claude-opus-4@20250514", "gemini-1.5-pro-002", "gemini-2.0-flash-001", "gemini-2.0-pro-exp-02-05", "gemini-2.5-pro-exp-03-25", "gemini-2.5-flash-preview-05-20", "gemini-2.5-pro-preview-05-06", "gemini-2.5-flash", "gemini-2.5-pro"],
     "required_keys": ["GCP_PROJECT_ID", "GCP_LOCATION"]
   },
   "google": {
@@ -83,6 +83,8 @@ pub enum ClaudeVersion {
     Haiku35,
     /// Claude Sonnet 4
     Sonnet4,
+    /// Claude Opus 4
+    Opus4,
     /// Generic Claude model for custom or new versions
     Generic(String),
 }
@@ -102,6 +104,10 @@ pub enum GeminiVersion {
     Flash25Preview,
     /// Gemini 2.5 Pro Preview version
     Pro25Preview,
+    /// Gemini 2.5 Flash version
+    Flash25,
+    /// Gemini 2.5 Pro version
+    Pro25,
     /// Generic Gemini model for custom or new versions
     Generic(String),
 }
@@ -115,6 +121,7 @@ impl fmt::Display for GcpVertexAIModel {
                 ClaudeVersion::Sonnet37 => "claude-3-7-sonnet@20250219",
                 ClaudeVersion::Haiku35 => "claude-3-5-haiku@20241022",
                 ClaudeVersion::Sonnet4 => "claude-sonnet-4@20250514",
+                ClaudeVersion::Opus4 => "claude-opus-4@20250514",
                 ClaudeVersion::Generic(name) => name,
             },
             Self::Gemini(version) => match version {
@@ -124,6 +131,8 @@ impl fmt::Display for GcpVertexAIModel {
                 GeminiVersion::Pro25Exp => "gemini-2.5-pro-exp-03-25",
                 GeminiVersion::Flash25Preview => "gemini-2.5-flash-preview-05-20",
                 GeminiVersion::Pro25Preview => "gemini-2.5-pro-preview-05-06",
+                GeminiVersion::Flash25 => "gemini-2.5-flash",
+                GeminiVersion::Pro25 => "gemini-2.5-pro",
                 GeminiVersion::Generic(name) => name,
             },
         };
@@ -156,12 +165,15 @@ impl TryFrom<&str> for GcpVertexAIModel {
             "claude-3-7-sonnet@20250219" => Ok(Self::Claude(ClaudeVersion::Sonnet37)),
             "claude-3-5-haiku@20241022" => Ok(Self::Claude(ClaudeVersion::Haiku35)),
             "claude-sonnet-4@20250514" => Ok(Self::Claude(ClaudeVersion::Sonnet4)),
+            "claude-opus-4@20250514" => Ok(Self::Claude(ClaudeVersion::Opus4)),
             "gemini-1.5-pro-002" => Ok(Self::Gemini(GeminiVersion::Pro15)),
             "gemini-2.0-flash-001" => Ok(Self::Gemini(GeminiVersion::Flash20)),
             "gemini-2.0-pro-exp-02-05" => Ok(Self::Gemini(GeminiVersion::Pro20Exp)),
             "gemini-2.5-pro-exp-03-25" => Ok(Self::Gemini(GeminiVersion::Pro25Exp)),
             "gemini-2.5-flash-preview-05-20" => Ok(Self::Gemini(GeminiVersion::Flash25Preview)),
             "gemini-2.5-pro-preview-05-06" => Ok(Self::Gemini(GeminiVersion::Pro25Preview)),
+            "gemini-2.5-flash" => Ok(Self::Gemini(GeminiVersion::Flash25)),
+            "gemini-2.5-pro" => Ok(Self::Gemini(GeminiVersion::Pro25)),
             // Generic models based on prefix matching
             _ if s.starts_with("claude-") => {
                 Ok(Self::Claude(ClaudeVersion::Generic(s.to_string())))
@@ -427,7 +439,7 @@ mod tests {
         }

         // Test generic Gemini models
-        let gemini_models = ["gemini-3-pro", "gemini-2.5-flash", "gemini-experimental"];
+        let gemini_models = ["gemini-3-pro", "gemini-2.0-flash", "gemini-experimental"];

         for model_id in gemini_models {
             let model = GcpVertexAIModel::try_from(model_id)?;
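Note on the test change above: `gemini-2.5-flash` now parses to the concrete `Flash25` variant, so it can no longer stand in for the `Generic` fallback; `gemini-2.0-flash`, which has no exact mapping, takes its place. A sketch of the distinction, assuming the enums from the hunks above are in scope:

```rust
#[test]
fn generic_vs_known_gemini() {
    // "gemini-2.5-flash" has an exact mapping after this commit, so it
    // parses to the concrete Flash25 variant...
    let known = GcpVertexAIModel::try_from("gemini-2.5-flash").unwrap();
    assert!(matches!(
        known,
        GcpVertexAIModel::Gemini(GeminiVersion::Flash25)
    ));

    // ...while "gemini-2.0-flash" has no exact match and falls through to
    // the prefix-based Generic arm, which is what the test wants to cover.
    let generic = GcpVertexAIModel::try_from("gemini-2.0-flash").unwrap();
    assert!(matches!(
        generic,
        GcpVertexAIModel::Gemini(GeminiVersion::Generic(_))
    ));
}
```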
@@ -2,6 +2,7 @@ use std::time::Duration;

 use anyhow::Result;
 use async_trait::async_trait;
+use once_cell::sync::Lazy;
 use reqwest::{Client, StatusCode};
 use serde_json::Value;
 use tokio::time::sleep;
@@ -34,6 +35,9 @@ const DEFAULT_MAX_RETRIES: usize = 6;
 const DEFAULT_BACKOFF_MULTIPLIER: f64 = 2.0;
 /// Default maximum interval for retry (in milliseconds)
 const DEFAULT_MAX_RETRY_INTERVAL_MS: u64 = 320_000;
+/// Status code for Anthropic's API overloaded error (529)
+static STATUS_API_OVERLOADED: Lazy<StatusCode> =
+    Lazy::new(|| StatusCode::from_u16(529).expect("Valid status code 529 for API_OVERLOADED"));

 /// Represents errors specific to GCP Vertex AI operations.
 #[derive(Debug, thiserror::Error)]
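Neither `http` nor `reqwest` defines a named constant for 529, and `StatusCode::from_u16` is fallible and not `const`, which is why the new code builds the status once behind `once_cell::sync::Lazy`. A self-contained sketch of the same construction (crate versions assumed, not pinned by this diff):

```rust
use once_cell::sync::Lazy;
use reqwest::StatusCode;

// 529 sits in the 5xx range but has no named constant, so it is built
// once, at first use, from the raw u16.
static STATUS_API_OVERLOADED: Lazy<StatusCode> =
    Lazy::new(|| StatusCode::from_u16(529).expect("529 is a valid status code"));

fn main() {
    assert_eq!(STATUS_API_OVERLOADED.as_u16(), 529);
    assert!(STATUS_API_OVERLOADED.is_server_error()); // classified as 5xx
}
```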
@@ -50,8 +54,10 @@ enum GcpVertexAIError {
 /// Retry configuration for handling rate limit errors
 #[derive(Debug, Clone)]
 struct RetryConfig {
-    /// Maximum number of retry attempts
-    max_retries: usize,
+    /// Maximum number of retry attempts for 429 errors
+    max_rate_limit_retries: usize,
+    /// Maximum number of retry attempts for 529 errors
+    max_overloaded_retries: usize,
     /// Initial interval between retries in milliseconds
     initial_interval_ms: u64,
     /// Multiplier for backoff (exponential)
@@ -63,7 +69,8 @@ struct RetryConfig {
 impl Default for RetryConfig {
     fn default() -> Self {
         Self {
-            max_retries: DEFAULT_MAX_RETRIES,
+            max_rate_limit_retries: DEFAULT_MAX_RETRIES,
+            max_overloaded_retries: DEFAULT_MAX_RETRIES,
             initial_interval_ms: DEFAULT_INITIAL_RETRY_INTERVAL_MS,
             backoff_multiplier: DEFAULT_BACKOFF_MULTIPLIER,
             max_interval_ms: DEFAULT_MAX_RETRY_INTERVAL_MS,
@@ -92,6 +99,19 @@ impl RetryConfig {

         Duration::from_millis(jittered_delay_ms)
     }
+
+    /// Get max retries for a specific error type
+    #[allow(dead_code)] // Used in tests
+    fn max_retries_for_status(&self, status: StatusCode) -> usize {
+        if status == StatusCode::TOO_MANY_REQUESTS {
+            self.max_rate_limit_retries
+        } else if status == *STATUS_API_OVERLOADED {
+            self.max_overloaded_retries
+        } else {
+            // Default to rate limit retries for any other status code
+            self.max_rate_limit_retries
+        }
+    }
 }

 /// Provider implementation for Google Cloud Platform's Vertex AI service.
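Only the tail of `delay_for_attempt` is visible in this hunk. For orientation, a minimal sketch of an exponential backoff with jitter that would end in those lines; the growth formula and the jitter range are assumptions, with the 1.2 ceiling inferred from the test comment `max_interval_ms * 1.2 (max jitter)` further down, not the provider's exact code:

```rust
use rand::Rng;
use std::time::Duration;

// Hypothetical reconstruction: grow the delay exponentially, cap it at the
// configured maximum, then jitter it to avoid synchronized retries.
fn delay_for_attempt(
    initial_interval_ms: u64,
    backoff_multiplier: f64,
    max_interval_ms: u64,
    attempt: usize,
) -> Duration {
    // initial * multiplier^(attempt - 1), capped at the configured maximum
    let exp_ms = initial_interval_ms as f64
        * backoff_multiplier.powi(attempt.saturating_sub(1) as i32);
    let capped_ms = exp_ms.min(max_interval_ms as f64);
    // Random factor in [0.8, 1.2]; the upper bound is assumed from the test
    let jitter = rand::thread_rng().gen_range(0.8..=1.2);
    let jittered_delay_ms = (capped_ms * jitter) as u64;
    Duration::from_millis(jittered_delay_ms)
}
```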
@@ -172,10 +192,32 @@ impl GcpVertexAIProvider {

     /// Loads retry configuration from environment variables or uses defaults.
     fn load_retry_config(config: &crate::config::Config) -> RetryConfig {
-        let max_retries = config
+        // Load max retries for 429 rate limit errors
+        let max_rate_limit_retries = config
+            .get_param("GCP_MAX_RATE_LIMIT_RETRIES")
+            .ok()
+            .and_then(|v: String| v.parse::<usize>().ok())
+            .or_else(|| {
+                // Fall back to generic GCP_MAX_RETRIES if specific one isn't set
+                config
                    .get_param("GCP_MAX_RETRIES")
                    .ok()
                    .and_then(|v: String| v.parse::<usize>().ok())
+            })
+            .unwrap_or(DEFAULT_MAX_RETRIES);
+
+        // Load max retries for 529 API overloaded errors
+        let max_overloaded_retries = config
+            .get_param("GCP_MAX_OVERLOADED_RETRIES")
+            .ok()
+            .and_then(|v: String| v.parse::<usize>().ok())
+            .or_else(|| {
+                // Fall back to generic GCP_MAX_RETRIES if specific one isn't set
+                config
+                    .get_param("GCP_MAX_RETRIES")
+                    .ok()
+                    .and_then(|v: String| v.parse::<usize>().ok())
+            })
             .unwrap_or(DEFAULT_MAX_RETRIES);

         let initial_interval_ms = config
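The two lookup chains above differ only in their primary key; a small helper would remove the duplication. A sketch of that refactor (not part of this commit; `parse_retry_param` is hypothetical, while `get_param` is the crate's own config API as used above):

```rust
// Hypothetical helper: read a config param and parse it as usize.
fn parse_retry_param(config: &crate::config::Config, key: &str) -> Option<usize> {
    config
        .get_param(key)
        .ok()
        .and_then(|v: String| v.parse::<usize>().ok())
}

// Usage, preserving the diff's fallback order and default:
// let max_rate_limit_retries = parse_retry_param(config, "GCP_MAX_RATE_LIMIT_RETRIES")
//     .or_else(|| parse_retry_param(config, "GCP_MAX_RETRIES"))
//     .unwrap_or(DEFAULT_MAX_RETRIES);
```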
@@ -197,7 +239,8 @@ impl GcpVertexAIProvider {
             .unwrap_or(DEFAULT_MAX_RETRY_INTERVAL_MS);

         RetryConfig {
-            max_retries,
+            max_rate_limit_retries,
+            max_overloaded_retries,
             initial_interval_ms,
             backoff_multiplier,
             max_interval_ms,
@@ -269,7 +312,7 @@ impl GcpVertexAIProvider {
     }

     /// Makes an authenticated POST request to the Vertex AI API at a specific location.
-    /// Includes retry logic for 429 Too Many Requests errors.
+    /// Includes retry logic for 429 (Too Many Requests) and 529 (API Overloaded) errors.
     ///
     /// # Arguments
     /// * `payload` - The request payload to send
@@ -285,21 +328,12 @@ impl GcpVertexAIProvider {
             .build_request_url(context.provider(), location)
             .map_err(|e| ProviderError::RequestFailed(e.to_string()))?;

-        // Initialize retry counter
-        let mut attempts = 0;
+        // Initialize separate counters for different error types
+        let mut rate_limit_attempts = 0;
+        let mut overloaded_attempts = 0;
         let mut last_error = None;

         loop {
-            // Check if we've exceeded max retries
-            if attempts > 0 && attempts > self.retry_config.max_retries {
-                let error_msg = format!(
-                    "Exceeded maximum retry attempts ({}) for rate limiting (429)",
-                    self.retry_config.max_retries
-                );
-                tracing::error!("{}", error_msg);
-                return Err(last_error.unwrap_or(ProviderError::RateLimitExceeded(error_msg)));
-            }
-
             // Get a fresh auth token for each attempt
             let auth_header = self
                 .get_auth_header()
@@ -318,8 +352,91 @@ impl GcpVertexAIProvider {

             let status = response.status();

-            // If not a 429, process normally
-            if status != StatusCode::TOO_MANY_REQUESTS {
+            // Handle 429 Too Many Requests and 529 API Overloaded errors
+            match status {
+                status if status == StatusCode::TOO_MANY_REQUESTS => {
+                    rate_limit_attempts += 1;
+
+                    if rate_limit_attempts > self.retry_config.max_rate_limit_retries {
+                        let error_msg = format!(
+                            "Exceeded maximum retry attempts ({}) for rate limiting (429) errors",
+                            self.retry_config.max_rate_limit_retries
+                        );
+                        tracing::error!("{}", error_msg);
+                        return Err(
+                            last_error.unwrap_or(ProviderError::RateLimitExceeded(error_msg))
+                        );
+                    }
+
+                    // Try to parse response for more detailed error info
+                    let cite_gcp_vertex_429 =
+                        "See https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429";
+                    let response_text = response.text().await.unwrap_or_default();
+
+                    let error_message =
+                        if response_text.contains("Exceeded the Provisioned Throughput") {
+                            // Handle 429 rate limit due to throughput limits
+                            format!("Exceeded the Provisioned Throughput: {cite_gcp_vertex_429}")
+                        } else {
+                            // Handle generic 429 rate limit
+                            format!("Pay-as-you-go resource exhausted: {cite_gcp_vertex_429}")
+                        };
+
+                    tracing::warn!(
+                        "Rate limit exceeded error (429) (attempt {}/{}): {}. Retrying after backoff...",
+                        rate_limit_attempts,
+                        self.retry_config.max_rate_limit_retries,
+                        error_message
+                    );
+
+                    // Store the error in case we need to return it after max retries
+                    last_error = Some(ProviderError::RateLimitExceeded(error_message));
+
+                    // Calculate and apply the backoff delay
+                    let delay = self.retry_config.delay_for_attempt(rate_limit_attempts);
+                    tracing::info!("Backing off for {:?} before retry (rate limit 429)", delay);
+                    sleep(delay).await;
+                }
+                status if status == *STATUS_API_OVERLOADED => {
+                    overloaded_attempts += 1;
+
+                    if overloaded_attempts > self.retry_config.max_overloaded_retries {
+                        let error_msg = format!(
+                            "Exceeded maximum retry attempts ({}) for API overloaded (529) errors",
+                            self.retry_config.max_overloaded_retries
+                        );
+                        tracing::error!("{}", error_msg);
+                        return Err(
+                            last_error.unwrap_or(ProviderError::RateLimitExceeded(error_msg))
+                        );
+                    }
+
+                    // Handle 529 Overloaded error (https://docs.anthropic.com/en/api/errors)
+                    let error_message =
+                        "Vertex AI Provider API is temporarily overloaded. This is similar to a rate limit \
+                         error but indicates backend processing capacity issues."
+                            .to_string();
+
+                    tracing::warn!(
+                        "API overloaded error (529) (attempt {}/{}): {}. Retrying after backoff...",
+                        overloaded_attempts,
+                        self.retry_config.max_overloaded_retries,
+                        error_message
+                    );
+
+                    // Store the error in case we need to return it after max retries
+                    last_error = Some(ProviderError::RateLimitExceeded(error_message));
+
+                    // Calculate and apply the backoff delay
+                    let delay = self.retry_config.delay_for_attempt(overloaded_attempts);
+                    tracing::info!(
+                        "Backing off for {:?} before retry (API overloaded 529)",
+                        delay
+                    );
+                    sleep(delay).await;
+                }
+                // For any other status codes, process normally
+                _ => {
                 let response_json = response.json::<Value>().await.map_err(|e| {
                     ProviderError::RequestFailed(format!("Failed to parse response: {e}"))
                 })?;
@@ -344,34 +461,7 @@ impl GcpVertexAIProvider {
                     }
                 };
             }
+            }
-
-            // Handle 429 Too Many Requests
-            attempts += 1;
-
-            // Try to parse response for more detailed error info
-            let cite_gcp_vertex_429 =
-                "See https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429";
-            let response_text = response.text().await.unwrap_or_default();
-            let quota_error = if response_text.contains("Exceeded the Provisioned Throughput") {
-                format!("Exceeded the Provisioned Throughput: {cite_gcp_vertex_429}.")
-            } else {
-                format!("Pay-as-you-go resource exhausted: {cite_gcp_vertex_429}.")
-            };
-
-            tracing::warn!(
-                "Rate limit exceeded (attempt {}/{}): {}. Retrying after backoff...",
-                attempts,
-                self.retry_config.max_retries,
-                quota_error
-            );
-
-            // Store the error in case we need to return it after max retries
-            last_error = Some(ProviderError::RateLimitExceeded(quota_error));
-
-            // Calculate and apply the backoff delay
-            let delay = self.retry_config.delay_for_attempt(attempts);
-            tracing::info!("Backing off for {:?} before retry", delay);
-            sleep(delay).await;
         }
     }

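Taken together, the two hunks above replace the single `attempts` counter and the inline 429 block with a `match` that gives each retryable status its own counter and budget. Distilled control flow, as a standalone sketch (`fetch_status` stands in for the real request; everything else is simplified):

```rust
use std::time::Duration;

// Simplified shape of the new retry loop: 429 and 529 each get their own
// attempt counter and cap; any other status exits to normal processing.
async fn post_with_retries(
    max_rate_limit_retries: usize,
    max_overloaded_retries: usize,
    mut fetch_status: impl FnMut() -> u16,
) -> Result<(), String> {
    let mut rate_limit_attempts = 0;
    let mut overloaded_attempts = 0;
    loop {
        match fetch_status() {
            429 => {
                rate_limit_attempts += 1;
                if rate_limit_attempts > max_rate_limit_retries {
                    return Err("exceeded 429 retry budget".to_string());
                }
            }
            529 => {
                overloaded_attempts += 1;
                if overloaded_attempts > max_overloaded_retries {
                    return Err("exceeded 529 retry budget".to_string());
                }
            }
            // Any other status is handed to normal response processing.
            _ => return Ok(()),
        }
        // The real code sleeps for delay_for_attempt(n); a fixed placeholder
        // keeps this sketch self-contained.
        tokio::time::sleep(Duration::from_millis(100)).await;
    }
}
```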
@@ -431,12 +521,15 @@ impl Provider for GcpVertexAIProvider {
             GcpVertexAIModel::Claude(ClaudeVersion::Sonnet37),
             GcpVertexAIModel::Claude(ClaudeVersion::Haiku35),
             GcpVertexAIModel::Claude(ClaudeVersion::Sonnet4),
+            GcpVertexAIModel::Claude(ClaudeVersion::Opus4),
             GcpVertexAIModel::Gemini(GeminiVersion::Pro15),
             GcpVertexAIModel::Gemini(GeminiVersion::Flash20),
             GcpVertexAIModel::Gemini(GeminiVersion::Pro20Exp),
             GcpVertexAIModel::Gemini(GeminiVersion::Pro25Exp),
             GcpVertexAIModel::Gemini(GeminiVersion::Flash25Preview),
             GcpVertexAIModel::Gemini(GeminiVersion::Pro25Preview),
+            GcpVertexAIModel::Gemini(GeminiVersion::Flash25),
+            GcpVertexAIModel::Gemini(GeminiVersion::Pro25),
         ]
         .iter()
         .map(|model| model.to_string())
@@ -448,7 +541,7 @@
             "gcp_vertex_ai",
             "GCP Vertex AI",
             "Access variety of AI models such as Claude, Gemini through Vertex AI",
-            GcpVertexAIModel::Gemini(GeminiVersion::Flash20)
+            GcpVertexAIModel::Gemini(GeminiVersion::Flash25)
                 .to_string()
                 .as_str(),
             known_models,
@@ -456,6 +549,18 @@
             vec![
                 ConfigKey::new("GCP_PROJECT_ID", true, false, None),
                 ConfigKey::new("GCP_LOCATION", true, false, Some(Iowa.to_string().as_str())),
+                ConfigKey::new(
+                    "GCP_MAX_RATE_LIMIT_RETRIES",
+                    false,
+                    false,
+                    Some(&DEFAULT_MAX_RETRIES.to_string()),
+                ),
+                ConfigKey::new(
+                    "GCP_MAX_OVERLOADED_RETRIES",
+                    false,
+                    false,
+                    Some(&DEFAULT_MAX_RETRIES.to_string()),
+                ),
                 ConfigKey::new(
                     "GCP_MAX_RETRIES",
                     false,
@@ -525,11 +630,13 @@
 #[cfg(test)]
 mod tests {
     use super::*;
+    use reqwest::StatusCode;

     #[test]
     fn test_retry_config_delay_calculation() {
         let config = RetryConfig {
-            max_retries: 5,
+            max_rate_limit_retries: 5,
+            max_overloaded_retries: 5,
             initial_interval_ms: 1000,
             backoff_multiplier: 2.0,
             max_interval_ms: 32000,
@@ -552,6 +659,44 @@ mod tests {
         assert!(delay10.as_millis() <= 38400); // max_interval_ms * 1.2 (max jitter)
     }

+    #[test]
+    fn test_max_retries_for_status() {
+        let config = RetryConfig {
+            max_rate_limit_retries: 5,
+            max_overloaded_retries: 10,
+            initial_interval_ms: 1000,
+            backoff_multiplier: 2.0,
+            max_interval_ms: 32000,
+        };
+
+        // Check that we get the right max retries for each error type
+        assert_eq!(
+            config.max_retries_for_status(StatusCode::TOO_MANY_REQUESTS),
+            5
+        );
+        assert_eq!(config.max_retries_for_status(*STATUS_API_OVERLOADED), 10);
+
+        // For any other status code, we should get the rate limit retries
+        assert_eq!(config.max_retries_for_status(StatusCode::BAD_REQUEST), 5);
+    }
+
+    #[test]
+    fn test_status_overloaded_code() {
+        // Test that we correctly handle the 529 status code
+
+        // Verify the custom status code is created correctly
+        assert_eq!(STATUS_API_OVERLOADED.as_u16(), 529);
+
+        // This is not a standard HTTP status code, so it's classified as server error
+        assert!(STATUS_API_OVERLOADED.is_server_error());
+
+        // Should be different from TOO_MANY_REQUESTS (429)
+        assert_ne!(*STATUS_API_OVERLOADED, StatusCode::TOO_MANY_REQUESTS);
+
+        // Should be different from SERVICE_UNAVAILABLE (503)
+        assert_ne!(*STATUS_API_OVERLOADED, StatusCode::SERVICE_UNAVAILABLE);
+    }
+
     #[test]
     fn test_model_provider_conversion() {
         assert_eq!(ModelProvider::Anthropic.as_str(), "anthropic");
@@ -596,7 +741,8 @@ mod tests {
             .collect();
         assert!(model_names.contains(&"claude-3-5-sonnet-v2@20241022".to_string()));
         assert!(model_names.contains(&"gemini-1.5-pro-002".to_string()));
-        // Should contain the original 2 config keys plus 4 new retry-related ones
-        assert_eq!(metadata.config_keys.len(), 6);
+        assert!(model_names.contains(&"gemini-2.5-pro".to_string()));
+        // Should contain the original 2 config keys plus 6 new retry-related ones
+        assert_eq!(metadata.config_keys.len(), 8);
     }
 }
@@ -25,7 +25,8 @@ Goose relies heavily on tool calling capabilities and currently works best with
 | [Azure OpenAI](https://learn.microsoft.com/en-us/azure/ai-services/openai/) | Access Azure-hosted OpenAI models, including GPT-4 and GPT-3.5. Supports both API key and Azure credential chain authentication. | `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_DEPLOYMENT_NAME`, `AZURE_OPENAI_API_KEY` (optional) |
 | [Databricks](https://www.databricks.com/) | Unified data analytics and AI platform for building and deploying models. | `DATABRICKS_HOST`, `DATABRICKS_TOKEN` |
 | [Gemini](https://ai.google.dev/gemini-api/docs) | Advanced LLMs by Google with multimodal capabilities (text, images). | `GOOGLE_API_KEY` |
-| [GCP Vertex AI](https://cloud.google.com/vertex-ai) | Google Cloud's Vertex AI platform, supporting Gemini and Claude models. **Credentials must be [configured in advance](https://cloud.google.com/vertex-ai/docs/authentication).** | `GCP_PROJECT_ID`, `GCP_LOCATION` and optional `GCP_MAX_RETRIES` (6), `GCP_INITIAL_RETRY_INTERVAL_MS` (5000), `GCP_BACKOFF_MULTIPLIER` (2.0), `GCP_MAX_RETRY_INTERVAL_MS` (320_000). |
+| [GCP Vertex AI](https://cloud.google.com/vertex-ai) | Google Cloud's Vertex AI platform, supporting Gemini and Claude models. **Credentials must be [configured in advance](https://cloud.google.com/vertex-ai/docs/authentication).** | `GCP_PROJECT_ID`, `GCP_LOCATION` and optionally `GCP_MAX_RATE_LIMIT_RETRIES` (6), `GCP_MAX_OVERLOADED_RETRIES` (6), `GCP_INITIAL_RETRY_INTERVAL_MS` (5000), `GCP_BACKOFF_MULTIPLIER` (2.0), `GCP_MAX_RETRY_INTERVAL_MS` (320_000). |
+| [GitHub Copilot](https://docs.github.com/en/copilot/using-github-copilot/ai-models) | Access to GitHub Copilot's chat models, including gpt-4o, o1, o3-mini, and Claude models. Uses the device code authentication flow for secure access. | Uses GitHub device code authentication flow (no API key needed) |
 | [Groq](https://groq.com/) | High-performance inference hardware and tools for LLMs. | `GROQ_API_KEY` |
 | [Ollama](https://ollama.com/) | Local model runner supporting Qwen, Llama, DeepSeek, and other open-source models. **Because this provider runs locally, you must first [download and run a model](/docs/getting-started/providers#local-llms).** | `OLLAMA_HOST` |
 | [Ramalama](https://ramalama.ai/) | Local model runner using native [OCI](https://opencontainers.org/) container runtimes, [CNCF](https://www.cncf.io/) tools, and supporting models as OCI artifacts. The Ramalama API is a compatible alternative to Ollama's and can be used with the Goose Ollama provider. Supports Qwen, Llama, DeepSeek, and other open-source models. **Because this provider runs locally, you must first [download and run a model](/docs/getting-started/providers#local-llms).** | `OLLAMA_HOST` |