Rate limiting and RAG

2025-09-21 06:49:55 +02:00
parent 0c20de4ca1
commit f58a76ac59
7 changed files with 410 additions and 130 deletions


@@ -65,7 +65,16 @@ class LLMServiceConfig(BaseModel):
     # Provider configurations
     providers: Dict[str, ProviderConfig] = Field(default_factory=dict, description="Provider configurations")
+    # Token rate limiting (organization-wide)
+    token_limits_per_minute: Dict[str, int] = Field(
+        default_factory=lambda: {
+            "prompt_tokens": 20000,  # PrivateMode Standard tier
+            "completion_tokens": 10000  # PrivateMode Standard tier
+        },
+        description="Token rate limits per minute (organization-wide)"
+    )
     # Model routing (model_name -> provider_name)
     model_routing: Dict[str, str] = Field(default_factory=dict, description="Model to provider routing")
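
The new token_limits_per_minute field only declares the per-minute budget; enforcement happens elsewhere. Below is a minimal sliding-window sketch of how an organization-wide limiter might consume these values. The TokenRateLimiter class and its method names are illustrative assumptions, not code from this commit:

import time
from collections import deque

class TokenRateLimiter:
    """Hypothetical sketch: tracks token usage over a rolling 60-second window."""

    def __init__(self, limits_per_minute: dict[str, int]):
        self.limits = limits_per_minute
        # One deque of (timestamp, token_count) entries per token type.
        self.usage: dict[str, deque] = {kind: deque() for kind in limits_per_minute}

    def _prune(self, kind: str, now: float) -> None:
        # Discard entries that have aged out of the one-minute window.
        window = self.usage[kind]
        while window and now - window[0][0] > 60.0:
            window.popleft()

    def would_exceed(self, kind: str, tokens: int) -> bool:
        now = time.monotonic()
        self._prune(kind, now)
        used = sum(count for _, count in self.usage[kind])
        return used + tokens > self.limits[kind]

    def record(self, kind: str, tokens: int) -> None:
        self.usage[kind].append((time.monotonic(), tokens))

# Usage with the defaults introduced in this hunk:
limiter = TokenRateLimiter({"prompt_tokens": 20000, "completion_tokens": 10000})
if not limiter.would_exceed("prompt_tokens", 1500):
    limiter.record("prompt_tokens", 1500)

A sliding window is only one plausible design here; a token bucket would smooth bursts differently but enforce the same average rate.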
@@ -91,8 +100,8 @@ def create_default_config() -> LLMServiceConfig:
         supported_models=[],  # Will be populated dynamically from proxy
         capabilities=["chat", "embeddings", "tee"],
         priority=1,
-        max_requests_per_minute=100,
-        max_requests_per_hour=2000,
+        max_requests_per_minute=20,  # PrivateMode Standard tier limit: 20 req/min
+        max_requests_per_hour=1200,  # 20 req/min * 60 min
         supports_streaming=True,
         supports_function_calling=True,
         max_context_window=128000,
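
This hunk pairs a per-minute cap with a derived hourly cap (20 req/min * 60 = 1200 req/hr). A hedged sketch of checking both windows from a single timestamp log follows; RequestRateLimiter is hypothetical and only mirrors the limits set above:

import time
from collections import deque

class RequestRateLimiter:
    """Hypothetical sketch: enforces per-minute and per-hour request caps."""

    def __init__(self, per_minute: int, per_hour: int):
        self.per_minute = per_minute
        self.per_hour = per_hour
        self.timestamps: deque[float] = deque()

    def allow(self) -> bool:
        now = time.monotonic()
        # Drop requests older than one hour (the larger window).
        while self.timestamps and now - self.timestamps[0] > 3600.0:
            self.timestamps.popleft()
        last_minute = sum(1 for t in self.timestamps if now - t <= 60.0)
        if last_minute >= self.per_minute or len(self.timestamps) >= self.per_hour:
            return False
        self.timestamps.append(now)
        return True

# Matches the tier limits set in this hunk: 20 req/min, 1200 req/hr.
limiter = RequestRateLimiter(per_minute=20, per_hour=1200)
assert limiter.allow()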