# Mirror of https://github.com/aljazceru/enclava.git
# Synced 2025-12-17 07:24:34 +01:00
# 308 lines, 12 KiB, Python
"""
Metrics and monitoring service
"""

import asyncio
import time
from collections import defaultdict, deque
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List

from app.core.config import settings
from app.core.logging import log_module_event, log_security_event


@dataclass
class MetricData:
    """Individual metric data point.

    One instance is one sample of a named metric series: when it was taken,
    its numeric value, and optional string labels describing the sample.
    """

    # Moment the sample was recorded.
    timestamp: datetime
    # Numeric value of the sample.
    value: float
    # Free-form key/value labels; each instance gets its own empty dict.
    labels: Dict[str, str] = field(default_factory=dict)


@dataclass
class RequestMetrics:
    """Request-related metrics.

    Plain container of running totals and per-key counters. Every field has
    a zero/empty default, so ``RequestMetrics()`` is a blank slate.
    """

    # Number of requests counted in total.
    total_requests: int = 0
    # Number of requests counted as successful.
    successful_requests: int = 0
    # Number of requests counted as failed.
    failed_requests: int = 0
    # Average response time (same unit as the timestamps it is derived from).
    average_response_time: float = 0.0
    # Sum of tokens used across requests.
    total_tokens_used: int = 0
    # Sum of cost across requests.
    total_cost: float = 0.0
    # Request counters keyed by model name, user id and endpoint respectively;
    # each instance gets its own dicts.
    requests_by_model: Dict[str, int] = field(default_factory=dict)
    requests_by_user: Dict[str, int] = field(default_factory=dict)
    requests_by_endpoint: Dict[str, int] = field(default_factory=dict)


@dataclass
class SystemMetrics:
    """System-related metrics.

    Plain container with zero/empty defaults for process-level figures.
    """

    # Seconds the service has been up.
    uptime: float = 0.0
    # NOTE(review): memory_usage and cpu_usage are declared but nothing in
    # this file ever writes them, so they stay at 0.0 — confirm whether a
    # collector elsewhere is expected to fill them in.
    memory_usage: float = 0.0
    cpu_usage: float = 0.0
    # Number of requests currently in flight.
    active_connections: int = 0
    # Health flag per module name; each instance gets its own dict.
    module_status: Dict[str, bool] = field(default_factory=dict)


class MetricsService:
    """Service for collecting and managing metrics.

    Keeps running request/system totals (``request_metrics``,
    ``system_metrics``), a bounded per-metric history of ``MetricData``
    samples (``metric_history``), a sliding window of recent response times
    and a map of in-flight requests (``active_requests``: request id ->
    start time from ``time.time()``).

    Two background loops are started by :meth:`initialize`: one samples
    system metrics periodically, the other prunes history older than the
    retention window.
    """

    # Maximum number of samples kept per metric series.
    HISTORY_SIZE = 1000
    # Number of most recent response times used for the running average.
    RESPONSE_TIME_WINDOW = 100
    # Seconds between system-metric samples.
    COLLECT_INTERVAL_SECONDS = 60
    # Seconds between history clean-up passes.
    CLEANUP_INTERVAL_SECONDS = 3600
    # Samples older than this many hours are dropped by the clean-up pass.
    RETENTION_HOURS = 24
    # Error types that are additionally reported as security events.
    _SECURITY_ERROR_TYPES = frozenset(
        {"authentication_failed", "authorization_failed", "invalid_api_key"}
    )

    def __init__(self):
        self.request_metrics = RequestMetrics()
        self.system_metrics = SystemMetrics()
        self.metric_history: Dict[str, deque] = defaultdict(
            lambda: deque(maxlen=self.HISTORY_SIZE)
        )
        self.start_time = time.time()
        # Keep only the most recent response times for the running average.
        self.response_times: deque = deque(maxlen=self.RESPONSE_TIME_WINDOW)
        # request_id -> start time of requests that have not ended yet.
        self.active_requests: Dict[str, float] = {}
        # Strong references to the background tasks: the event loop only
        # holds weak references, so an unreferenced task may be collected
        # before it finishes.
        self._background_tasks: Dict[str, asyncio.Task] = {}

    async def initialize(self):
        """Initialize the metrics service.

        Resets the uptime origin and starts the background loops. Must be
        awaited inside a running event loop. Calling it again does not start
        duplicate loops; a loop is only (re)started if it is not running.
        """
        log_module_event("metrics_service", "initializing", {})
        self.start_time = time.time()

        # Start background tasks (retaining a reference to each).
        loops = (
            ("collect_system_metrics", self._collect_system_metrics),
            ("cleanup_old_metrics", self._cleanup_old_metrics),
        )
        for task_name, loop_factory in loops:
            running = self._background_tasks.get(task_name)
            if running is None or running.done():
                self._background_tasks[task_name] = asyncio.create_task(loop_factory())

        log_module_event("metrics_service", "initialized", {"success": True})

    def _refresh_system_metrics(self) -> None:
        """Recompute uptime and the in-flight request count from live state."""
        self.system_metrics.uptime = time.time() - self.start_time
        self.system_metrics.active_connections = len(self.active_requests)

    async def _collect_system_metrics(self):
        """Collect system metrics periodically (runs until cancelled)."""
        while True:
            try:
                self._refresh_system_metrics()

                # Store historical data
                self._store_metric("uptime", self.system_metrics.uptime)
                self._store_metric(
                    "active_connections", self.system_metrics.active_connections
                )
            except Exception as e:
                # Loop boundary: report the failure and keep the loop alive.
                log_module_event("metrics_service", "system_metrics_error", {"error": str(e)})

            await asyncio.sleep(self.COLLECT_INTERVAL_SECONDS)

    async def _cleanup_old_metrics(self):
        """Clean up old metric data periodically (runs until cancelled)."""
        while True:
            try:
                cutoff_time = datetime.now() - timedelta(hours=self.RETENTION_HOURS)

                for series in self.metric_history.values():
                    # Samples are appended in time order, so the oldest is
                    # always at the left end.
                    while series and series[0].timestamp < cutoff_time:
                        series.popleft()
            except Exception as e:
                # Loop boundary: report the failure and keep the loop alive.
                log_module_event("metrics_service", "cleanup_error", {"error": str(e)})

            await asyncio.sleep(self.CLEANUP_INTERVAL_SECONDS)

    def _store_metric(self, name: str, value: float, labels: Optional[Dict[str, str]] = None):
        """Store a metric data point.

        Args:
            name: Metric series to append to (created on first use).
            value: Sample value.
            labels: Optional labels for the sample; defaults to an empty dict.
        """
        if labels is None:
            labels = {}

        self.metric_history[name].append(
            MetricData(timestamp=datetime.now(), value=value, labels=labels)
        )

    def start_request(self, request_id: str, endpoint: str, user_id: Optional[str] = None):
        """Start tracking a request.

        Records the start time under ``request_id`` and bumps the total,
        per-endpoint and (when ``user_id`` is truthy) per-user counters.
        """
        self.active_requests[request_id] = time.time()

        counters = self.request_metrics
        counters.total_requests += 1
        counters.requests_by_endpoint[endpoint] = (
            counters.requests_by_endpoint.get(endpoint, 0) + 1
        )
        if user_id:
            counters.requests_by_user[user_id] = (
                counters.requests_by_user.get(user_id, 0) + 1
            )

        # Store metric
        self._store_metric("requests_total", counters.total_requests)
        self._store_metric("requests_by_endpoint", 1, {"endpoint": endpoint})
        if user_id:
            self._store_metric("requests_by_user", 1, {"user_id": user_id})

    def end_request(self, request_id: str, success: bool = True,
                    model: Optional[str] = None, tokens_used: int = 0,
                    cost: float = 0.0):
        """End tracking a request.

        Does nothing if ``request_id`` is not currently tracked. Otherwise
        records the response time, updates success/failure, token, cost and
        per-model totals, and stops tracking the request.
        """
        # Single lookup that also removes the entry, so the request cannot
        # be left behind in active_requests.
        started_at = self.active_requests.pop(request_id, None)
        if started_at is None:
            return

        response_time = time.time() - started_at
        self.response_times.append(response_time)

        totals = self.request_metrics
        if success:
            totals.successful_requests += 1
        else:
            totals.failed_requests += 1

        # Average over the sliding window (non-empty: we just appended).
        totals.average_response_time = sum(self.response_times) / len(self.response_times)

        totals.total_tokens_used += tokens_used
        totals.total_cost += cost

        if model:
            totals.requests_by_model[model] = totals.requests_by_model.get(model, 0) + 1

        # Store metrics
        self._store_metric("response_time", response_time)
        self._store_metric("tokens_used", tokens_used)
        self._store_metric("cost", cost)
        if model:
            self._store_metric("requests_by_model", 1, {"model": model})

    def record_error(self, error_type: str, error_message: str,
                     endpoint: Optional[str] = None, user_id: Optional[str] = None):
        """Record an error occurrence.

        Stores an ``errors_total`` sample labelled with the error type and,
        when given, the endpoint and user id. Authentication/authorization
        error types are also reported through ``log_security_event``.
        """
        labels = {"error_type": error_type}
        if endpoint:
            labels["endpoint"] = endpoint
        if user_id:
            labels["user_id"] = user_id

        self._store_metric("errors_total", 1, labels)

        # Log security events for authentication/authorization errors
        if error_type in self._SECURITY_ERROR_TYPES:
            log_security_event(error_type, user_id or "anonymous", {
                "error": error_message,
                "endpoint": endpoint
            })

    def record_module_status(self, module_name: str, is_healthy: bool):
        """Record module health status (also stored as a 1/0 sample)."""
        self.system_metrics.module_status[module_name] = is_healthy
        self._store_metric("module_health", 1 if is_healthy else 0, {"module": module_name})

    def get_current_metrics(self) -> Dict[str, Any]:
        """Get current metrics snapshot.

        Returns:
            A dict with ``request_metrics`` and ``system_metrics`` sections.
            Counter dicts are copies. ``success_rate`` is 0 when no request
            has been counted yet.
        """
        # Refresh first so the snapshot does not depend on when the
        # background collector last ran (or whether it is running at all).
        self._refresh_system_metrics()

        req = self.request_metrics
        sysm = self.system_metrics
        success_rate = (
            req.successful_requests / req.total_requests
            if req.total_requests > 0 else 0
        )
        return {
            "request_metrics": {
                "total_requests": req.total_requests,
                "successful_requests": req.successful_requests,
                "failed_requests": req.failed_requests,
                "success_rate": success_rate,
                "average_response_time": req.average_response_time,
                "total_tokens_used": req.total_tokens_used,
                "total_cost": req.total_cost,
                "requests_by_model": dict(req.requests_by_model),
                "requests_by_user": dict(req.requests_by_user),
                "requests_by_endpoint": dict(req.requests_by_endpoint)
            },
            "system_metrics": {
                "uptime": sysm.uptime,
                "active_connections": sysm.active_connections,
                "module_status": dict(sysm.module_status)
            }
        }

    def get_metrics_history(self, metric_name: str,
                            hours: int = 1) -> List[Dict[str, Any]]:
        """Get historical metrics data.

        Args:
            metric_name: Series to read.
            hours: Only samples newer than this many hours are returned.

        Returns:
            A list of ``{"timestamp", "value", "labels"}`` dicts (timestamp
            in ISO format); empty if the series does not exist.
        """
        # Membership test first: indexing the defaultdict would create an
        # empty series for every unknown name queried.
        if metric_name not in self.metric_history:
            return []

        cutoff_time = datetime.now() - timedelta(hours=hours)

        return [
            {
                "timestamp": sample.timestamp.isoformat(),
                "value": sample.value,
                # Copy so callers cannot mutate the stored sample.
                "labels": dict(sample.labels)
            }
            for sample in self.metric_history[metric_name]
            if sample.timestamp > cutoff_time
        ]

    def get_top_metrics(self, metric_type: str, limit: int = 10) -> Dict[str, Any]:
        """Get top metrics by type.

        Args:
            metric_type: ``"models"``, ``"users"`` or ``"endpoints"``.
            limit: Maximum number of entries to return.

        Returns:
            The highest request counters of that type, ordered from highest
            to lowest count; an empty dict for an unknown ``metric_type``.
        """
        counters_by_type = {
            "models": self.request_metrics.requests_by_model,
            "users": self.request_metrics.requests_by_user,
            "endpoints": self.request_metrics.requests_by_endpoint,
        }
        counters = counters_by_type.get(metric_type)
        if counters is None:
            return {}

        ranked = sorted(counters.items(), key=lambda item: item[1], reverse=True)
        return dict(ranked[:limit])

    def get_health_check(self) -> Dict[str, Any]:
        """Get health check information.

        Returns:
            A dict with a fixed ``"healthy"`` status, current uptime and
            in-flight count, request totals, a copy of the module status map
            and an ISO timestamp. ``success_rate`` is 1.0 when no request
            has been counted yet.
        """
        self._refresh_system_metrics()

        req = self.request_metrics
        success_rate = (
            req.successful_requests / req.total_requests
            if req.total_requests > 0 else 1.0
        )
        return {
            "status": "healthy",
            "uptime": self.system_metrics.uptime,
            "active_connections": self.system_metrics.active_connections,
            "total_requests": req.total_requests,
            "success_rate": success_rate,
            # Copy, consistent with get_current_metrics().
            "modules": dict(self.system_metrics.module_status),
            "timestamp": datetime.now().isoformat()
        }

    async def reset_metrics(self):
        """Reset all metrics (for testing purposes).

        Clears totals, history, response times and in-flight requests and
        restarts the uptime origin. Background loops are left running.
        """
        self.request_metrics = RequestMetrics()
        self.system_metrics = SystemMetrics()
        self.metric_history.clear()
        self.response_times.clear()
        self.active_requests.clear()
        self.start_time = time.time()

        log_module_event("metrics_service", "metrics_reset", {"success": True})


# Global metrics service instance
metrics_service = MetricsService()


def setup_metrics(app):
    """Setup metrics service with FastAPI app.

    Stores the global ``metrics_service`` on ``app.state`` and schedules its
    ``initialize()`` coroutine on the running event loop.

    Args:
        app: Application object exposing a ``state`` attribute.

    Raises:
        RuntimeError: If no event loop is running in the current thread
            (for example when called at import time rather than from
            inside the running application).
    """
    # Store metrics service in app state
    app.state.metrics_service = metrics_service

    # Look the loop up *before* creating the coroutine: if no loop is
    # running this raises RuntimeError without leaving an un-awaited
    # coroutine object behind.
    loop = asyncio.get_running_loop()

    # Initialize metrics service. The task is kept on app.state because the
    # event loop only holds a weak reference to it.
    app.state.metrics_init_task = loop.create_task(metrics_service.initialize())