Files
enclava/backend/app/services/metrics.py
2025-08-19 09:50:15 +02:00

308 lines
12 KiB
Python

"""
Metrics and monitoring service
"""
import time
from typing import Dict, Any, Optional, List
from dataclasses import dataclass, field
from datetime import datetime, timedelta
import asyncio
from collections import defaultdict, deque
from app.core.config import settings
from app.core.logging import log_module_event, log_security_event
@dataclass
class MetricData:
"""Individual metric data point"""
timestamp: datetime
value: float
labels: Dict[str, str] = field(default_factory=dict)
@dataclass
class RequestMetrics:
"""Request-related metrics"""
total_requests: int = 0
successful_requests: int = 0
failed_requests: int = 0
average_response_time: float = 0.0
total_tokens_used: int = 0
total_cost: float = 0.0
requests_by_model: Dict[str, int] = field(default_factory=dict)
requests_by_user: Dict[str, int] = field(default_factory=dict)
requests_by_endpoint: Dict[str, int] = field(default_factory=dict)
@dataclass
class SystemMetrics:
"""System-related metrics"""
uptime: float = 0.0
memory_usage: float = 0.0
cpu_usage: float = 0.0
active_connections: int = 0
module_status: Dict[str, bool] = field(default_factory=dict)
class MetricsService:
"""Service for collecting and managing metrics"""
def __init__(self):
self.request_metrics = RequestMetrics()
self.system_metrics = SystemMetrics()
self.metric_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000))
self.start_time = time.time()
self.response_times: deque = deque(maxlen=100) # Keep last 100 response times
self.active_requests: Dict[str, float] = {} # Track active requests
async def initialize(self):
"""Initialize the metrics service"""
log_module_event("metrics_service", "initializing", {})
self.start_time = time.time()
# Start background tasks
asyncio.create_task(self._collect_system_metrics())
asyncio.create_task(self._cleanup_old_metrics())
log_module_event("metrics_service", "initialized", {"success": True})
async def _collect_system_metrics(self):
"""Collect system metrics periodically"""
while True:
try:
# Update uptime
self.system_metrics.uptime = time.time() - self.start_time
# Update active connections
self.system_metrics.active_connections = len(self.active_requests)
# Store historical data
self._store_metric("uptime", self.system_metrics.uptime)
self._store_metric("active_connections", self.system_metrics.active_connections)
await asyncio.sleep(60) # Collect every minute
except Exception as e:
log_module_event("metrics_service", "system_metrics_error", {"error": str(e)})
await asyncio.sleep(60)
async def _cleanup_old_metrics(self):
"""Clean up old metric data"""
while True:
try:
cutoff_time = datetime.now() - timedelta(hours=24)
for metric_name, metric_data in self.metric_history.items():
# Remove old data points
while metric_data and metric_data[0].timestamp < cutoff_time:
metric_data.popleft()
await asyncio.sleep(3600) # Clean up every hour
except Exception as e:
log_module_event("metrics_service", "cleanup_error", {"error": str(e)})
await asyncio.sleep(3600)
def _store_metric(self, name: str, value: float, labels: Optional[Dict[str, str]] = None):
"""Store a metric data point"""
if labels is None:
labels = {}
metric_data = MetricData(
timestamp=datetime.now(),
value=value,
labels=labels
)
self.metric_history[name].append(metric_data)
def start_request(self, request_id: str, endpoint: str, user_id: Optional[str] = None):
"""Start tracking a request"""
self.active_requests[request_id] = time.time()
# Update request metrics
self.request_metrics.total_requests += 1
# Track by endpoint
self.request_metrics.requests_by_endpoint[endpoint] = \
self.request_metrics.requests_by_endpoint.get(endpoint, 0) + 1
# Track by user
if user_id:
self.request_metrics.requests_by_user[user_id] = \
self.request_metrics.requests_by_user.get(user_id, 0) + 1
# Store metric
self._store_metric("requests_total", self.request_metrics.total_requests)
self._store_metric("requests_by_endpoint", 1, {"endpoint": endpoint})
if user_id:
self._store_metric("requests_by_user", 1, {"user_id": user_id})
def end_request(self, request_id: str, success: bool = True,
model: Optional[str] = None, tokens_used: int = 0,
cost: float = 0.0):
"""End tracking a request"""
if request_id not in self.active_requests:
return
# Calculate response time
response_time = time.time() - self.active_requests[request_id]
self.response_times.append(response_time)
# Update metrics
if success:
self.request_metrics.successful_requests += 1
else:
self.request_metrics.failed_requests += 1
# Update average response time
if self.response_times:
self.request_metrics.average_response_time = sum(self.response_times) / len(self.response_times)
# Update token and cost metrics
self.request_metrics.total_tokens_used += tokens_used
self.request_metrics.total_cost += cost
# Track by model
if model:
self.request_metrics.requests_by_model[model] = \
self.request_metrics.requests_by_model.get(model, 0) + 1
# Store metrics
self._store_metric("response_time", response_time)
self._store_metric("tokens_used", tokens_used)
self._store_metric("cost", cost)
if model:
self._store_metric("requests_by_model", 1, {"model": model})
# Clean up
del self.active_requests[request_id]
def record_error(self, error_type: str, error_message: str,
endpoint: Optional[str] = None, user_id: Optional[str] = None):
"""Record an error occurrence"""
labels = {"error_type": error_type}
if endpoint:
labels["endpoint"] = endpoint
if user_id:
labels["user_id"] = user_id
self._store_metric("errors_total", 1, labels)
# Log security events for authentication/authorization errors
if error_type in ["authentication_failed", "authorization_failed", "invalid_api_key"]:
log_security_event(error_type, user_id or "anonymous", {
"error": error_message,
"endpoint": endpoint
})
def record_module_status(self, module_name: str, is_healthy: bool):
"""Record module health status"""
self.system_metrics.module_status[module_name] = is_healthy
self._store_metric("module_health", 1 if is_healthy else 0, {"module": module_name})
def get_current_metrics(self) -> Dict[str, Any]:
"""Get current metrics snapshot"""
return {
"request_metrics": {
"total_requests": self.request_metrics.total_requests,
"successful_requests": self.request_metrics.successful_requests,
"failed_requests": self.request_metrics.failed_requests,
"success_rate": (
self.request_metrics.successful_requests / self.request_metrics.total_requests
if self.request_metrics.total_requests > 0 else 0
),
"average_response_time": self.request_metrics.average_response_time,
"total_tokens_used": self.request_metrics.total_tokens_used,
"total_cost": self.request_metrics.total_cost,
"requests_by_model": dict(self.request_metrics.requests_by_model),
"requests_by_user": dict(self.request_metrics.requests_by_user),
"requests_by_endpoint": dict(self.request_metrics.requests_by_endpoint)
},
"system_metrics": {
"uptime": self.system_metrics.uptime,
"active_connections": self.system_metrics.active_connections,
"module_status": dict(self.system_metrics.module_status)
}
}
def get_metrics_history(self, metric_name: str,
hours: int = 1) -> List[Dict[str, Any]]:
"""Get historical metrics data"""
if metric_name not in self.metric_history:
return []
cutoff_time = datetime.now() - timedelta(hours=hours)
return [
{
"timestamp": data.timestamp.isoformat(),
"value": data.value,
"labels": data.labels
}
for data in self.metric_history[metric_name]
if data.timestamp > cutoff_time
]
def get_top_metrics(self, metric_type: str, limit: int = 10) -> Dict[str, Any]:
"""Get top metrics by type"""
if metric_type == "models":
return dict(
sorted(self.request_metrics.requests_by_model.items(),
key=lambda x: x[1], reverse=True)[:limit]
)
elif metric_type == "users":
return dict(
sorted(self.request_metrics.requests_by_user.items(),
key=lambda x: x[1], reverse=True)[:limit]
)
elif metric_type == "endpoints":
return dict(
sorted(self.request_metrics.requests_by_endpoint.items(),
key=lambda x: x[1], reverse=True)[:limit]
)
else:
return {}
def get_health_check(self) -> Dict[str, Any]:
"""Get health check information"""
return {
"status": "healthy",
"uptime": self.system_metrics.uptime,
"active_connections": self.system_metrics.active_connections,
"total_requests": self.request_metrics.total_requests,
"success_rate": (
self.request_metrics.successful_requests / self.request_metrics.total_requests
if self.request_metrics.total_requests > 0 else 1.0
),
"modules": self.system_metrics.module_status,
"timestamp": datetime.now().isoformat()
}
async def reset_metrics(self):
"""Reset all metrics (for testing purposes)"""
self.request_metrics = RequestMetrics()
self.system_metrics = SystemMetrics()
self.metric_history.clear()
self.response_times.clear()
self.active_requests.clear()
self.start_time = time.time()
log_module_event("metrics_service", "metrics_reset", {"success": True})
# Global metrics service instance
metrics_service = MetricsService()
def setup_metrics(app):
"""Setup metrics service with FastAPI app"""
# Store metrics service in app state
app.state.metrics_service = metrics_service
# Initialize metrics service
import asyncio
asyncio.create_task(metrics_service.initialize())