""" Performance tests for the new LLM service. Tests response times, throughput, and resource usage. """ import pytest import asyncio import time import statistics from httpx import AsyncClient from unittest.mock import patch, AsyncMock from typing import List class TestLLMPerformance: """Performance tests for LLM service.""" @pytest.mark.asyncio async def test_chat_completion_latency(self, client: AsyncClient): """Test chat completion response latency.""" from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage # Mock fast response mock_response = ChatCompletionResponse( id="perf-test", object="chat.completion", created=int(time.time()), model="privatemode-llama-3-70b", choices=[ ChatChoice( index=0, message=ChatMessage( role="assistant", content="Performance test response." ), finish_reason="stop" ) ], usage=Usage( prompt_tokens=10, completion_tokens=5, total_tokens=15 ) ) latencies = [] with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat: mock_chat.return_value = mock_response # Measure latency over multiple requests for i in range(10): start_time = time.time() response = await client.post( "/api/v1/llm/chat/completions", json={ "model": "privatemode-llama-3-70b", "messages": [ {"role": "user", "content": f"Performance test {i}"} ] }, headers={"Authorization": "Bearer test-api-key"} ) latency = (time.time() - start_time) * 1000 # Convert to milliseconds latencies.append(latency) assert response.status_code == 200 # Analyze performance metrics avg_latency = statistics.mean(latencies) p95_latency = statistics.quantiles(latencies, n=20)[18] # 95th percentile p99_latency = statistics.quantiles(latencies, n=100)[98] # 99th percentile print(f"Average latency: {avg_latency:.2f}ms") print(f"P95 latency: {p95_latency:.2f}ms") print(f"P99 latency: {p99_latency:.2f}ms") # Performance assertions (for mocked responses, should be very fast) assert avg_latency < 100 # Less than 100ms average assert p95_latency < 200 # Less than 200ms for 95% of requests assert p99_latency < 500 # Less than 500ms for 99% of requests @pytest.mark.asyncio async def test_concurrent_throughput(self, client: AsyncClient): """Test concurrent request throughput.""" from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage mock_response = ChatCompletionResponse( id="throughput-test", object="chat.completion", created=int(time.time()), model="privatemode-llama-3-70b", choices=[ ChatChoice( index=0, message=ChatMessage( role="assistant", content="Throughput test response." ), finish_reason="stop" ) ], usage=Usage( prompt_tokens=8, completion_tokens=4, total_tokens=12 ) ) concurrent_levels = [1, 5, 10, 20] throughput_results = {} with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat: mock_chat.return_value = mock_response for concurrency in concurrent_levels: start_time = time.time() # Create concurrent requests tasks = [] for i in range(concurrency): task = client.post( "/api/v1/llm/chat/completions", json={ "model": "privatemode-llama-3-70b", "messages": [ {"role": "user", "content": f"Concurrent test {i}"} ] }, headers={"Authorization": "Bearer test-api-key"} ) tasks.append(task) # Execute all requests responses = await asyncio.gather(*tasks) elapsed_time = time.time() - start_time # Verify all requests succeeded for response in responses: assert response.status_code == 200 # Calculate throughput (requests per second) throughput = concurrency / elapsed_time throughput_results[concurrency] = throughput print(f"Concurrency {concurrency}: {throughput:.2f} req/s") # Performance assertions assert throughput_results[1] > 10 # At least 10 req/s for single requests assert throughput_results[5] > 30 # At least 30 req/s for 5 concurrent assert throughput_results[10] > 50 # At least 50 req/s for 10 concurrent @pytest.mark.asyncio async def test_embedding_performance(self, client: AsyncClient): """Test embedding generation performance.""" from app.services.llm.models import EmbeddingResponse, EmbeddingData, Usage # Create realistic embedding response embedding_vector = [0.1 * i for i in range(1024)] mock_response = EmbeddingResponse( object="list", data=[ EmbeddingData( object="embedding", embedding=embedding_vector, index=0 ) ], model="privatemode-embeddings", usage=Usage( prompt_tokens=10, total_tokens=10 ) ) latencies = [] with patch("app.services.llm.service.llm_service.create_embedding") as mock_embedding: mock_embedding.return_value = mock_response # Test different text lengths test_texts = [ "Short text", "Medium length text that contains more words and should take a bit longer to process.", "Very long text that contains many words and sentences. " * 10, # Repeat to make it longer ] for text in test_texts: start_time = time.time() response = await client.post( "/api/v1/llm/embeddings", json={ "model": "privatemode-embeddings", "input": text }, headers={"Authorization": "Bearer test-api-key"} ) latency = (time.time() - start_time) * 1000 latencies.append(latency) assert response.status_code == 200 data = response.json() assert len(data["data"][0]["embedding"]) == 1024 # Performance assertions for embeddings avg_latency = statistics.mean(latencies) print(f"Average embedding latency: {avg_latency:.2f}ms") assert avg_latency < 150 # Less than 150ms average for embeddings @pytest.mark.asyncio async def test_provider_status_performance(self, client: AsyncClient): """Test provider status endpoint performance.""" mock_status = { "privatemode": { "provider": "PrivateMode.ai", "status": "healthy", "latency_ms": 250.5, "success_rate": 0.98, "last_check": "2025-01-01T12:00:00Z", "models_available": ["privatemode-llama-3-70b", "privatemode-embeddings"] } } latencies = [] with patch("app.services.llm.service.llm_service.get_provider_status") as mock_provider: mock_provider.return_value = mock_status # Measure status endpoint performance for i in range(10): start_time = time.time() response = await client.get( "/api/v1/llm/providers/status", headers={"Authorization": "Bearer test-api-key"} ) latency = (time.time() - start_time) * 1000 latencies.append(latency) assert response.status_code == 200 avg_latency = statistics.mean(latencies) print(f"Average provider status latency: {avg_latency:.2f}ms") # Status endpoint should be very fast assert avg_latency < 50 # Less than 50ms for status checks @pytest.mark.asyncio async def test_models_endpoint_performance(self, client: AsyncClient): """Test models listing endpoint performance.""" from app.services.llm.models import Model # Create a realistic number of models mock_models = [] for i in range(20): # Simulate 20 available models mock_models.append( Model( id=f"privatemode-model-{i}", object="model", created=1234567890, owned_by="PrivateMode.ai", provider="PrivateMode.ai", capabilities=["tee", "chat"], context_window=32768 if i % 2 == 0 else 8192, supports_streaming=True, supports_function_calling=i % 3 == 0 ) ) latencies = [] with patch("app.services.llm.service.llm_service.get_models") as mock_models_call: mock_models_call.return_value = mock_models # Measure models endpoint performance for i in range(10): start_time = time.time() response = await client.get( "/api/v1/llm/models", headers={"Authorization": "Bearer test-api-key"} ) latency = (time.time() - start_time) * 1000 latencies.append(latency) assert response.status_code == 200 data = response.json() assert len(data["data"]) == 20 avg_latency = statistics.mean(latencies) print(f"Average models endpoint latency: {avg_latency:.2f}ms") # Models endpoint should be reasonably fast even with many models assert avg_latency < 100 # Less than 100ms for models listing @pytest.mark.asyncio async def test_error_handling_performance(self, client: AsyncClient): """Test that error handling doesn't significantly impact performance.""" error_latencies = [] with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat: mock_chat.side_effect = Exception("Simulated provider error") # Measure error handling performance for i in range(5): start_time = time.time() response = await client.post( "/api/v1/llm/chat/completions", json={ "model": "privatemode-llama-3-70b", "messages": [ {"role": "user", "content": f"Error test {i}"} ] }, headers={"Authorization": "Bearer test-api-key"} ) latency = (time.time() - start_time) * 1000 error_latencies.append(latency) # Should return error but quickly assert response.status_code in [500, 503] avg_error_latency = statistics.mean(error_latencies) print(f"Average error handling latency: {avg_error_latency:.2f}ms") # Error handling should be fast assert avg_error_latency < 200 # Less than 200ms for error responses @pytest.mark.asyncio async def test_memory_efficiency(self, client: AsyncClient): """Test memory efficiency during concurrent operations.""" from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage # Create a larger response to test memory handling large_content = "This is a large response. " * 100 # ~2.5KB content mock_response = ChatCompletionResponse( id="memory-test", object="chat.completion", created=int(time.time()), model="privatemode-llama-3-70b", choices=[ ChatChoice( index=0, message=ChatMessage( role="assistant", content=large_content ), finish_reason="stop" ) ], usage=Usage( prompt_tokens=50, completion_tokens=500, total_tokens=550 ) ) with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat: mock_chat.return_value = mock_response # Create many concurrent requests to test memory efficiency tasks = [] for i in range(50): # 50 concurrent requests with large responses task = client.post( "/api/v1/llm/chat/completions", json={ "model": "privatemode-llama-3-70b", "messages": [ {"role": "user", "content": f"Memory test {i}"} ] }, headers={"Authorization": "Bearer test-api-key"} ) tasks.append(task) start_time = time.time() responses = await asyncio.gather(*tasks) elapsed_time = time.time() - start_time # Verify all requests succeeded for response in responses: assert response.status_code == 200 data = response.json() assert len(data["choices"][0]["message"]["content"]) > 2000 print(f"50 concurrent large requests completed in {elapsed_time:.2f}s") # Should handle 50 concurrent requests with large responses efficiently assert elapsed_time < 5.0 # Less than 5 seconds for 50 concurrent requests @pytest.mark.asyncio async def test_security_analysis_performance(self, client: AsyncClient): """Test performance impact of security analysis.""" from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage # Mock response with security analysis mock_response = ChatCompletionResponse( id="security-perf-test", object="chat.completion", created=int(time.time()), model="privatemode-llama-3-70b", choices=[ ChatChoice( index=0, message=ChatMessage( role="assistant", content="Secure response with analysis." ), finish_reason="stop" ) ], usage=Usage( prompt_tokens=15, completion_tokens=8, total_tokens=23 ), security_analysis={ "risk_score": 0.1, "threats_detected": [], "risk_level": "low", "analysis_time_ms": 25.5 } ) latencies = [] with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat: mock_chat.return_value = mock_response # Measure latency with security analysis for i in range(10): start_time = time.time() response = await client.post( "/api/v1/llm/chat/completions", json={ "model": "privatemode-llama-3-70b", "messages": [ {"role": "user", "content": f"Security test {i}"} ] }, headers={"Authorization": "Bearer test-api-key"} ) latency = (time.time() - start_time) * 1000 latencies.append(latency) assert response.status_code == 200 data = response.json() assert "security_analysis" in data avg_latency = statistics.mean(latencies) print(f"Average latency with security analysis: {avg_latency:.2f}ms") # Security analysis should not significantly impact performance assert avg_latency < 150 # Less than 150ms with security analysis