mirror of https://github.com/aljazceru/enclava.git, synced 2025-12-17 15:34:36 +01:00
plugin system
466 backend/tests/performance/test_llm_performance.py (new file)
@@ -0,0 +1,466 @@
"""
Performance tests for the new LLM service.
Tests response times, throughput, and resource usage.
"""
import pytest
import asyncio
import time
import statistics
from httpx import AsyncClient
from unittest.mock import patch, AsyncMock
from typing import List


class TestLLMPerformance:
    """Performance tests for LLM service."""

    @pytest.mark.asyncio
    async def test_chat_completion_latency(self, client: AsyncClient):
        """Test chat completion response latency."""
        from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage

        # Mock fast response
        mock_response = ChatCompletionResponse(
            id="perf-test",
            object="chat.completion",
            created=int(time.time()),
            model="privatemode-llama-3-70b",
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content="Performance test response."
                    ),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=10,
                completion_tokens=5,
                total_tokens=15
            )
        )

        latencies = []

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.return_value = mock_response

            # Measure latency over multiple requests
            for i in range(10):
                start_time = time.time()

                response = await client.post(
                    "/api/v1/llm/chat/completions",
                    json={
                        "model": "privatemode-llama-3-70b",
                        "messages": [
                            {"role": "user", "content": f"Performance test {i}"}
                        ]
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000  # Convert to milliseconds
                latencies.append(latency)

                assert response.status_code == 200

        # Analyze performance metrics
        avg_latency = statistics.mean(latencies)
        p95_latency = statistics.quantiles(latencies, n=20)[18]  # 95th percentile
        p99_latency = statistics.quantiles(latencies, n=100)[98]  # 99th percentile

        print(f"Average latency: {avg_latency:.2f}ms")
        print(f"P95 latency: {p95_latency:.2f}ms")
        print(f"P99 latency: {p99_latency:.2f}ms")

        # Performance assertions (for mocked responses, should be very fast)
        assert avg_latency < 100  # Less than 100ms average
        assert p95_latency < 200  # Less than 200ms for 95% of requests
        assert p99_latency < 500  # Less than 500ms for 99% of requests

    @pytest.mark.asyncio
    async def test_concurrent_throughput(self, client: AsyncClient):
        """Test concurrent request throughput."""
        from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage

        mock_response = ChatCompletionResponse(
            id="throughput-test",
            object="chat.completion",
            created=int(time.time()),
            model="privatemode-llama-3-70b",
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content="Throughput test response."
                    ),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=8,
                completion_tokens=4,
                total_tokens=12
            )
        )

        concurrent_levels = [1, 5, 10, 20]
        throughput_results = {}

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.return_value = mock_response

            for concurrency in concurrent_levels:
                start_time = time.time()

                # Create concurrent requests
                tasks = []
                for i in range(concurrency):
                    task = client.post(
                        "/api/v1/llm/chat/completions",
                        json={
                            "model": "privatemode-llama-3-70b",
                            "messages": [
                                {"role": "user", "content": f"Concurrent test {i}"}
                            ]
                        },
                        headers={"Authorization": "Bearer test-api-key"}
                    )
                    tasks.append(task)

                # Execute all requests
                responses = await asyncio.gather(*tasks)
                elapsed_time = time.time() - start_time

                # Verify all requests succeeded
                for response in responses:
                    assert response.status_code == 200

                # Calculate throughput (requests per second)
                throughput = concurrency / elapsed_time
                throughput_results[concurrency] = throughput

                print(f"Concurrency {concurrency}: {throughput:.2f} req/s")

        # Performance assertions
        assert throughput_results[1] > 10   # At least 10 req/s for single requests
        assert throughput_results[5] > 30   # At least 30 req/s for 5 concurrent
        assert throughput_results[10] > 50  # At least 50 req/s for 10 concurrent

    @pytest.mark.asyncio
    async def test_embedding_performance(self, client: AsyncClient):
        """Test embedding generation performance."""
        from app.services.llm.models import EmbeddingResponse, EmbeddingData, Usage

        # Create realistic embedding response
        embedding_vector = [0.1 * i for i in range(1024)]

        mock_response = EmbeddingResponse(
            object="list",
            data=[
                EmbeddingData(
                    object="embedding",
                    embedding=embedding_vector,
                    index=0
                )
            ],
            model="privatemode-embeddings",
            usage=Usage(
                prompt_tokens=10,
                total_tokens=10
            )
        )

        latencies = []

        with patch("app.services.llm.service.llm_service.create_embedding") as mock_embedding:
            mock_embedding.return_value = mock_response

            # Test different text lengths
            test_texts = [
                "Short text",
                "Medium length text that contains more words and should take a bit longer to process.",
                "Very long text that contains many words and sentences. " * 10,  # Repeat to make it longer
            ]

            for text in test_texts:
                start_time = time.time()

                response = await client.post(
                    "/api/v1/llm/embeddings",
                    json={
                        "model": "privatemode-embeddings",
                        "input": text
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                latencies.append(latency)

                assert response.status_code == 200
                data = response.json()
                assert len(data["data"][0]["embedding"]) == 1024

        # Performance assertions for embeddings
        avg_latency = statistics.mean(latencies)
        print(f"Average embedding latency: {avg_latency:.2f}ms")

        assert avg_latency < 150  # Less than 150ms average for embeddings

    @pytest.mark.asyncio
    async def test_provider_status_performance(self, client: AsyncClient):
        """Test provider status endpoint performance."""
        mock_status = {
            "privatemode": {
                "provider": "PrivateMode.ai",
                "status": "healthy",
                "latency_ms": 250.5,
                "success_rate": 0.98,
                "last_check": "2025-01-01T12:00:00Z",
                "models_available": ["privatemode-llama-3-70b", "privatemode-embeddings"]
            }
        }

        latencies = []

        with patch("app.services.llm.service.llm_service.get_provider_status") as mock_provider:
            mock_provider.return_value = mock_status

            # Measure status endpoint performance
            for i in range(10):
                start_time = time.time()

                response = await client.get(
                    "/api/v1/llm/providers/status",
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                latencies.append(latency)

                assert response.status_code == 200

        avg_latency = statistics.mean(latencies)
        print(f"Average provider status latency: {avg_latency:.2f}ms")

        # Status endpoint should be very fast
        assert avg_latency < 50  # Less than 50ms for status checks

    @pytest.mark.asyncio
    async def test_models_endpoint_performance(self, client: AsyncClient):
        """Test models listing endpoint performance."""
        from app.services.llm.models import Model

        # Create a realistic number of models
        mock_models = []
        for i in range(20):  # Simulate 20 available models
            mock_models.append(
                Model(
                    id=f"privatemode-model-{i}",
                    object="model",
                    created=1234567890,
                    owned_by="PrivateMode.ai",
                    provider="PrivateMode.ai",
                    capabilities=["tee", "chat"],
                    context_window=32768 if i % 2 == 0 else 8192,
                    supports_streaming=True,
                    supports_function_calling=i % 3 == 0
                )
            )

        latencies = []

        with patch("app.services.llm.service.llm_service.get_models") as mock_models_call:
            mock_models_call.return_value = mock_models

            # Measure models endpoint performance
            for i in range(10):
                start_time = time.time()

                response = await client.get(
                    "/api/v1/llm/models",
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                latencies.append(latency)

                assert response.status_code == 200
                data = response.json()
                assert len(data["data"]) == 20

        avg_latency = statistics.mean(latencies)
        print(f"Average models endpoint latency: {avg_latency:.2f}ms")

        # Models endpoint should be reasonably fast even with many models
        assert avg_latency < 100  # Less than 100ms for models listing

    @pytest.mark.asyncio
    async def test_error_handling_performance(self, client: AsyncClient):
        """Test that error handling doesn't significantly impact performance."""
        error_latencies = []

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.side_effect = Exception("Simulated provider error")

            # Measure error handling performance
            for i in range(5):
                start_time = time.time()

                response = await client.post(
                    "/api/v1/llm/chat/completions",
                    json={
                        "model": "privatemode-llama-3-70b",
                        "messages": [
                            {"role": "user", "content": f"Error test {i}"}
                        ]
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                error_latencies.append(latency)

                # Should return error but quickly
                assert response.status_code in [500, 503]

        avg_error_latency = statistics.mean(error_latencies)
        print(f"Average error handling latency: {avg_error_latency:.2f}ms")

        # Error handling should be fast
        assert avg_error_latency < 200  # Less than 200ms for error responses

    @pytest.mark.asyncio
    async def test_memory_efficiency(self, client: AsyncClient):
        """Test memory efficiency during concurrent operations."""
        from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage

        # Create a larger response to test memory handling
        large_content = "This is a large response. " * 100  # ~2.5KB content

        mock_response = ChatCompletionResponse(
            id="memory-test",
            object="chat.completion",
            created=int(time.time()),
            model="privatemode-llama-3-70b",
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content=large_content
                    ),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=50,
                completion_tokens=500,
                total_tokens=550
            )
        )

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.return_value = mock_response

            # Create many concurrent requests to test memory efficiency
            tasks = []
            for i in range(50):  # 50 concurrent requests with large responses
                task = client.post(
                    "/api/v1/llm/chat/completions",
                    json={
                        "model": "privatemode-llama-3-70b",
                        "messages": [
                            {"role": "user", "content": f"Memory test {i}"}
                        ]
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )
                tasks.append(task)

            start_time = time.time()
            responses = await asyncio.gather(*tasks)
            elapsed_time = time.time() - start_time

            # Verify all requests succeeded
            for response in responses:
                assert response.status_code == 200
                data = response.json()
                assert len(data["choices"][0]["message"]["content"]) > 2000

        print(f"50 concurrent large requests completed in {elapsed_time:.2f}s")

        # Should handle 50 concurrent requests with large responses efficiently
        assert elapsed_time < 5.0  # Less than 5 seconds for 50 concurrent requests

    @pytest.mark.asyncio
    async def test_security_analysis_performance(self, client: AsyncClient):
        """Test performance impact of security analysis."""
        from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage

        # Mock response with security analysis
        mock_response = ChatCompletionResponse(
            id="security-perf-test",
            object="chat.completion",
            created=int(time.time()),
            model="privatemode-llama-3-70b",
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content="Secure response with analysis."
                    ),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=15,
                completion_tokens=8,
                total_tokens=23
            ),
            security_analysis={
                "risk_score": 0.1,
                "threats_detected": [],
                "risk_level": "low",
                "analysis_time_ms": 25.5
            }
        )

        latencies = []

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.return_value = mock_response

            # Measure latency with security analysis
            for i in range(10):
                start_time = time.time()

                response = await client.post(
                    "/api/v1/llm/chat/completions",
                    json={
                        "model": "privatemode-llama-3-70b",
                        "messages": [
                            {"role": "user", "content": f"Security test {i}"}
                        ]
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                latencies.append(latency)

                assert response.status_code == 200
                data = response.json()
                assert "security_analysis" in data

        avg_latency = statistics.mean(latencies)
        print(f"Average latency with security analysis: {avg_latency:.2f}ms")

        # Security analysis should not significantly impact performance
        assert avg_latency < 150  # Less than 150ms with security analysis
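Note: these tests depend on a client: AsyncClient fixture that is not part of this commit; it is presumably supplied by the suite's shared conftest.py. Below is a minimal sketch of such a fixture, assuming the FastAPI app is importable as app.main:app and pytest-asyncio is installed; the repository's actual fixture may differ.

# backend/tests/conftest.py -- minimal sketch, not necessarily the repo's actual fixture.
# Assumes the FastAPI app is importable as app.main:app and pytest-asyncio is installed.
import pytest_asyncio
from httpx import ASGITransport, AsyncClient

from app.main import app  # assumed application entry point


@pytest_asyncio.fixture
async def client():
    """Yield an AsyncClient that exercises the app in-process over ASGI."""
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as ac:
        yield ac

With a fixture like this in place, the suite can be run with: pytest backend/tests/performance/test_llm_performance.py -v -s (the -s flag keeps the print() output with the latency and throughput figures visible).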