mirror of https://github.com/aljazceru/enclava.git, synced 2025-12-17 15:34:36 +01:00
plugin system
466 backend/tests/performance/test_llm_performance.py (new file)
@@ -0,0 +1,466 @@
"""
Performance tests for the new LLM service.
Tests response times, throughput, and resource usage.
"""
import pytest
import asyncio
import time
import statistics
from httpx import AsyncClient
from unittest.mock import patch, AsyncMock
from typing import List


class TestLLMPerformance:
    """Performance tests for LLM service."""

    @pytest.mark.asyncio
    async def test_chat_completion_latency(self, client: AsyncClient):
        """Test chat completion response latency."""
        from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage

        # Mock fast response
        mock_response = ChatCompletionResponse(
            id="perf-test",
            object="chat.completion",
            created=int(time.time()),
            model="privatemode-llama-3-70b",
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content="Performance test response."
                    ),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=10,
                completion_tokens=5,
                total_tokens=15
            )
        )

        latencies = []

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.return_value = mock_response

            # Measure latency over multiple requests
            for i in range(10):
                start_time = time.time()

                response = await client.post(
                    "/api/v1/llm/chat/completions",
                    json={
                        "model": "privatemode-llama-3-70b",
                        "messages": [
                            {"role": "user", "content": f"Performance test {i}"}
                        ]
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000  # Convert to milliseconds
                latencies.append(latency)

                assert response.status_code == 200

        # Analyze performance metrics
        avg_latency = statistics.mean(latencies)
        p95_latency = statistics.quantiles(latencies, n=20)[18]  # 95th percentile
        p99_latency = statistics.quantiles(latencies, n=100)[98]  # 99th percentile

        print(f"Average latency: {avg_latency:.2f}ms")
        print(f"P95 latency: {p95_latency:.2f}ms")
        print(f"P99 latency: {p99_latency:.2f}ms")

        # Performance assertions (for mocked responses, should be very fast)
        assert avg_latency < 100  # Less than 100ms average
        assert p95_latency < 200  # Less than 200ms for 95% of requests
        assert p99_latency < 500  # Less than 500ms for 99% of requests

    @pytest.mark.asyncio
    async def test_concurrent_throughput(self, client: AsyncClient):
        """Test concurrent request throughput."""
        from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage

        mock_response = ChatCompletionResponse(
            id="throughput-test",
            object="chat.completion",
            created=int(time.time()),
            model="privatemode-llama-3-70b",
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content="Throughput test response."
                    ),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=8,
                completion_tokens=4,
                total_tokens=12
            )
        )

        concurrent_levels = [1, 5, 10, 20]
        throughput_results = {}

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.return_value = mock_response

            for concurrency in concurrent_levels:
                start_time = time.time()

                # Create concurrent requests
                tasks = []
                for i in range(concurrency):
                    task = client.post(
                        "/api/v1/llm/chat/completions",
                        json={
                            "model": "privatemode-llama-3-70b",
                            "messages": [
                                {"role": "user", "content": f"Concurrent test {i}"}
                            ]
                        },
                        headers={"Authorization": "Bearer test-api-key"}
                    )
                    tasks.append(task)

                # Execute all requests
                responses = await asyncio.gather(*tasks)
                elapsed_time = time.time() - start_time

                # Verify all requests succeeded
                for response in responses:
                    assert response.status_code == 200

                # Calculate throughput (requests per second)
                throughput = concurrency / elapsed_time
                throughput_results[concurrency] = throughput

                print(f"Concurrency {concurrency}: {throughput:.2f} req/s")

        # Performance assertions
        assert throughput_results[1] > 10   # At least 10 req/s for single requests
        assert throughput_results[5] > 30   # At least 30 req/s for 5 concurrent
        assert throughput_results[10] > 50  # At least 50 req/s for 10 concurrent

    @pytest.mark.asyncio
    async def test_embedding_performance(self, client: AsyncClient):
        """Test embedding generation performance."""
        from app.services.llm.models import EmbeddingResponse, EmbeddingData, Usage

        # Create realistic embedding response
        embedding_vector = [0.1 * i for i in range(1024)]

        mock_response = EmbeddingResponse(
            object="list",
            data=[
                EmbeddingData(
                    object="embedding",
                    embedding=embedding_vector,
                    index=0
                )
            ],
            model="privatemode-embeddings",
            usage=Usage(
                prompt_tokens=10,
                total_tokens=10
            )
        )

        latencies = []

        with patch("app.services.llm.service.llm_service.create_embedding") as mock_embedding:
            mock_embedding.return_value = mock_response

            # Test different text lengths
            test_texts = [
                "Short text",
                "Medium length text that contains more words and should take a bit longer to process.",
                "Very long text that contains many words and sentences. " * 10,  # Repeat to make it longer
            ]

            for text in test_texts:
                start_time = time.time()

                response = await client.post(
                    "/api/v1/llm/embeddings",
                    json={
                        "model": "privatemode-embeddings",
                        "input": text
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                latencies.append(latency)

                assert response.status_code == 200
                data = response.json()
                assert len(data["data"][0]["embedding"]) == 1024

        # Performance assertions for embeddings
        avg_latency = statistics.mean(latencies)
        print(f"Average embedding latency: {avg_latency:.2f}ms")

        assert avg_latency < 150  # Less than 150ms average for embeddings

    @pytest.mark.asyncio
    async def test_provider_status_performance(self, client: AsyncClient):
        """Test provider status endpoint performance."""
        mock_status = {
            "privatemode": {
                "provider": "PrivateMode.ai",
                "status": "healthy",
                "latency_ms": 250.5,
                "success_rate": 0.98,
                "last_check": "2025-01-01T12:00:00Z",
                "models_available": ["privatemode-llama-3-70b", "privatemode-embeddings"]
            }
        }

        latencies = []

        with patch("app.services.llm.service.llm_service.get_provider_status") as mock_provider:
            mock_provider.return_value = mock_status

            # Measure status endpoint performance
            for i in range(10):
                start_time = time.time()

                response = await client.get(
                    "/api/v1/llm/providers/status",
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                latencies.append(latency)

                assert response.status_code == 200

        avg_latency = statistics.mean(latencies)
        print(f"Average provider status latency: {avg_latency:.2f}ms")

        # Status endpoint should be very fast
        assert avg_latency < 50  # Less than 50ms for status checks

    @pytest.mark.asyncio
    async def test_models_endpoint_performance(self, client: AsyncClient):
        """Test models listing endpoint performance."""
        from app.services.llm.models import Model

        # Create a realistic number of models
        mock_models = []
        for i in range(20):  # Simulate 20 available models
            mock_models.append(
                Model(
                    id=f"privatemode-model-{i}",
                    object="model",
                    created=1234567890,
                    owned_by="PrivateMode.ai",
                    provider="PrivateMode.ai",
                    capabilities=["tee", "chat"],
                    context_window=32768 if i % 2 == 0 else 8192,
                    supports_streaming=True,
                    supports_function_calling=i % 3 == 0
                )
            )

        latencies = []

        with patch("app.services.llm.service.llm_service.get_models") as mock_models_call:
            mock_models_call.return_value = mock_models

            # Measure models endpoint performance
            for i in range(10):
                start_time = time.time()

                response = await client.get(
                    "/api/v1/llm/models",
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                latencies.append(latency)

                assert response.status_code == 200
                data = response.json()
                assert len(data["data"]) == 20

        avg_latency = statistics.mean(latencies)
        print(f"Average models endpoint latency: {avg_latency:.2f}ms")

        # Models endpoint should be reasonably fast even with many models
        assert avg_latency < 100  # Less than 100ms for models listing

    @pytest.mark.asyncio
    async def test_error_handling_performance(self, client: AsyncClient):
        """Test that error handling doesn't significantly impact performance."""
        error_latencies = []

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.side_effect = Exception("Simulated provider error")

            # Measure error handling performance
            for i in range(5):
                start_time = time.time()

                response = await client.post(
                    "/api/v1/llm/chat/completions",
                    json={
                        "model": "privatemode-llama-3-70b",
                        "messages": [
                            {"role": "user", "content": f"Error test {i}"}
                        ]
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                error_latencies.append(latency)

                # Should return error but quickly
                assert response.status_code in [500, 503]

        avg_error_latency = statistics.mean(error_latencies)
        print(f"Average error handling latency: {avg_error_latency:.2f}ms")

        # Error handling should be fast
        assert avg_error_latency < 200  # Less than 200ms for error responses

    @pytest.mark.asyncio
    async def test_memory_efficiency(self, client: AsyncClient):
        """Test memory efficiency during concurrent operations."""
        from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage

        # Create a larger response to test memory handling
        large_content = "This is a large response. " * 100  # ~2.5KB content

        mock_response = ChatCompletionResponse(
            id="memory-test",
            object="chat.completion",
            created=int(time.time()),
            model="privatemode-llama-3-70b",
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content=large_content
                    ),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=50,
                completion_tokens=500,
                total_tokens=550
            )
        )

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.return_value = mock_response

            # Create many concurrent requests to test memory efficiency
            tasks = []
            for i in range(50):  # 50 concurrent requests with large responses
                task = client.post(
                    "/api/v1/llm/chat/completions",
                    json={
                        "model": "privatemode-llama-3-70b",
                        "messages": [
                            {"role": "user", "content": f"Memory test {i}"}
                        ]
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )
                tasks.append(task)

            start_time = time.time()
            responses = await asyncio.gather(*tasks)
            elapsed_time = time.time() - start_time

            # Verify all requests succeeded
            for response in responses:
                assert response.status_code == 200
                data = response.json()
                assert len(data["choices"][0]["message"]["content"]) > 2000

        print(f"50 concurrent large requests completed in {elapsed_time:.2f}s")

        # Should handle 50 concurrent requests with large responses efficiently
        assert elapsed_time < 5.0  # Less than 5 seconds for 50 concurrent requests

    @pytest.mark.asyncio
    async def test_security_analysis_performance(self, client: AsyncClient):
        """Test performance impact of security analysis."""
        from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage

        # Mock response with security analysis
        mock_response = ChatCompletionResponse(
            id="security-perf-test",
            object="chat.completion",
            created=int(time.time()),
            model="privatemode-llama-3-70b",
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content="Secure response with analysis."
                    ),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=15,
                completion_tokens=8,
                total_tokens=23
            ),
            security_analysis={
                "risk_score": 0.1,
                "threats_detected": [],
                "risk_level": "low",
                "analysis_time_ms": 25.5
            }
        )

        latencies = []

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.return_value = mock_response

            # Measure latency with security analysis
            for i in range(10):
                start_time = time.time()

                response = await client.post(
                    "/api/v1/llm/chat/completions",
                    json={
                        "model": "privatemode-llama-3-70b",
                        "messages": [
                            {"role": "user", "content": f"Security test {i}"}
                        ]
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                latencies.append(latency)

                assert response.status_code == 200
                data = response.json()
                assert "security_analysis" in data

        avg_latency = statistics.mean(latencies)
        print(f"Average latency with security analysis: {avg_latency:.2f}ms")

        # Security analysis should not significantly impact performance
        assert avg_latency < 150  # Less than 150ms with security analysis
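Note: these tests depend on a client: AsyncClient fixture that is not part of this commit; it is presumably supplied by the suite's shared conftest.py. Below is a minimal sketch of such a fixture, assuming the FastAPI app is importable as app.main:app and pytest-asyncio is installed; the repository's actual fixture may differ.

# backend/tests/conftest.py -- minimal sketch, not necessarily the repo's actual fixture.
# Assumes the FastAPI app is importable as app.main:app and pytest-asyncio is installed.
import pytest_asyncio
from httpx import ASGITransport, AsyncClient

from app.main import app  # assumed application entry point


@pytest_asyncio.fixture
async def client():
    """Yield an AsyncClient that exercises the app in-process over ASGI."""
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as ac:
        yield ac

With a fixture like this in place, the suite can be run with: pytest backend/tests/performance/test_llm_performance.py -v -s (the -s flag keeps the print() output with the latency and throughput figures visible).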