"""
|
|
Performance tests for the new LLM service.
|
|
Tests response times, throughput, and resource usage.
|
|
"""
|
|
import pytest
import asyncio
import time
import statistics
from httpx import AsyncClient
from unittest.mock import patch, AsyncMock
from typing import List

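# NOTE: these tests assume an async `client` fixture wired to the FastAPI app and a
# seeded "test-api-key" credential, typically provided by conftest.py. A minimal sketch
# of such a fixture (the `app.main:app` import path is an assumption) could look like:
#
#     import pytest_asyncio
#     from httpx import ASGITransport, AsyncClient
#     from app.main import app
#
#     @pytest_asyncio.fixture
#     async def client():
#         async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c:
#             yield c
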
class TestLLMPerformance:
    """Performance tests for LLM service."""

    @pytest.mark.asyncio
    async def test_chat_completion_latency(self, client: AsyncClient):
        """Test chat completion response latency."""
        from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage

        # Mock fast response
        mock_response = ChatCompletionResponse(
            id="perf-test",
            object="chat.completion",
            created=int(time.time()),
            model="privatemode-llama-3-70b",
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content="Performance test response."
                    ),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=10,
                completion_tokens=5,
                total_tokens=15
            )
        )

        latencies = []

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.return_value = mock_response

            # Measure latency over multiple requests
            for i in range(10):
                start_time = time.time()

                response = await client.post(
                    "/api/v1/llm/chat/completions",
                    json={
                        "model": "privatemode-llama-3-70b",
                        "messages": [
                            {"role": "user", "content": f"Performance test {i}"}
                        ]
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000  # Convert to milliseconds
                latencies.append(latency)

                assert response.status_code == 200

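        # NOTE: with only 10 samples, statistics.quantiles() extrapolates the p95/p99
        # estimates from the two largest observations, so treat the assertions below as
        # coarse smoke checks rather than true tail-latency measurements.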
        # Analyze performance metrics
        avg_latency = statistics.mean(latencies)
        p95_latency = statistics.quantiles(latencies, n=20)[18]  # 95th percentile
        p99_latency = statistics.quantiles(latencies, n=100)[98]  # 99th percentile

        print(f"Average latency: {avg_latency:.2f}ms")
        print(f"P95 latency: {p95_latency:.2f}ms")
        print(f"P99 latency: {p99_latency:.2f}ms")

        # Performance assertions (for mocked responses, should be very fast)
        assert avg_latency < 100  # Less than 100ms average
        assert p95_latency < 200  # Less than 200ms for 95% of requests
        assert p99_latency < 500  # Less than 500ms for 99% of requests

    @pytest.mark.asyncio
    async def test_concurrent_throughput(self, client: AsyncClient):
        """Test concurrent request throughput."""
        from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage

        mock_response = ChatCompletionResponse(
            id="throughput-test",
            object="chat.completion",
            created=int(time.time()),
            model="privatemode-llama-3-70b",
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content="Throughput test response."
                    ),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=8,
                completion_tokens=4,
                total_tokens=12
            )
        )

        concurrent_levels = [1, 5, 10, 20]
        throughput_results = {}

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.return_value = mock_response

            for concurrency in concurrent_levels:
                start_time = time.time()

                # Create concurrent requests
                tasks = []
                for i in range(concurrency):
                    task = client.post(
                        "/api/v1/llm/chat/completions",
                        json={
                            "model": "privatemode-llama-3-70b",
                            "messages": [
                                {"role": "user", "content": f"Concurrent test {i}"}
                            ]
                        },
                        headers={"Authorization": "Bearer test-api-key"}
                    )
                    tasks.append(task)

                # Execute all requests
                responses = await asyncio.gather(*tasks)
                elapsed_time = time.time() - start_time

                # Verify all requests succeeded
                for response in responses:
                    assert response.status_code == 200

                # Calculate throughput (requests per second)
                throughput = concurrency / elapsed_time
                throughput_results[concurrency] = throughput

                print(f"Concurrency {concurrency}: {throughput:.2f} req/s")

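        # NOTE: these thresholds assume an in-process ASGI test client with a mocked
        # provider; over a real network or against a live provider they would need to
        # be relaxed considerably.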
        # Performance assertions
        assert throughput_results[1] > 10  # At least 10 req/s for single requests
        assert throughput_results[5] > 30  # At least 30 req/s for 5 concurrent
        assert throughput_results[10] > 50  # At least 50 req/s for 10 concurrent

    @pytest.mark.asyncio
    async def test_embedding_performance(self, client: AsyncClient):
        """Test embedding generation performance."""
        from app.services.llm.models import EmbeddingResponse, EmbeddingData, Usage

        # Create realistic embedding response
        embedding_vector = [0.1 * i for i in range(1024)]

        mock_response = EmbeddingResponse(
            object="list",
            data=[
                EmbeddingData(
                    object="embedding",
                    embedding=embedding_vector,
                    index=0
                )
            ],
            model="privatemode-embeddings",
            usage=Usage(
                prompt_tokens=10,
                total_tokens=10
            )
        )

        latencies = []

        with patch("app.services.llm.service.llm_service.create_embedding") as mock_embedding:
            mock_embedding.return_value = mock_response

            # Test different text lengths
            test_texts = [
                "Short text",
                "Medium length text that contains more words and should take a bit longer to process.",
                "Very long text that contains many words and sentences. " * 10,  # Repeat to make it longer
            ]

            for text in test_texts:
                start_time = time.time()

                response = await client.post(
                    "/api/v1/llm/embeddings",
                    json={
                        "model": "privatemode-embeddings",
                        "input": text
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                latencies.append(latency)

                assert response.status_code == 200
                data = response.json()
                assert len(data["data"][0]["embedding"]) == 1024

        # Performance assertions for embeddings
        avg_latency = statistics.mean(latencies)
        print(f"Average embedding latency: {avg_latency:.2f}ms")

        assert avg_latency < 150  # Less than 150ms average for embeddings

    @pytest.mark.asyncio
    async def test_provider_status_performance(self, client: AsyncClient):
        """Test provider status endpoint performance."""
        mock_status = {
            "privatemode": {
                "provider": "PrivateMode.ai",
                "status": "healthy",
                "latency_ms": 250.5,
                "success_rate": 0.98,
                "last_check": "2025-01-01T12:00:00Z",
                "models_available": ["privatemode-llama-3-70b", "privatemode-embeddings"]
            }
        }

        latencies = []

        with patch("app.services.llm.service.llm_service.get_provider_status") as mock_provider:
            mock_provider.return_value = mock_status

            # Measure status endpoint performance
            for i in range(10):
                start_time = time.time()

                response = await client.get(
                    "/api/v1/llm/providers/status",
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                latencies.append(latency)

                assert response.status_code == 200

        avg_latency = statistics.mean(latencies)
        print(f"Average provider status latency: {avg_latency:.2f}ms")

        # Status endpoint should be very fast
        assert avg_latency < 50  # Less than 50ms for status checks

    @pytest.mark.asyncio
    async def test_models_endpoint_performance(self, client: AsyncClient):
        """Test models listing endpoint performance."""
        from app.services.llm.models import Model

        # Create a realistic number of models
        mock_models = []
        for i in range(20):  # Simulate 20 available models
            mock_models.append(
                Model(
                    id=f"privatemode-model-{i}",
                    object="model",
                    created=1234567890,
                    owned_by="PrivateMode.ai",
                    provider="PrivateMode.ai",
                    capabilities=["tee", "chat"],
                    context_window=32768 if i % 2 == 0 else 8192,
                    supports_streaming=True,
                    supports_function_calling=i % 3 == 0
                )
            )

        latencies = []

        with patch("app.services.llm.service.llm_service.get_models") as mock_models_call:
            mock_models_call.return_value = mock_models

            # Measure models endpoint performance
            for i in range(10):
                start_time = time.time()

                response = await client.get(
                    "/api/v1/llm/models",
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                latencies.append(latency)

                assert response.status_code == 200
                data = response.json()
                assert len(data["data"]) == 20

        avg_latency = statistics.mean(latencies)
        print(f"Average models endpoint latency: {avg_latency:.2f}ms")

        # Models endpoint should be reasonably fast even with many models
        assert avg_latency < 100  # Less than 100ms for models listing

    @pytest.mark.asyncio
    async def test_error_handling_performance(self, client: AsyncClient):
        """Test that error handling doesn't significantly impact performance."""
        error_latencies = []

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.side_effect = Exception("Simulated provider error")

            # Measure error handling performance
            for i in range(5):
                start_time = time.time()

                response = await client.post(
                    "/api/v1/llm/chat/completions",
                    json={
                        "model": "privatemode-llama-3-70b",
                        "messages": [
                            {"role": "user", "content": f"Error test {i}"}
                        ]
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                error_latencies.append(latency)

                # Should return an error, but quickly
                assert response.status_code in [500, 503]

        avg_error_latency = statistics.mean(error_latencies)
        print(f"Average error handling latency: {avg_error_latency:.2f}ms")

        # Error handling should be fast
        assert avg_error_latency < 200  # Less than 200ms for error responses

    @pytest.mark.asyncio
    async def test_memory_efficiency(self, client: AsyncClient):
        """Test memory efficiency during concurrent operations."""
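        # NOTE: despite the name, this test only asserts on wall-clock time for many
        # concurrent large responses. If real allocation tracking is wanted, a rough
        # sketch using the stdlib tracemalloc (the threshold is illustrative) would be:
        #
        #     import tracemalloc
        #     tracemalloc.start()
        #     ...run the concurrent requests...
        #     current, peak = tracemalloc.get_traced_memory()
        #     tracemalloc.stop()
        #     assert peak < 100 * 1024 * 1024  # e.g. stay under ~100 MB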
        from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage

        # Create a larger response to test memory handling
        large_content = "This is a large response. " * 100  # ~2.5KB content

        mock_response = ChatCompletionResponse(
            id="memory-test",
            object="chat.completion",
            created=int(time.time()),
            model="privatemode-llama-3-70b",
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content=large_content
                    ),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=50,
                completion_tokens=500,
                total_tokens=550
            )
        )

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.return_value = mock_response

            # Create many concurrent requests to test memory efficiency
            tasks = []
            for i in range(50):  # 50 concurrent requests with large responses
                task = client.post(
                    "/api/v1/llm/chat/completions",
                    json={
                        "model": "privatemode-llama-3-70b",
                        "messages": [
                            {"role": "user", "content": f"Memory test {i}"}
                        ]
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )
                tasks.append(task)

            start_time = time.time()
            responses = await asyncio.gather(*tasks)
            elapsed_time = time.time() - start_time

            # Verify all requests succeeded
            for response in responses:
                assert response.status_code == 200
                data = response.json()
                assert len(data["choices"][0]["message"]["content"]) > 2000

        print(f"50 concurrent large requests completed in {elapsed_time:.2f}s")

        # Should handle 50 concurrent requests with large responses efficiently
        assert elapsed_time < 5.0  # Less than 5 seconds for 50 concurrent requests

    @pytest.mark.asyncio
    async def test_security_analysis_performance(self, client: AsyncClient):
        """Test performance impact of security analysis."""
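        # NOTE: create_chat_completion is mocked below with a pre-built security_analysis
        # payload, so (assuming the analysis runs inside llm_service) this measures the
        # cost of passing the analysis through the API layer, not the analyzer itself.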
        from app.services.llm.models import ChatCompletionResponse, ChatChoice, ChatMessage, Usage

        # Mock response with security analysis
        mock_response = ChatCompletionResponse(
            id="security-perf-test",
            object="chat.completion",
            created=int(time.time()),
            model="privatemode-llama-3-70b",
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content="Secure response with analysis."
                    ),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=15,
                completion_tokens=8,
                total_tokens=23
            ),
            security_analysis={
                "risk_score": 0.1,
                "threats_detected": [],
                "risk_level": "low",
                "analysis_time_ms": 25.5
            }
        )

        latencies = []

        with patch("app.services.llm.service.llm_service.create_chat_completion") as mock_chat:
            mock_chat.return_value = mock_response

            # Measure latency with security analysis
            for i in range(10):
                start_time = time.time()

                response = await client.post(
                    "/api/v1/llm/chat/completions",
                    json={
                        "model": "privatemode-llama-3-70b",
                        "messages": [
                            {"role": "user", "content": f"Security test {i}"}
                        ]
                    },
                    headers={"Authorization": "Bearer test-api-key"}
                )

                latency = (time.time() - start_time) * 1000
                latencies.append(latency)

                assert response.status_code == 200
                data = response.json()
                assert "security_analysis" in data

        avg_latency = statistics.mean(latencies)
        print(f"Average latency with security analysis: {avg_latency:.2f}ms")

        # Security analysis should not significantly impact performance
        assert avg_latency < 150  # Less than 150ms with security analysis