Files
enclava/backend/tests/integration/test_real_rag_integration.py
2025-08-19 09:50:15 +02:00

419 lines
17 KiB
Python

#!/usr/bin/env python3
"""
Real RAG Integration Test with attention.pdf
This test creates a real collection and uploads the actual attention.pdf file
"""
import asyncio
import aiohttp
import aiofiles
import json
import os
import time
import tempfile
from pathlib import Path
from typing import Optional, Dict, Any, Tuple
class RealRAGIntegrationTest:
    """Drive the RAG HTTP API end to end with a real (or mock) PDF upload.

    The instance is an async context manager: entering it opens an
    ``aiohttp.ClientSession`` (60 s total timeout) and leaving it closes the
    session again. Every request is sent to ``frontend_url`` with a bearer
    token in the ``Authorization`` header.
    """

    # Minimal single-page PDF used when no attention.pdf can be found on disk.
    _MOCK_PDF = b"%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n/Contents 4 0 R\n>>\nendobj\n4 0 obj\n<<\n/Length 44\n>>\nstream\nBT\n/F1 12 Tf\n72 720 Td\n(Attention Is All You Need - Mock PDF) Tj\nET\nendstream\nendobj\nxref\n0 5\n0000000000 65535 f \n0000000009 00000 n \n0000000058 00000 n \n0000000115 00000 n \n0000000207 00000 n \ntrailer\n<<\n/Size 5\n/Root 1 0 R\n>>\nstartxref\n298\n%%EOF"

    # Document statuses that end the polling loop in wait_for_processing().
    _SUCCESS_STATUSES = ("processed", "indexed")
    _FAILURE_STATUS = "error"

    def __init__(self,
                 frontend_url: str = "http://localhost:53000",
                 backend_url: str = "http://localhost:58000"):
        """Store the target URLs and reset all per-run state.

        Args:
            frontend_url: Base URL that every API request is sent to.
            backend_url: Stored on the instance; no request in this class
                uses it.
        """
        self.frontend_url = frontend_url
        self.backend_url = backend_url
        # Opened in __aenter__, closed in __aexit__.
        self.session: Optional[aiohttp.ClientSession] = None
        # IDs of resources created by this run, remembered for cleanup.
        self.test_collection_id: Optional[str] = None
        self.test_document_ids: list = []
        self.auth_token = "test-auth-token-123"  # Mock token for testing

    async def __aenter__(self):
        """Open the HTTP session and return this runner."""
        self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the HTTP session if one was opened."""
        if self.session:
            await self.session.close()
            # Drop the closed session so later calls fail with a clear error.
            self.session = None

    def _auth_headers(self) -> Dict[str, str]:
        """Return the bearer-token header sent with every request."""
        return {"Authorization": f"Bearer {self.auth_token}"}

    def _require_session(self):
        """Return the open HTTP session.

        Raises:
            RuntimeError: If the runner is used outside ``async with``.
        """
        if self.session is None:
            raise RuntimeError(
                "HTTP session is not open; use 'async with RealRAGIntegrationTest(...)'"
            )
        return self.session

    @staticmethod
    def _find_attention_pdf() -> Optional[Path]:
        """Return the first existing attention.pdf among the known locations."""
        cwd = Path.cwd()
        candidates = (
            cwd / "attention.pdf",
            cwd.parent / "attention.pdf",
            cwd / "tests" / "attention.pdf",
            cwd / "backend" / "tests" / "attention.pdf",
        )
        return next((path for path in candidates if path.is_file()), None)

    async def create_test_collection(self) -> Dict[str, Any]:
        """Create the test collection and remember its ID for later steps.

        Returns:
            The ``collection`` object from the API response.

        Raises:
            Exception: On a non-200 response or a response without a truthy
                ``success`` flag.
        """
        print("📁 Creating test collection for attention.pdf...")
        session = self._require_session()
        collection_data = {
            "name": "Attention Paper Collection",
            "description": "Collection for the famous 'Attention Is All You Need' paper"
        }
        headers = {"Content-Type": "application/json", **self._auth_headers()}
        async with session.post(
            f"{self.frontend_url}/api/rag/collections",
            json=collection_data,
            headers=headers
        ) as response:
            if response.status != 200:
                error_text = await response.text()
                raise Exception(f"Failed to create collection: {response.status} - {error_text}")
            data = await response.json()
            if not data.get("success"):
                raise Exception(f"Collection creation failed: {data.get('error')}")
            collection = data["collection"]
            self.test_collection_id = collection["id"]
            print(f"✅ Created collection: {collection['name']} (ID: {self.test_collection_id})")
            print(f" Description: {collection['description']}")
            print(f" Status: {collection['status']}")
            return collection

    async def upload_attention_pdf(self) -> Dict[str, Any]:
        """Upload attention.pdf (or a small mock PDF) to the test collection.

        Returns:
            The ``document`` object from the API response. Its ID is also
            appended to ``test_document_ids`` for cleanup.

        Raises:
            RuntimeError: If no collection has been created yet.
            Exception: On a non-200 response or a response without a truthy
                ``success`` flag.
        """
        print("📄 Uploading attention.pdf...")
        session = self._require_session()
        if self.test_collection_id is None:
            # Without this guard the literal string "None" would be sent.
            raise RuntimeError("No test collection exists; call create_test_collection() first")

        pdf_path = self._find_attention_pdf()
        if pdf_path is None:
            # Create a mock PDF file for testing
            print("⚠️ attention.pdf not found, creating mock PDF for testing...")
            pdf_content = self._MOCK_PDF
        else:
            async with aiofiles.open(pdf_path, 'rb') as f:
                pdf_content = await f.read()

        # Prepare form data
        form_data = aiohttp.FormData()
        form_data.add_field('collection_id', str(self.test_collection_id))
        form_data.add_field('file', pdf_content, filename='attention.pdf', content_type='application/pdf')
        print(f" 📊 File size: {len(pdf_content):,} bytes")
        print(f" 📂 Target collection: {self.test_collection_id}")
        async with session.post(
            f"{self.frontend_url}/api/rag/documents",
            data=form_data,
            headers=self._auth_headers()
        ) as response:
            if response.status != 200:
                error_text = await response.text()
                raise Exception(f"Failed to upload document: {response.status} - {error_text}")
            data = await response.json()
            if not data.get("success"):
                raise Exception(f"Document upload failed: {data.get('error')}")
            document = data["document"]
            self.test_document_ids.append(document["id"])
            print("✅ Uploaded attention.pdf successfully!")
            print(f" Document ID: {document['id']}")
            print(f" Status: {document['status']}")
            print(f" File size: {document['size']:,} bytes")
            print(f" File type: {document['file_type']}")
            return document

    async def wait_for_processing(self, document_id: str, max_wait: int = 60) -> Dict[str, Any]:
        """Poll the document until it reaches a terminal status.

        The document endpoint is queried every 2 seconds. Responses that are
        not HTTP 200 or lack a truthy ``success`` flag are ignored and the
        poll is retried.

        Args:
            document_id: ID of the document to poll.
            max_wait: Maximum number of seconds to keep polling.

        Returns:
            The document once its status is "processed" or "indexed".

        Raises:
            Exception: If the status becomes "error", or if no terminal
                status is seen within ``max_wait`` seconds.
        """
        print("⏳ Waiting for document processing...")
        session = self._require_session()
        headers = self._auth_headers()
        url = f"{self.frontend_url}/api/rag/documents/{document_id}"
        # A monotonic clock keeps the timeout immune to wall-clock changes.
        deadline = time.monotonic() + max_wait
        while time.monotonic() < deadline:
            async with session.get(url, headers=headers) as response:
                if response.status == 200:
                    data = await response.json()
                    if data.get("success"):
                        document = data["document"]
                        status = document["status"]
                        print(f" Status: {status}")
                        if status == self._FAILURE_STATUS:
                            error = document.get("processing_error", "Unknown error")
                            print(f"❌ Processing failed: {error}")
                            # A failed document must fail the run instead of
                            # being reported as a completed step.
                            raise Exception(f"Document processing failed: {error}")
                        if status in self._SUCCESS_STATUSES:
                            print(f"✅ Processing completed with status: {status}")
                            if document.get("word_count"):
                                print(f" Word count: {document['word_count']:,}")
                            if document.get("character_count"):
                                print(f" Character count: {document['character_count']:,}")
                            if document.get("vector_count"):
                                print(f" Vector count: {document['vector_count']:,}")
                            return document
            await asyncio.sleep(2)
        raise Exception(f"Document processing timeout after {max_wait} seconds")

    async def test_document_content(self, document_id: str) -> None:
        """Fetch the document and report its converted content and metadata.

        Missing content or metadata only produces a warning line.

        Raises:
            Exception: On a non-200 response or a response without a truthy
                ``success`` flag.
        """
        print("📖 Testing processed document content...")
        session = self._require_session()
        async with session.get(
            f"{self.frontend_url}/api/rag/documents/{document_id}",
            headers=self._auth_headers()
        ) as response:
            if response.status != 200:
                raise Exception(f"Failed to get document: {response.status}")
            data = await response.json()
            if not data.get("success"):
                raise Exception(f"Failed to get document: {data.get('error')}")
            document = data["document"]

        # Check converted content
        converted_content = document.get("converted_content") or ""
        if converted_content:
            print(f"✅ Document converted to markdown ({len(converted_content)} characters)")
            # Look for key terms from attention paper
            key_terms = ["attention", "transformer", "neural", "machine translation"]
            lowered_content = converted_content.lower()
            found_terms = [term for term in key_terms if term.lower() in lowered_content]
            if found_terms:
                print(f"✅ Found relevant terms: {', '.join(found_terms)}")
            else:
                print("⚠️ No specific attention paper terms found (might be mock content)")
            # Show a snippet
            if len(converted_content) > 200:
                snippet = converted_content[:200] + "..."
            else:
                snippet = converted_content
            print(f"📄 Content preview: {snippet}")
        else:
            print("⚠️ No converted content available")

        # Check metadata
        metadata = document.get("metadata") or {}
        if metadata:
            print("✅ Document metadata extracted:")
            for key, value in metadata.items():
                if isinstance(value, list):
                    print(f" {key}: {len(value)} items")
                else:
                    print(f" {key}: {value}")
        else:
            print("⚠️ No metadata extracted")

    async def test_document_download(self, document_id: str) -> None:
        """Download the document and check that the payload looks like a PDF.

        Raises:
            Exception: On a non-200 response.
        """
        print("⬇️ Testing document download...")
        session = self._require_session()
        async with session.get(
            f"{self.frontend_url}/api/rag/documents/{document_id}/download",
            headers=self._auth_headers()
        ) as response:
            if response.status != 200:
                raise Exception(f"Failed to download document: {response.status}")
            content = await response.read()
            content_type = response.headers.get("content-type", "")
            filename = response.headers.get("content-disposition", "")

        print("✅ Download successful!")
        print(f" Content size: {len(content):,} bytes")
        print(f" Content type: {content_type}")
        print(f" Filename header: {filename}")
        # Verify it's a PDF
        if content.startswith(b"%PDF"):
            print("✅ Downloaded file is a valid PDF")
        else:
            print("⚠️ Downloaded file may not be a valid PDF")

    async def test_rag_stats(self) -> Dict[str, Any]:
        """Fetch and print the RAG system statistics.

        Returns:
            The ``stats`` object from the API response.

        Raises:
            Exception: On a non-200 response or a response without a truthy
                ``success`` flag.
        """
        print("📊 Testing RAG system statistics...")
        session = self._require_session()
        async with session.get(
            f"{self.frontend_url}/api/rag/stats",
            headers=self._auth_headers()
        ) as response:
            if response.status != 200:
                error_text = await response.text()
                raise Exception(f"Failed to get stats: {response.status} - {error_text}")
            data = await response.json()
            if not data.get("success"):
                raise Exception(f"Stats request failed: {data.get('error')}")
            stats = data["stats"]

        print("✅ RAG system statistics:")
        # Collections stats
        collections = stats.get("collections", {})
        print(f" 📁 Collections: {collections.get('total', 0)} total, {collections.get('active', 0)} active")
        # Documents stats
        documents = stats.get("documents", {})
        print(f" 📄 Documents: {documents.get('total', 0)} total, {documents.get('processed', 0)} processed, {documents.get('processing', 0)} processing")
        # Storage stats
        storage = stats.get("storage", {})
        print(f" 💾 Storage: {storage.get('total_size_mb', 0):.2f} MB total")
        # Vectors stats
        vectors = stats.get("vectors", {})
        print(f" 🔢 Vectors: {vectors.get('total', 0)} total")
        return stats

    async def _delete_resource(self, label: str, resource_id: str, url: str) -> None:
        """Send one DELETE request and print the outcome; never raises."""
        try:
            session = self._require_session()
            async with session.delete(url, headers=self._auth_headers()) as response:
                if response.status == 200:
                    print(f"✅ Deleted {label} {resource_id}")
                else:
                    print(f"⚠️ Failed to delete {label} {resource_id}: {response.status}")
        except Exception as e:
            # Cleanup is best effort: report the problem and keep going.
            print(f"⚠️ Error deleting {label} {resource_id}: {e}")

    async def cleanup_test_data(self) -> None:
        """Delete the documents and the collection created by this run.

        Failures are printed rather than raised so that one failed deletion
        does not prevent the remaining ones.
        """
        print("🧹 Cleaning up test data...")
        # Delete documents first
        for doc_id in self.test_document_ids:
            await self._delete_resource(
                "document", doc_id, f"{self.frontend_url}/api/rag/documents/{doc_id}"
            )
        # Delete collection
        if self.test_collection_id:
            await self._delete_resource(
                "collection",
                self.test_collection_id,
                f"{self.frontend_url}/api/rag/collections/{self.test_collection_id}",
            )

    async def run_full_test(self) -> bool:
        """Run every step of the integration test, then clean up.

        Returns:
            True when all steps succeeded, False when any step raised. The
            failure is printed together with its traceback.
        """
        print("🚀 Starting Real RAG Integration Test with attention.pdf")
        print("=" * 70)
        try:
            # Step 1: Create collection
            collection = await self.create_test_collection()
            # Step 2: Upload attention.pdf
            document = await self.upload_attention_pdf()
            # Step 3: Wait for processing
            processed_doc = await self.wait_for_processing(document["id"])
            # Step 4: Test document content
            await self.test_document_content(document["id"])
            # Step 5: Test document download
            await self.test_document_download(document["id"])
            # Step 6: Test system stats
            stats = await self.test_rag_stats()
            # The stats sections are optional, so read the total defensively.
            total_documents = stats.get("documents", {}).get("total", 0)
            print("\n" + "=" * 70)
            print("🎉 ALL REAL RAG INTEGRATION TESTS PASSED!")
            print(f"✅ Collection created: {collection['name']}")
            print("✅ Document uploaded: attention.pdf")
            print(f"✅ Processing completed: {processed_doc['status']}")
            print(f"✅ Content extracted: {processed_doc.get('word_count', 0)} words")
            print("✅ Download functional: PDF retrieved")
            print(f"✅ Stats working: {total_documents} documents in system")
            print("\n🎯 Your RAG system is fully operational and ready for production!")
            return True
        except Exception as e:
            print(f"\n❌ Integration test failed: {e}")
            import traceback
            traceback.print_exc()
            return False
        finally:
            # Always cleanup
            try:
                await self.cleanup_test_data()
            except Exception as e:
                print(f"⚠️ Cleanup error: {e}")
async def main():
    """Run the integration test once for each URL configuration.

    Returns:
        0 when every configuration passed, 1 otherwise.
    """
    # One entry per deployment to test against.
    configurations = (
        {
            "name": "External URLs (from host machine)",
            "frontend_url": "http://localhost:53000",
            "backend_url": "http://localhost:58000",
        },
    )

    outcomes = []
    for cfg in configurations:
        frontend, backend = cfg["frontend_url"], cfg["backend_url"]
        print(f"\n🔧 Testing with {cfg['name']}")
        print(f" Frontend: {frontend}")
        print(f" Backend: {backend}")
        async with RealRAGIntegrationTest(
            frontend_url=frontend,
            backend_url=backend
        ) as test_runner:
            outcomes.append(await test_runner.run_full_test())

    if not all(outcomes):
        print("\n❌ Some tests failed!")
        return 1
    print("\n🎉 All test configurations passed!")
    return 0
if __name__ == "__main__":
    # sys is only needed when the file is executed as a script.
    import sys

    # Use main()'s return value (0 or 1) as the process exit status.
    sys.exit(asyncio.run(main()))