#!/usr/bin/env python3
"""
Real RAG Integration Test with attention.pdf

This test creates a real collection and uploads the actual attention.pdf file
"""

import asyncio
import json
import os
import sys
import tempfile
import time
import traceback
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

import aiofiles
import aiohttp

# Minimal single-page PDF uploaded when no attention.pdf can be found on disk.
_MOCK_PDF_BYTES = (
    b"%PDF-1.4\n"
    b"1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n"
    b"2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n"
    b"3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n/Contents 4 0 R\n>>\nendobj\n"
    b"4 0 obj\n<<\n/Length 44\n>>\nstream\nBT\n/F1 12 Tf\n72 720 Td\n"
    b"(Attention Is All You Need - Mock PDF) Tj\nET\nendstream\nendobj\n"
    b"xref\n0 5\n"
    b"0000000000 65535 f \n"
    b"0000000009 00000 n \n"
    b"0000000058 00000 n \n"
    b"0000000115 00000 n \n"
    b"0000000207 00000 n \n"
    b"trailer\n<<\n/Size 5\n/Root 1 0 R\n>>\n"
    b"startxref\n298\n%%EOF"
)


class RealRAGIntegrationTest:
    """Test the complete RAG system with real file uploads.

    Intended to be used as an async context manager: entering opens the
    aiohttp session used by every request, exiting closes it. IDs of the
    collection and documents created during a run are remembered on the
    instance so that ``cleanup_test_data`` can delete them afterwards.
    """

    def __init__(self, frontend_url: str = "http://localhost:53000", backend_url: str = "http://localhost:58000"):
        self.frontend_url = frontend_url
        # Stored only; every request issued by this class targets frontend_url.
        self.backend_url = backend_url
        self.session: Optional[aiohttp.ClientSession] = None
        self.test_collection_id: Optional[str] = None
        self.test_document_ids: list = []
        self.auth_token = "test-auth-token-123"  # Mock token for testing

    async def __aenter__(self):
        """Open the shared HTTP session (60 second total timeout per request)."""
        self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the HTTP session if one was opened."""
        if self.session:
            await self.session.close()
            # Drop the closed session so later misuse fails with a clear error.
            self.session = None

    def _require_session(self) -> aiohttp.ClientSession:
        """Return the open session, or raise if the context manager was not entered."""
        if self.session is None:
            raise RuntimeError(
                "HTTP session is not open; use 'async with RealRAGIntegrationTest(...)'"
            )
        return self.session

    def _auth_headers(self, json_body: bool = False) -> Dict[str, str]:
        """Build the bearer-token headers, optionally declaring a JSON request body."""
        headers = {"Authorization": f"Bearer {self.auth_token}"}
        if json_body:
            headers["Content-Type"] = "application/json"
        return headers

    async def create_test_collection(self) -> Dict[str, Any]:
        """Create a test collection for attention.pdf.

        POSTs to ``/api/rag/collections`` and records the new collection's ID
        in ``self.test_collection_id``.

        Returns:
            The ``collection`` object from the response body.

        Raises:
            Exception: if the response status is not 200 or the body does not
                report ``success``.
        """
        print("📁 Creating test collection for attention.pdf...")

        collection_data = {
            "name": "Attention Paper Collection",
            "description": "Collection for the famous 'Attention Is All You Need' paper",
        }

        session = self._require_session()
        async with session.post(
            f"{self.frontend_url}/api/rag/collections",
            json=collection_data,
            headers=self._auth_headers(json_body=True),
        ) as response:
            if response.status != 200:
                error_text = await response.text()
                raise Exception(f"Failed to create collection: {response.status} - {error_text}")

            data = await response.json()
            if not data.get("success"):
                raise Exception(f"Collection creation failed: {data.get('error')}")

            collection = data["collection"]
            self.test_collection_id = collection["id"]

            print(f"✅ Created collection: {collection['name']} (ID: {self.test_collection_id})")
            print(f" Description: {collection['description']}")
            print(f" Status: {collection['status']}")

            return collection

    async def upload_attention_pdf(self) -> Dict[str, Any]:
        """Upload the attention.pdf file to the test collection.

        Looks for ``attention.pdf`` in a few directories relative to the
        current working directory; if none exists, a small built-in mock PDF
        is uploaded instead. The new document's ID is appended to
        ``self.test_document_ids``.

        Returns:
            The ``document`` object from the response body.

        Raises:
            Exception: if the response status is not 200 or the body does not
                report ``success``.
        """
        print("📄 Uploading attention.pdf...")

        # Find attention.pdf in the current directory or nearby directories
        cwd = Path.cwd()
        search_paths = [
            cwd / "attention.pdf",
            cwd.parent / "attention.pdf",
            cwd / "tests" / "attention.pdf",
            cwd / "backend" / "tests" / "attention.pdf",
        ]
        pdf_path = next((path for path in search_paths if path.exists()), None)

        if pdf_path is None:
            # Fall back to a mock PDF so the upload path can still be exercised
            print("⚠️ attention.pdf not found, creating mock PDF for testing...")
            pdf_content = _MOCK_PDF_BYTES
        else:
            async with aiofiles.open(pdf_path, 'rb') as f:
                pdf_content = await f.read()

        # Prepare form data
        form_data = aiohttp.FormData()
        form_data.add_field('collection_id', str(self.test_collection_id))
        form_data.add_field('file', pdf_content, filename='attention.pdf', content_type='application/pdf')

        print(f" 📊 File size: {len(pdf_content):,} bytes")
        print(f" 📂 Target collection: {self.test_collection_id}")

        session = self._require_session()
        async with session.post(
            f"{self.frontend_url}/api/rag/documents",
            data=form_data,
            headers=self._auth_headers(),
        ) as response:
            if response.status != 200:
                error_text = await response.text()
                raise Exception(f"Failed to upload document: {response.status} - {error_text}")

            data = await response.json()
            if not data.get("success"):
                raise Exception(f"Document upload failed: {data.get('error')}")

            document = data["document"]
            self.test_document_ids.append(document["id"])

            print("✅ Uploaded attention.pdf successfully!")
            print(f" Document ID: {document['id']}")
            print(f" Status: {document['status']}")
            print(f" File size: {document['size']:,} bytes")
            print(f" File type: {document['file_type']}")

            return document

    async def wait_for_processing(self, document_id: str, max_wait: int = 60) -> Dict[str, Any]:
        """Wait for document processing to complete.

        Polls ``/api/rag/documents/{document_id}`` every 2 seconds until the
        document's status is ``processed``, ``indexed`` or ``error``.

        Args:
            document_id: ID of the document to poll.
            max_wait: Maximum number of seconds to keep polling.

        Returns:
            The document object in its terminal state. A document whose
            status is ``error`` is returned as well, not raised; callers must
            inspect ``status``.

        Raises:
            Exception: if no terminal status is seen within ``max_wait`` seconds.
        """
        print("⏳ Waiting for document processing...")

        session = self._require_session()
        headers = self._auth_headers()
        # Monotonic clock: the timeout must not be affected by wall-clock changes.
        deadline = time.monotonic() + max_wait

        while time.monotonic() < deadline:
            async with session.get(
                f"{self.frontend_url}/api/rag/documents/{document_id}",
                headers=headers,
            ) as response:
                # Non-200 or unsuccessful responses are retried until the deadline.
                if response.status == 200:
                    data = await response.json()
                    if data.get("success"):
                        document = data["document"]
                        status = document["status"]

                        print(f" Status: {status}")

                        if status in ["processed", "indexed", "error"]:
                            if status == "error":
                                error = document.get("processing_error", "Unknown error")
                                print(f"❌ Processing failed: {error}")
                            else:
                                print(f"✅ Processing completed with status: {status}")
                                if document.get("word_count"):
                                    print(f" Word count: {document['word_count']:,}")
                                if document.get("character_count"):
                                    print(f" Character count: {document['character_count']:,}")
                                if document.get("vector_count"):
                                    print(f" Vector count: {document['vector_count']:,}")

                            return document

            await asyncio.sleep(2)

        raise Exception(f"Document processing timeout after {max_wait} seconds")

    async def test_document_content(self, document_id: str) -> None:
        """Test that document content was processed correctly.

        Fetches the document and reports on its ``converted_content`` and
        ``metadata`` fields. Missing content or metadata only produces a
        warning line; it does not fail the test.

        Raises:
            Exception: if the response status is not 200 or the body does not
                report ``success``.
        """
        print("📖 Testing processed document content...")

        session = self._require_session()
        async with session.get(
            f"{self.frontend_url}/api/rag/documents/{document_id}",
            headers=self._auth_headers(),
        ) as response:
            if response.status != 200:
                raise Exception(f"Failed to get document: {response.status}")

            data = await response.json()
            # Same success check as the other endpoints, so a failure payload
            # surfaces as a readable error rather than a KeyError.
            if not data.get("success"):
                raise Exception(f"Failed to get document: {data.get('error')}")
            document = data["document"]

            # Check converted content
            converted_content = document.get("converted_content", "")
            if converted_content:
                print(f"✅ Document converted to markdown ({len(converted_content)} characters)")

                # Look for key terms from attention paper
                key_terms = ["attention", "transformer", "neural", "machine translation"]
                lowered_content = converted_content.lower()
                found_terms = [term for term in key_terms if term.lower() in lowered_content]

                if found_terms:
                    print(f"✅ Found relevant terms: {', '.join(found_terms)}")
                else:
                    print("⚠️ No specific attention paper terms found (might be mock content)")

                # Show a snippet (ellipsis only when the content was truncated)
                if len(converted_content) > 200:
                    snippet = converted_content[:200] + "..."
                else:
                    snippet = converted_content
                print(f"📄 Content preview: {snippet}")
            else:
                print("⚠️ No converted content available")

            # Check metadata
            metadata = document.get("metadata", {})
            if metadata:
                print("✅ Document metadata extracted:")
                for key, value in metadata.items():
                    if isinstance(value, list):
                        print(f" {key}: {len(value)} items")
                    else:
                        print(f" {key}: {value}")
            else:
                print("⚠️ No metadata extracted")

    async def test_document_download(self, document_id: str) -> None:
        """Test document download functionality.

        Downloads the document and reports its size, content type and
        ``content-disposition`` header. A body that does not start with
        ``%PDF`` only produces a warning line.

        Raises:
            Exception: if the response status is not 200.
        """
        print("⬇️ Testing document download...")

        session = self._require_session()
        async with session.get(
            f"{self.frontend_url}/api/rag/documents/{document_id}/download",
            headers=self._auth_headers(),
        ) as response:
            if response.status != 200:
                raise Exception(f"Failed to download document: {response.status}")

            content = await response.read()
            content_type = response.headers.get("content-type", "")
            disposition = response.headers.get("content-disposition", "")

            print("✅ Download successful!")
            print(f" Content size: {len(content):,} bytes")
            print(f" Content type: {content_type}")
            print(f" Filename header: {disposition}")

            # Verify it's a PDF
            if content.startswith(b"%PDF"):
                print("✅ Downloaded file is a valid PDF")
            else:
                print("⚠️ Downloaded file may not be a valid PDF")

    async def test_rag_stats(self) -> Dict[str, Any]:
        """Test RAG system statistics.

        Fetches ``/api/rag/stats`` and prints the collections, documents,
        storage and vectors sections, defaulting any missing counter to 0.

        Returns:
            The ``stats`` object from the response body.

        Raises:
            Exception: if the response status is not 200 or the body does not
                report ``success``.
        """
        print("📊 Testing RAG system statistics...")

        session = self._require_session()
        async with session.get(
            f"{self.frontend_url}/api/rag/stats",
            headers=self._auth_headers(),
        ) as response:
            if response.status != 200:
                error_text = await response.text()
                raise Exception(f"Failed to get stats: {response.status} - {error_text}")

            data = await response.json()
            if not data.get("success"):
                raise Exception(f"Stats request failed: {data.get('error')}")

            stats = data["stats"]

            print("✅ RAG system statistics:")

            # Collections stats
            collections = stats.get("collections", {})
            print(f" 📁 Collections: {collections.get('total', 0)} total, {collections.get('active', 0)} active")

            # Documents stats
            documents = stats.get("documents", {})
            print(f" 📄 Documents: {documents.get('total', 0)} total, {documents.get('processed', 0)} processed, {documents.get('processing', 0)} processing")

            # Storage stats
            storage = stats.get("storage", {})
            print(f" 💾 Storage: {storage.get('total_size_mb', 0):.2f} MB total")

            # Vectors stats
            vectors = stats.get("vectors", {})
            print(f" 🔢 Vectors: {vectors.get('total', 0)} total")

            return stats

    async def cleanup_test_data(self) -> None:
        """Clean up test collection and documents.

        Best effort: each deletion is attempted independently and any failure
        is printed as a warning instead of being raised, so one failed delete
        does not prevent the remaining ones.
        """
        print("🧹 Cleaning up test data...")

        headers = self._auth_headers()

        # Delete documents first
        for doc_id in self.test_document_ids:
            try:
                async with self._require_session().delete(
                    f"{self.frontend_url}/api/rag/documents/{doc_id}",
                    headers=headers,
                ) as response:
                    if response.status == 200:
                        print(f"✅ Deleted document {doc_id}")
                    else:
                        print(f"⚠️ Failed to delete document {doc_id}: {response.status}")
            except Exception as e:
                print(f"⚠️ Error deleting document {doc_id}: {e}")

        # Delete collection
        if self.test_collection_id:
            try:
                async with self._require_session().delete(
                    f"{self.frontend_url}/api/rag/collections/{self.test_collection_id}",
                    headers=headers,
                ) as response:
                    if response.status == 200:
                        print(f"✅ Deleted collection {self.test_collection_id}")
                    else:
                        print(f"⚠️ Failed to delete collection {self.test_collection_id}: {response.status}")
            except Exception as e:
                print(f"⚠️ Error deleting collection {self.test_collection_id}: {e}")

    async def run_full_test(self) -> bool:
        """Run the complete integration test.

        Runs, in order: collection creation, upload, processing wait, content
        check, download check and stats check. Cleanup is always attempted
        afterwards, whether or not the steps succeeded.

        Returns:
            True if every step succeeded, False if any step raised or the
            document finished processing with status ``error``.
        """
        print("🚀 Starting Real RAG Integration Test with attention.pdf")
        print("=" * 70)

        try:
            # Step 1: Create collection
            collection = await self.create_test_collection()

            # Step 2: Upload attention.pdf
            document = await self.upload_attention_pdf()

            # Step 3: Wait for processing
            processed_doc = await self.wait_for_processing(document["id"])
            # wait_for_processing returns (rather than raises) on a terminal
            # "error" status; that must not be reported as a passing run.
            if processed_doc.get("status") == "error":
                error = processed_doc.get("processing_error", "Unknown error")
                raise Exception(f"Document processing failed: {error}")

            # Step 4: Test document content
            await self.test_document_content(document["id"])

            # Step 5: Test document download
            await self.test_document_download(document["id"])

            # Step 6: Test system stats
            stats = await self.test_rag_stats()
            # Same defaults as test_rag_stats, so a missing section cannot
            # turn an otherwise successful run into a KeyError failure.
            total_documents = stats.get("documents", {}).get("total", 0)

            print("\n" + "=" * 70)
            print("🎉 ALL REAL RAG INTEGRATION TESTS PASSED!")
            print(f"✅ Collection created: {collection['name']}")
            print("✅ Document uploaded: attention.pdf")
            print(f"✅ Processing completed: {processed_doc['status']}")
            print(f"✅ Content extracted: {processed_doc.get('word_count', 0)} words")
            print("✅ Download functional: PDF retrieved")
            print(f"✅ Stats working: {total_documents} documents in system")
            print("\n🎯 Your RAG system is fully operational and ready for production!")

            return True

        except Exception as e:
            # Top-level boundary of the test run: report the failure and signal it
            # through the return value instead of propagating.
            print(f"\n❌ Integration test failed: {e}")
            traceback.print_exc()
            return False

        finally:
            # Always cleanup
            try:
                await self.cleanup_test_data()
            except Exception as e:
                print(f"⚠️ Cleanup error: {e}")


async def main():
    """Main test runner.

    Runs the full integration test once per entry in ``test_configs``.

    Returns:
        0 if every configuration passed, 1 otherwise (used as the process
        exit code).
    """
    # Test with different URL configurations
    test_configs = [
        {
            "frontend_url": "http://localhost:53000",
            "backend_url": "http://localhost:58000",
            "name": "External URLs (from host machine)",
        }
    ]

    all_passed = True

    for config in test_configs:
        print(f"\n🔧 Testing with {config['name']}")
        print(f" Frontend: {config['frontend_url']}")
        print(f" Backend: {config['backend_url']}")

        async with RealRAGIntegrationTest(
            frontend_url=config["frontend_url"],
            backend_url=config["backend_url"],
        ) as test_runner:
            passed = await test_runner.run_full_test()
            if not passed:
                all_passed = False

    if all_passed:
        print("\n🎉 All test configurations passed!")
        return 0
    else:
        print("\n❌ Some tests failed!")
        return 1


if __name__ == "__main__":
    exit_code = asyncio.run(main())
    sys.exit(exit_code)