""" RAG Integration Test Tests the full RAG (Retrieval Augmented Generation) system end-to-end including collections, document upload/processing, and search functionality """ import asyncio import json import time import httpx import tempfile import os from typing import Dict, Any, List from io import BytesIO class RAGIntegrationTest: """Test the complete RAG system through API endpoints""" def __init__(self, base_url: str = "http://localhost:58000", frontend_url: str = "http://localhost:53000"): self.base_url = base_url self.frontend_url = frontend_url self.client = httpx.AsyncClient(timeout=30.0) self.test_collection_id = None self.test_document_ids = [] self.auth_token = None async def setup_auth(self): """Setup authentication for testing""" print("๐Ÿ” Setting up authentication...") # For mock tests, we'll skip actual auth and use a dummy token self.auth_token = "test-token-123" print("โœ“ Mock authentication token set") async def test_rag_module_loaded(self): """Test that RAG module is loaded and operational""" print("๐Ÿงช Testing RAG module loading...") response = await self.client.get(f"{self.base_url}/api/v1/modules/") assert response.status_code == 200 data = response.json() modules_by_name = {mod["name"]: mod for mod in data["modules"]} # Check RAG module is loaded assert "rag" in modules_by_name, "RAG module not found in loaded modules" rag_module = modules_by_name["rag"] assert rag_module.get("initialized", False), "RAG module should be initialized" print("โœ“ RAG module is loaded and initialized") # Check RAG module stats if "stats" in rag_module: stats = rag_module["stats"] expected_fields = ["documents_indexed", "searches_performed", "average_search_time"] for field in expected_fields: assert field in stats, f"RAG module missing {field} stat" print(f"โœ“ RAG module stats: {len(stats)} metrics available") print(f" Documents indexed: {stats.get('documents_indexed', 0)}") print(f" Searches performed: {stats.get('searches_performed', 0)}") print(f" Cache hits: {stats.get('cache_hits', 0)}") return rag_module async def test_content_module_integration(self): """Test content module integration with markitdown""" print("\n๐Ÿงช Testing content module integration...") response = await self.client.get(f"{self.base_url}/api/v1/modules/") assert response.status_code == 200 data = response.json() modules_by_name = {mod["name"]: mod for mod in data["modules"]} # Check content module is loaded assert "content" in modules_by_name, "Content module not found in loaded modules" content_module = modules_by_name["content"] assert content_module.get("initialized", False), "Content module should be initialized" print("โœ“ Content module is loaded and initialized") # Check content module stats for markitdown integration if "stats" in content_module: stats = content_module["stats"] expected_fields = ["documents_processed", "conversion_success_rate", "supported_formats"] for field in expected_fields: if field in stats: print(f"โœ“ Content stat '{field}': {stats[field]}") return content_module async def test_collection_management(self): """Test collection CRUD operations""" print("\n๐Ÿงช Testing collection management...") # Test GET collections (should start empty or with mock data) response = await self.client.get(f"{self.frontend_url}/api/rag/collections") assert response.status_code == 200 initial_data = response.json() assert "success" in initial_data assert initial_data["success"] is True initial_count = len(initial_data.get("collections", [])) print(f"โœ“ Initial collections count: {initial_count}") # Test POST - Create new collection test_collection = { "name": "Test Integration Collection", "description": "Created during RAG integration testing" } response = await self.client.post( f"{self.frontend_url}/api/rag/collections", json=test_collection, headers={"Authorization": f"Bearer {self.auth_token}"} ) assert response.status_code == 200 create_data = response.json() assert create_data["success"] is True assert "collection" in create_data created_collection = create_data["collection"] self.test_collection_id = created_collection["id"] assert created_collection["name"] == test_collection["name"] assert created_collection["description"] == test_collection["description"] assert created_collection["document_count"] == 0 assert created_collection["status"] == "active" print(f"โœ“ Created test collection: {self.test_collection_id}") # Test GET specific collection response = await self.client.get(f"{self.frontend_url}/api/rag/collections/{self.test_collection_id}") assert response.status_code == 200 get_data = response.json() assert get_data["success"] is True assert get_data["collection"]["id"] == self.test_collection_id print("โœ“ Collection retrieval successful") return created_collection async def test_document_upload_processing(self): """Test document upload and processing""" print("\n๐Ÿงช Testing document upload and processing...") assert self.test_collection_id, "Test collection must be created first" # Create test documents test_documents = [ { "name": "test_document.txt", "content": "This is a test document for RAG integration testing. It contains sample text for processing.", "content_type": "text/plain" }, { "name": "test_document.md", "content": "# Test Markdown Document\n\nThis is a **markdown** document for testing.\n\n## Features\n- RAG integration\n- Document processing\n- Content conversion", "content_type": "text/markdown" }, { "name": "test_data.json", "content": '{"name": "test", "type": "integration", "features": ["rag", "upload", "processing"]}', "content_type": "application/json" } ] uploaded_documents = [] for doc in test_documents: print(f" Uploading {doc['name']}...") # Create form data files = { "file": (doc["name"], BytesIO(doc["content"].encode()), doc["content_type"]) } data = { "collection_id": self.test_collection_id } response = await self.client.post( f"{self.frontend_url}/api/rag/documents", files=files, data=data, headers={"Authorization": f"Bearer {self.auth_token}"} ) assert response.status_code == 200 upload_data = response.json() assert upload_data["success"] is True assert "document" in upload_data uploaded_doc = upload_data["document"] self.test_document_ids.append(uploaded_doc["id"]) uploaded_documents.append(uploaded_doc) # Verify document properties assert uploaded_doc["original_filename"] == doc["name"] assert uploaded_doc["collection_id"] == self.test_collection_id assert uploaded_doc["status"] == "processed" assert uploaded_doc["word_count"] > 0 print(f" โœ“ {doc['name']} uploaded successfully (ID: {uploaded_doc['id']})") print(f"โœ“ Successfully uploaded {len(uploaded_documents)} documents") return uploaded_documents async def test_document_browsing_search(self): """Test document browsing and search functionality""" print("\n๐Ÿงช Testing document browsing and search...") # Test GET all documents response = await self.client.get( f"{self.frontend_url}/api/rag/documents", headers={"Authorization": f"Bearer {self.auth_token}"} ) assert response.status_code == 200 browse_data = response.json() assert browse_data["success"] is True all_documents = browse_data["documents"] # Should have at least our test documents test_docs_found = [doc for doc in all_documents if doc["id"] in self.test_document_ids] assert len(test_docs_found) >= 3, f"Expected at least 3 test documents, found {len(test_docs_found)}" print(f"โœ“ Found {len(all_documents)} total documents, {len(test_docs_found)} are our test documents") # Test filtering by collection response = await self.client.get( f"{self.frontend_url}/api/rag/documents?collection_id={self.test_collection_id}", headers={"Authorization": f"Bearer {self.auth_token}"} ) assert response.status_code == 200 filtered_data = response.json() filtered_documents = filtered_data["documents"] # All filtered documents should belong to our test collection for doc in filtered_documents: assert doc["collection_id"] == self.test_collection_id print(f"โœ“ Collection filtering works: {len(filtered_documents)} documents in test collection") return all_documents async def test_document_download(self): """Test document download functionality""" print("\n๐Ÿงช Testing document download...") assert len(self.test_document_ids) > 0, "Test documents must be uploaded first" # Test downloading the first test document test_doc_id = self.test_document_ids[0] response = await self.client.get( f"{self.frontend_url}/api/rag/documents/{test_doc_id}/download", headers={"Authorization": f"Bearer {self.auth_token}"} ) assert response.status_code == 200 # Check content type header content_type = response.headers.get("content-type") assert content_type is not None # Check content disposition header content_disposition = response.headers.get("content-disposition") assert content_disposition is not None assert "attachment" in content_disposition # Check content content = response.content assert len(content) > 0 print(f"โœ“ Document download successful: {len(content)} bytes") print(f" Content-Type: {content_type}") print(f" Content-Disposition: {content_disposition}") return True async def test_document_deletion(self): """Test document deletion""" print("\n๐Ÿงช Testing document deletion...") assert len(self.test_document_ids) > 0, "Test documents must exist for deletion" # Delete the last test document doc_to_delete = self.test_document_ids[-1] response = await self.client.delete( f"{self.frontend_url}/api/rag/documents/{doc_to_delete}", headers={"Authorization": f"Bearer {self.auth_token}"} ) assert response.status_code == 200 delete_data = response.json() assert delete_data["success"] is True print(f"โœ“ Document {doc_to_delete} deleted successfully") # Verify document is no longer accessible response = await self.client.get( f"{self.frontend_url}/api/rag/documents", headers={"Authorization": f"Bearer {self.auth_token}"} ) remaining_docs = response.json()["documents"] deleted_doc_ids = [doc["id"] for doc in remaining_docs] assert doc_to_delete not in deleted_doc_ids, "Deleted document should not appear in listing" # Remove from our test tracking self.test_document_ids.remove(doc_to_delete) print("โœ“ Document deletion verified") return True async def test_collection_deletion(self): """Test collection deletion (cleanup)""" print("\n๐Ÿงช Testing collection deletion...") assert self.test_collection_id, "Test collection must exist for deletion" # First, delete any remaining documents in the collection for doc_id in self.test_document_ids[:]: # Create a copy to avoid modification during iteration response = await self.client.delete( f"{self.frontend_url}/api/rag/documents/{doc_id}", headers={"Authorization": f"Bearer {self.auth_token}"} ) if response.status_code == 200: self.test_document_ids.remove(doc_id) print(f"โœ“ Cleaned up remaining documents") # Now delete the collection response = await self.client.delete( f"{self.frontend_url}/api/rag/collections/{self.test_collection_id}", headers={"Authorization": f"Bearer {self.auth_token}"} ) # Note: Mock implementation might return error if collection has documents # This is acceptable behavior for the test if response.status_code == 200: delete_data = response.json() assert delete_data["success"] is True print(f"โœ“ Test collection {self.test_collection_id} deleted successfully") elif response.status_code == 400: error_data = response.json() if "documents" in error_data.get("error", "").lower(): print("โœ“ Collection deletion correctly prevented when documents exist") else: raise AssertionError(f"Unexpected error: {error_data}") else: raise AssertionError(f"Unexpected status code: {response.status_code}") return True async def test_frontend_api_endpoints(self): """Test all frontend API endpoints""" print("\n๐Ÿงช Testing frontend API endpoint availability...") endpoints_to_test = [ "/api/rag/collections", f"/api/rag/collections/{self.test_collection_id or 'test'}", "/api/rag/documents", ] for endpoint in endpoints_to_test: try: response = await self.client.get(f"{self.frontend_url}{endpoint}") # We expect either 200 (success) or 404 (not found) for valid endpoints assert response.status_code in [200, 404], f"Endpoint {endpoint} returned {response.status_code}" print(f"โœ“ Endpoint {endpoint} is accessible") except Exception as e: print(f"โš  Endpoint {endpoint} error: {e}") return True async def test_performance_metrics(self): """Test RAG system performance""" print("\n๐Ÿงช Testing RAG system performance...") # Test API response times endpoints = [ "/api/rag/collections", "/api/rag/documents" ] for endpoint in endpoints: start_time = time.time() response = await self.client.get(f"{self.frontend_url}{endpoint}") response_time = time.time() - start_time assert response.status_code == 200, f"Endpoint {endpoint} failed" assert response_time < 2.0, f"Endpoint {endpoint} too slow: {response_time:.2f}s" print(f"โœ“ {endpoint} response time: {response_time:.3f}s") # Test concurrent requests start_time = time.time() tasks = [] for i in range(3): task = self.client.get(f"{self.frontend_url}/api/rag/collections") tasks.append(task) responses = await asyncio.gather(*tasks) total_time = time.time() - start_time for resp in responses: assert resp.status_code == 200 print(f"โœ“ 3 concurrent requests completed in {total_time:.3f}s") return True async def run_all_tests(self): """Run all RAG integration tests""" print("๐Ÿš€ Starting RAG Integration Tests") print("=" * 60) try: # Setup await self.setup_auth() # Test 1: Module Loading rag_module = await self.test_rag_module_loaded() content_module = await self.test_content_module_integration() # Test 2: Collection Management test_collection = await self.test_collection_management() # Test 3: Document Upload and Processing uploaded_docs = await self.test_document_upload_processing() # Test 4: Document Browsing and Search all_docs = await self.test_document_browsing_search() # Test 5: Document Download await self.test_document_download() # Test 6: Document Deletion await self.test_document_deletion() # Test 7: Frontend API Endpoints await self.test_frontend_api_endpoints() # Test 8: Performance await self.test_performance_metrics() # Test 9: Cleanup (Collection Deletion) await self.test_collection_deletion() print("\n" + "=" * 60) print("๐ŸŽ‰ ALL RAG INTEGRATION TESTS PASSED!") print("โœ“ RAG module operational") print("โœ“ Content module with markitdown integration working") print("โœ“ Collection management (CRUD) functional") print(f"โœ“ Document upload/processing tested with {len(uploaded_docs)} documents") print("โœ“ Document browsing and search working") print("โœ“ Document download functionality verified") print("โœ“ Document deletion working correctly") print("โœ“ Frontend API endpoints accessible") print("โœ“ Performance within acceptable limits") print("โœ“ System cleanup successful") return True except Exception as e: print(f"\nโŒ RAG integration test failed: {e}") import traceback traceback.print_exc() # Attempt cleanup on failure try: if self.test_document_ids: print("๐Ÿงน Attempting cleanup of test documents...") for doc_id in self.test_document_ids: await self.client.delete( f"{self.frontend_url}/api/rag/documents/{doc_id}", headers={"Authorization": f"Bearer {self.auth_token}"} ) if self.test_collection_id: print("๐Ÿงน Attempting cleanup of test collection...") await self.client.delete( f"{self.frontend_url}/api/rag/collections/{self.test_collection_id}", headers={"Authorization": f"Bearer {self.auth_token}"} ) except: print("โš  Cleanup failed - manual cleanup may be required") return False finally: await self.client.aclose() async def run_rag_quick_test(): """Run a quick RAG system verification""" print("๐Ÿ”ง Quick RAG System Verification") print("-" * 40) async with httpx.AsyncClient(timeout=10.0) as client: try: # Test frontend RAG endpoints response = await client.get("http://localhost:53000/api/rag/collections") if response.status_code == 200: data = response.json() collection_count = len(data.get("collections", [])) print(f"โœ… RAG Collections API responding: {collection_count} collections") else: print(f"โš  RAG Collections API returned {response.status_code}") response = await client.get("http://localhost:53000/api/rag/documents") if response.status_code == 200: data = response.json() document_count = len(data.get("documents", [])) print(f"โœ… RAG Documents API responding: {document_count} documents") else: print(f"โš  RAG Documents API returned {response.status_code}") # Test backend modules response = await client.get("http://localhost:58000/api/v1/modules/") if response.status_code == 200: data = response.json() modules = {mod["name"]: mod for mod in data.get("modules", [])} if "rag" in modules: rag_status = "๐ŸŸข" if modules["rag"].get("initialized") else "๐ŸŸก" print(f" {rag_status} RAG module: v{modules['rag'].get('version', 'unknown')}") if "content" in modules: content_status = "๐ŸŸข" if modules["content"].get("initialized") else "๐ŸŸก" print(f" {content_status} Content module: v{modules['content'].get('version', 'unknown')}") print("โœ… RAG system basic verification complete") return True except Exception as e: print(f"โŒ RAG system verification failed: {e}") return False if __name__ == "__main__": import sys if len(sys.argv) > 1 and sys.argv[1] == "--quick": # Quick test result = asyncio.run(run_rag_quick_test()) sys.exit(0 if result else 1) else: # Full RAG integration test test_runner = RAGIntegrationTest() result = asyncio.run(test_runner.run_all_tests()) sys.exit(0 if result else 1)