clean commit

2025-12-17 23:44:24 +01:00 · 2025-08-19 09:50:15 +02:00
parent 3c5cca407d
commit 69a947fa0b
249 changed files with 65688 additions and 0 deletions
--- a/backend/app/services/rag_service.py
+++ b/backend/app/services/rag_service.py
@@ -0,0 +1,789 @@
+"""
+RAG Service
+Handles all RAG (Retrieval Augmented Generation) operations including
+collections, documents, processing, and vector operations
+"""
+
+import os
+import uuid
+import mimetypes
+import logging
+from typing import List, Optional, Dict, Any, Tuple
+from pathlib import Path
+from datetime import datetime
+import hashlib
+import asyncio
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select, delete, func, and_, or_
+from sqlalchemy.orm import selectinload
+
+from app.models.rag_collection import RagCollection
+from app.models.rag_document import RagDocument
+from app.utils.exceptions import APIException
+
+logger = logging.getLogger(__name__)
+
+
+class RAGService:
+    """Service for RAG operations"""
+    
+    def __init__(self, db: AsyncSession):
+        """Bind the service to a database session and make sure the upload root exists."""
+        upload_root = Path("storage/rag_documents")
+        # Created eagerly (including parents) so later uploads can rely on it being present.
+        upload_root.mkdir(parents=True, exist_ok=True)
+        self.upload_dir = upload_root
+        self.db = db
+    
+    # Collection Operations
+    
+    async def create_collection(self, name: str, description: Optional[str] = None) -> RagCollection:
+        """Create a new RAG collection row and its backing Qdrant collection.
+
+        Args:
+            name: Human-readable collection name; must not clash with another
+                active collection.
+            description: Optional free-text description.
+
+        Returns:
+            The committed and refreshed ``RagCollection`` row.
+
+        Raises:
+            APIException: 400 ``COLLECTION_EXISTS`` when an active collection
+                with the same name already exists.
+        """
+        # Only active rows are considered, so the name of a soft-deleted collection can be reused.
+        stmt = select(RagCollection).where(RagCollection.name == name, RagCollection.is_active == True)
+        existing = await self.db.scalar(stmt)
+        if existing:
+            raise APIException(status_code=400, error_code="COLLECTION_EXISTS", detail=f"Collection '{name}' already exists")
+
+        # Build a unique Qdrant collection name. Every non-alphanumeric character
+        # (not only spaces and hyphens) is mapped to "_" so that names containing
+        # "/", ":", "?" and similar cannot leak into the vector-store identifier.
+        slug = "".join(ch if ch.isalnum() else "_" for ch in name.lower())
+        qdrant_name = f"rag_{slug}_{uuid.uuid4().hex[:8]}"
+
+        collection = RagCollection(
+            name=name,
+            description=description,
+            qdrant_collection_name=qdrant_name,
+            status='active'
+        )
+
+        self.db.add(collection)
+        await self.db.commit()
+        await self.db.refresh(collection)
+
+        # The helper logs and swallows its own failures, so the database row is
+        # kept even when the Qdrant collection could not be created.
+        await self._create_qdrant_collection(qdrant_name)
+
+        return collection
+    
+    async def get_collections(self, skip: int = 0, limit: int = 100) -> List[RagCollection]:
+        """Return one page of active collections, newest first."""
+        query = select(RagCollection).where(RagCollection.is_active == True)
+        query = query.order_by(RagCollection.created_at.desc())
+        query = query.offset(skip).limit(limit)
+        rows = await self.db.execute(query)
+        return rows.scalars().all()
+    
+    async def get_collection(self, collection_id: int) -> Optional[RagCollection]:
+        """Fetch a single active collection by primary key."""
+        # Inactive (soft-deleted) collections are filtered out by the second condition.
+        conditions = (
+            RagCollection.id == collection_id,
+            RagCollection.is_active == True,
+        )
+        return await self.db.scalar(select(RagCollection).where(*conditions))
+    
+    async def get_all_collections(self, skip: int = 0, limit: int = 100) -> List[dict]:
+        """List collections using Qdrant as the source of truth, enriched with PostgreSQL metadata.
+
+        Each Qdrant collection is reported once. When a matching PostgreSQL row
+        exists (matched on ``qdrant_collection_name``) its id/name/description/
+        status/timestamps are used and the counts come from Qdrant; otherwise
+        the collection is reported as ``external`` with an ``ext_<name>`` id.
+        If the RAG module is unavailable, or anything fails while talking to
+        Qdrant, the method falls back to the PostgreSQL-managed collections.
+
+        Args:
+            skip: Number of entries to skip.
+            limit: Maximum number of entries to return.
+
+        Returns:
+            A list of plain dicts with the keys ``id``, ``name``, ``description``,
+            ``document_count``, ``size_bytes``, ``vector_count``, ``status``,
+            ``created_at``, ``updated_at``, ``is_active``,
+            ``qdrant_collection_name``, ``is_managed`` and ``source``.
+        """
+        logger.info("Getting all RAG collections from Qdrant (source of truth)")
+
+        all_collections = []
+
+        try:
+            # Imported lazily, as everywhere else in this service.
+            from app.services.module_manager import module_manager
+            rag_module = module_manager.get_module("rag")
+
+            if not rag_module or not hasattr(rag_module, 'qdrant_client'):
+                logger.warning("RAG module or Qdrant client not available")
+                # Fallback to PostgreSQL only
+                return await self._get_managed_collection_dicts(skip, limit)
+
+            qdrant_collection_names = await rag_module._get_collections_safely()
+            logger.info(f"Found {len(qdrant_collection_names)} collections in Qdrant")
+
+            # Only the first 1000 active rows are loaded for enrichment; Qdrant
+            # collections whose row falls outside that window are reported as external.
+            db_metadata = await self.get_collections(skip=0, limit=1000)
+            metadata_by_name = {col.qdrant_collection_name: col for col in db_metadata}
+
+            for qdrant_name in qdrant_collection_names:
+                logger.info(f"Processing Qdrant collection: {qdrant_name}")
+
+                try:
+                    collection_info = await rag_module._get_collection_info_safely(qdrant_name)
+                    point_count = collection_info.get("points_count", 0)
+                    vector_size = collection_info.get("vector_size", 384)
+
+                    # Rough size estimate: points * dimensions * 4 bytes, plus 20% for metadata.
+                    estimated_size = int(point_count * vector_size * 4 * 1.2)
+
+                    db_metadata_entry = metadata_by_name.get(qdrant_name)
+
+                    if db_metadata_entry:
+                        # PostgreSQL supplies identity and status; Qdrant supplies counts and size.
+                        # Note that document_count is the Qdrant point count here.
+                        collection_data = self._managed_collection_to_dict(db_metadata_entry)
+                        collection_data.update(
+                            document_count=point_count,
+                            size_bytes=estimated_size,
+                            vector_count=point_count,
+                            qdrant_collection_name=qdrant_name,
+                        )
+                    else:
+                        # Collection exists in Qdrant but not in our metadata
+                        collection_data = self._external_collection_to_dict(
+                            qdrant_name,
+                            description=f"External Qdrant collection (vectors: {vector_size}d, points: {point_count})",
+                            status="active",
+                            point_count=point_count,
+                            size_bytes=estimated_size,
+                        )
+
+                except Exception as e:
+                    logger.error(f"Error processing collection {qdrant_name}: {e}")
+                    # Still report the collection, flagged as errored and with zeroed counts.
+                    collection_data = self._external_collection_to_dict(
+                        qdrant_name,
+                        description=f"External Qdrant collection (error loading details: {str(e)})",
+                        status="error",
+                    )
+
+                all_collections.append(collection_data)
+
+        except Exception as e:
+            logger.error(f"Error fetching collections from Qdrant: {e}")
+            # Fallback to managed collections only
+            return await self._get_managed_collection_dicts(skip, limit)
+
+        # Pagination is applied in memory on the Qdrant-derived list.
+        if skip > 0 or limit < len(all_collections):
+            all_collections = all_collections[skip:skip + limit]
+
+        logger.info(f"Total collections returned: {len(all_collections)}")
+        return all_collections
+
+    async def _get_managed_collection_dicts(self, skip: int, limit: int) -> List[dict]:
+        """PostgreSQL-only fallback: one page of managed collections as API dicts."""
+        managed_collections = await self.get_collections(skip=skip, limit=limit)
+        return [self._managed_collection_to_dict(collection) for collection in managed_collections]
+
+    @staticmethod
+    def _managed_collection_to_dict(collection: RagCollection) -> dict:
+        """Serialize a PostgreSQL-managed collection row into the API dict shape."""
+        return {
+            "id": collection.id,
+            "name": collection.name,
+            "description": collection.description or "",
+            "document_count": collection.document_count or 0,
+            "size_bytes": collection.size_bytes or 0,
+            "vector_count": collection.vector_count or 0,
+            "status": collection.status,
+            "created_at": collection.created_at.isoformat() if collection.created_at else "",
+            "updated_at": collection.updated_at.isoformat() if collection.updated_at else "",
+            "is_active": collection.is_active,
+            "qdrant_collection_name": collection.qdrant_collection_name,
+            "is_managed": True,
+            "source": "managed"
+        }
+
+    @staticmethod
+    def _external_collection_to_dict(
+        qdrant_name: str,
+        description: str,
+        status: str,
+        point_count: int = 0,
+        size_bytes: int = 0,
+    ) -> dict:
+        """Build the API dict for a Qdrant collection that has no PostgreSQL metadata row."""
+        # No stored timestamps exist for external collections, so "now" is reported for both.
+        now = datetime.utcnow().isoformat()
+        return {
+            "id": f"ext_{qdrant_name}",  # External identifier
+            "name": qdrant_name,
+            "description": description,
+            "document_count": point_count,
+            "size_bytes": size_bytes,
+            "vector_count": point_count,
+            "status": status,
+            "created_at": now,
+            "updated_at": now,
+            "is_active": True,
+            "qdrant_collection_name": qdrant_name,
+            "is_managed": False,
+            "source": "external"
+        }
+    
+    async def delete_collection(self, collection_id: int, cascade: bool = True) -> bool:
+        """Soft-delete a collection and, when ``cascade`` is set, all of its documents.
+
+        Database state is committed first; stored files and the Qdrant
+        collection are removed afterwards on a best-effort basis, so a failed
+        commit never leaves rows pointing at files that were already deleted.
+
+        Args:
+            collection_id: Primary key of the collection.
+            cascade: When False, refuse to delete a collection that still has
+                non-deleted documents.
+
+        Returns:
+            True when the collection was deleted, False when no active
+            collection with that id exists.
+
+        Raises:
+            APIException: 400 ``COLLECTION_HAS_DOCUMENTS`` when documents exist
+                and ``cascade`` is False.
+        """
+        collection = await self.get_collection(collection_id)
+        if not collection:
+            return False
+
+        # Get all documents in the collection
+        stmt = select(RagDocument).where(
+            RagDocument.collection_id == collection_id,
+            RagDocument.is_deleted == False
+        )
+        result = await self.db.execute(stmt)
+        documents = result.scalars().all()
+
+        if documents and not cascade:
+            raise APIException(
+                status_code=400,
+                error_code="COLLECTION_HAS_DOCUMENTS",
+                detail=f"Cannot delete collection with {len(documents)} documents. Set cascade=true to delete documents along with collection."
+            )
+
+        # Snapshot everything needed after the commit: a session configured to
+        # expire objects on commit would otherwise need a lazy reload, which an
+        # async session cannot do implicitly.
+        qdrant_name = collection.qdrant_collection_name
+        file_paths = [document.file_path for document in documents]
+
+        # One timestamp for the whole operation keeps the rows consistent.
+        now = datetime.utcnow()
+        for document in documents:
+            document.is_deleted = True
+            document.deleted_at = now
+            document.updated_at = now  # same bookkeeping as delete_document
+
+        # Soft delete collection
+        collection.is_active = False
+        collection.updated_at = now
+
+        await self.db.commit()
+
+        # Physical files are removed only after the soft delete is durable.
+        for file_path in file_paths:
+            try:
+                if file_path and os.path.exists(file_path):
+                    os.remove(file_path)
+            except OSError as e:
+                logger.warning(f"Failed to delete file {file_path}: {e}")
+
+        # Delete Qdrant collection
+        try:
+            await self._delete_qdrant_collection(qdrant_name)
+        except Exception as e:
+            logger.warning(f"Failed to delete Qdrant collection {qdrant_name}: {e}")
+
+        return True
+    
+    # Document Operations
+    
+    async def upload_document(
+        self,
+        collection_id: int,
+        file_content: bytes,
+        filename: str,
+        content_type: Optional[str] = None
+    ) -> RagDocument:
+        """Store an uploaded file, record it in the database and queue it for processing.
+
+        Args:
+            collection_id: Target collection; must exist and be active.
+            file_content: Raw bytes of the uploaded file.
+            filename: Original client-side filename; its extension decides
+                whether the upload is accepted.
+            content_type: MIME type supplied by the client; guessed from the
+                filename when omitted.
+
+        Returns:
+            The new ``RagDocument`` (status ``processing``) with its
+            ``collection`` relationship eagerly loaded.
+
+        Raises:
+            APIException: 404 ``COLLECTION_NOT_FOUND`` or 400
+                ``UNSUPPORTED_FILE_TYPE``.
+        """
+        # Verify collection exists
+        collection = await self.get_collection(collection_id)
+        if not collection:
+            raise APIException(status_code=404, error_code="COLLECTION_NOT_FOUND", detail="Collection not found")
+
+        # Validate file type
+        file_ext = Path(filename).suffix.lower()
+        if not self._is_supported_file_type(file_ext):
+            # The list mirrors every extension accepted by _is_supported_file_type.
+            raise APIException(
+                status_code=400,
+                error_code="UNSUPPORTED_FILE_TYPE",
+                detail=f"Unsupported file type: {file_ext}. Supported: .pdf, .docx, .doc, .txt, .md, .html, .json, .csv, .xlsx, .xls"
+            )
+
+        # Files live under <upload_dir>/<collection_id>/<generated name>.
+        safe_filename = self._generate_safe_filename(filename)
+        file_path = self.upload_dir / f"{collection_id}" / safe_filename
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+        file_path.write_bytes(file_content)
+
+        # Detect MIME type
+        if not content_type:
+            content_type, _ = mimetypes.guess_type(filename)
+
+        # Create document record
+        document = RagDocument(
+            collection_id=collection_id,
+            filename=safe_filename,
+            original_filename=filename,
+            file_path=str(file_path),
+            file_type=file_ext.lstrip('.'),
+            file_size=len(file_content),
+            mime_type=content_type,
+            status='processing'
+        )
+
+        self.db.add(document)
+        try:
+            await self.db.commit()
+        except Exception:
+            # The row was not stored, so do not leave an orphan file behind.
+            try:
+                file_path.unlink()
+            except OSError as cleanup_error:
+                logger.warning(f"Failed to remove orphaned upload {file_path}: {cleanup_error}")
+            raise
+        await self.db.refresh(document)
+
+        # Reload with the collection relationship so callers never trigger a lazy load.
+        stmt = select(RagDocument).options(selectinload(RagDocument.collection)).where(RagDocument.id == document.id)
+        result = await self.db.execute(stmt)
+        document = result.scalar_one()
+
+        # Add document to processing queue
+        from app.services.document_processor import document_processor
+        await document_processor.add_task(document.id, priority=1)
+
+        return document
+    
+    async def get_documents(
+        self,
+        collection_id: Optional[int] = None,
+        skip: int = 0,
+        limit: int = 100
+    ) -> List[RagDocument]:
+        """Return one page of non-deleted documents, newest first.
+
+        When ``collection_id`` is truthy the result is restricted to that
+        collection. Each document has its ``collection`` relationship loaded.
+        """
+        query = select(RagDocument).options(selectinload(RagDocument.collection))
+        query = query.where(RagDocument.is_deleted == False)
+        if collection_id:
+            query = query.where(RagDocument.collection_id == collection_id)
+        query = query.order_by(RagDocument.created_at.desc()).offset(skip).limit(limit)
+
+        rows = await self.db.execute(query)
+        return rows.scalars().all()
+    
+    async def get_document(self, document_id: int) -> Optional[RagDocument]:
+        """Fetch one non-deleted document by primary key, with its collection loaded."""
+        not_deleted = RagDocument.is_deleted == False
+        query = select(RagDocument).options(selectinload(RagDocument.collection))
+        query = query.where(RagDocument.id == document_id, not_deleted)
+        return await self.db.scalar(query)
+    
+    async def delete_document(self, document_id: int) -> bool:
+        """Soft-delete a document, then clean up its statistics, vectors and file.
+
+        Args:
+            document_id: Primary key of the document.
+
+        Returns:
+            True when the document was deleted, False when no non-deleted
+            document with that id exists.
+        """
+        document = await self.get_document(document_id)
+        if not document:
+            return False
+
+        # Snapshot every value needed after the commit. On a session that
+        # expires objects on commit, touching these attributes afterwards would
+        # require a lazy reload, which an async session cannot do implicitly.
+        doc_id = document.id
+        collection_id = document.collection_id
+        file_path = document.file_path
+        qdrant_collection_name = (
+            document.collection.qdrant_collection_name if document.collection else None
+        )
+
+        # Soft delete document
+        now = datetime.utcnow()
+        document.is_deleted = True
+        document.deleted_at = now
+        document.updated_at = now
+
+        await self.db.commit()
+
+        # Update collection statistics
+        await self._update_collection_stats(collection_id)
+
+        # Remove vectors from Qdrant (skipped when the document has no collection).
+        if qdrant_collection_name:
+            await self._delete_document_vectors(doc_id, qdrant_collection_name)
+
+        # Removing the stored file is best-effort; a failure is only logged.
+        try:
+            if file_path and os.path.exists(file_path):
+                os.remove(file_path)
+        except OSError as e:
+            logger.warning(f"Could not delete file {file_path}: {e}")
+
+        return True
+    
+    async def download_document(self, document_id: int) -> Optional[Tuple[bytes, str, str]]:
+        """Return the stored original file of a document.
+
+        Args:
+            document_id: Primary key of the document.
+
+        Returns:
+            ``(content, original_filename, mime_type)``, where ``mime_type``
+            falls back to ``application/octet-stream``; ``None`` when the
+            document does not exist, has no stored path, or its file cannot
+            be read.
+        """
+        document = await self.get_document(document_id)
+        if not document or not document.file_path:
+            return None
+
+        # Read directly instead of checking existence first: that avoids a race
+        # with concurrent deletion, and the failure is logged instead of vanishing.
+        try:
+            content = Path(document.file_path).read_bytes()
+        except OSError as e:
+            logger.warning(f"Could not read file {document.file_path} for document {document_id}: {e}")
+            return None
+
+        return content, document.original_filename, document.mime_type or 'application/octet-stream'
+    
+    # Stats and Analytics
+    
+    async def get_stats(self) -> Dict[str, Any]:
+        """Aggregate system-wide counters for collections, documents, storage and vectors.
+
+        Only active collections and non-deleted documents are counted.
+        ``documents.processed`` is derived as total minus those in status
+        ``processing``, so it includes every other status.
+        """
+        live_documents = RagDocument.is_deleted == False
+
+        total_collections = await self.db.scalar(
+            select(func.count(RagCollection.id)).where(RagCollection.is_active == True)
+        )
+        total_documents = await self.db.scalar(
+            select(func.count(RagDocument.id)).where(live_documents)
+        )
+        processing_documents = await self.db.scalar(
+            select(func.count(RagDocument.id)).where(
+                live_documents,
+                RagDocument.status == 'processing'
+            )
+        )
+        # SUM() yields NULL on an empty set, hence the "or 0" fallbacks.
+        total_size = await self.db.scalar(
+            select(func.sum(RagDocument.file_size)).where(live_documents)
+        ) or 0
+        total_vectors = await self.db.scalar(
+            select(func.sum(RagDocument.vector_count)).where(live_documents)
+        ) or 0
+
+        collection_total = total_collections or 0
+        document_total = total_documents or 0
+        in_progress = processing_documents or 0
+        size_mb = round(total_size / (1024 * 1024), 2) if total_size else 0
+
+        return {
+            "collections": {
+                "total": collection_total,
+                "active": collection_total
+            },
+            "documents": {
+                "total": document_total,
+                "processing": in_progress,
+                "processed": document_total - in_progress
+            },
+            "storage": {
+                "total_size_bytes": total_size,
+                "total_size_mb": size_mb
+            },
+            "vectors": {
+                "total": total_vectors
+            }
+        }
+    
+    # Private Helper Methods
+    
+    def _is_supported_file_type(self, file_ext: str) -> bool:
+        """Tell whether an extension (leading dot included, any case) is accepted for upload."""
+        accepted = frozenset((
+            '.pdf', '.docx', '.doc', '.txt', '.md',
+            '.html', '.json', '.csv', '.xlsx', '.xls',
+        ))
+        normalized = file_ext.lower()
+        return normalized in accepted
+    
+    def _generate_safe_filename(self, filename: str) -> str:
+        """Generate a filesystem-safe, collision-resistant storage name.
+
+        The result has the form ``<sanitized stem>_<YYYYmmdd_HHMMSS>_<8 hex>
+        <original extension>``.
+
+        Args:
+            filename: Original client-side filename.
+
+        Returns:
+            The generated storage filename (no directory component).
+        """
+        path = Path(filename)
+        ext = path.suffix
+        name = path.stem
+
+        # Random suffix: a hash of the filename alone is identical for repeated
+        # uploads of the same file, so two uploads within the same second
+        # would map to the same path and overwrite each other.
+        unique_suffix = uuid.uuid4().hex[:8]
+
+        # Keep letters, digits, spaces, hyphens and underscores; spaces become underscores.
+        safe_name = "".join(c for c in name if c.isalnum() or c in (' ', '-', '_')).strip()
+        # Fall back to a fixed stem when nothing survives sanitizing.
+        safe_name = safe_name.replace(' ', '_') or "document"
+
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        return f"{safe_name}_{timestamp}_{unique_suffix}{ext}"
+    
+    async def _create_qdrant_collection(self, collection_name: str):
+        """Ask the RAG module to create a Qdrant collection; failures are logged, never raised."""
+        try:
+            rag_module = None
+            try:
+                from app.services.module_manager import module_manager
+                rag_module = module_manager.get_module('rag')
+            except ImportError as e:
+                logger.error(f"Failed to import module_manager: {e}")
+
+            if not (rag_module and hasattr(rag_module, 'create_collection')):
+                logger.warning("RAG module not available for collection creation")
+                return
+
+            if await rag_module.create_collection(collection_name):
+                logger.info(f"Created Qdrant collection: {collection_name}")
+            else:
+                logger.error(f"Failed to create Qdrant collection: {collection_name}")
+
+        except Exception as e:
+            # Swallowed on purpose: the caller has already committed the database
+            # row, and the Qdrant collection can be created later if needed.
+            logger.error(f"Error creating Qdrant collection {collection_name}: {e}")
+    
+    async def _delete_qdrant_collection(self, collection_name: str):
+        """Ask the RAG module to drop a Qdrant collection; failures are logged, never raised."""
+        try:
+            rag_module = None
+            try:
+                from app.services.module_manager import module_manager
+                rag_module = module_manager.get_module('rag')
+            except ImportError as e:
+                logger.error(f"Failed to import module_manager: {e}")
+
+            if not (rag_module and hasattr(rag_module, 'delete_collection')):
+                logger.warning("RAG module not available for collection deletion")
+                return
+
+            if await rag_module.delete_collection(collection_name):
+                logger.info(f"Deleted Qdrant collection: {collection_name}")
+            else:
+                logger.warning(f"Qdrant collection not found or already deleted: {collection_name}")
+
+        except Exception as e:
+            # Swallowed on purpose: a failed cleanup is not treated as critical.
+            logger.error(f"Error deleting Qdrant collection {collection_name}: {e}")
+    
+    async def _update_collection_stats(self, collection_id: int):
+        """Recompute a collection's document count, total size and vector count.
+
+        The totals are derived from the collection's non-deleted documents and
+        committed. Any failure is logged and swallowed.
+        """
+        try:
+            collection = await self.get_collection(collection_id)
+            if not collection:
+                return
+
+            # Shared filter: non-deleted documents belonging to this collection.
+            active_documents = (
+                RagDocument.collection_id == collection_id,
+                RagDocument.is_deleted == False,
+            )
+
+            doc_count = await self.db.scalar(
+                select(func.count(RagDocument.id)).where(*active_documents)
+            ) or 0
+            # SUM() yields NULL when no rows match, hence the "or 0" fallbacks.
+            total_size = await self.db.scalar(
+                select(func.sum(RagDocument.file_size)).where(*active_documents)
+            ) or 0
+            vector_count = await self.db.scalar(
+                select(func.sum(RagDocument.vector_count)).where(*active_documents)
+            ) or 0
+
+            collection.document_count = doc_count
+            collection.size_bytes = total_size
+            collection.vector_count = vector_count
+            collection.updated_at = datetime.utcnow()
+
+            await self.db.commit()
+
+        except Exception as e:
+            logger.error(f"Failed to update collection stats for {collection_id}: {e}")
+    
+    async def _delete_document_vectors(self, document_id: int, collection_name: str):
+        """Ask the RAG module to remove a document's vectors; failures are logged, never raised."""
+        try:
+            rag_module = None
+            try:
+                from app.services.module_manager import module_manager
+                rag_module = module_manager.get_module('rag')
+            except ImportError as e:
+                logger.error(f"Failed to import module_manager: {e}")
+
+            if not (rag_module and hasattr(rag_module, 'delete_document')):
+                logger.warning("RAG module not available for document vector deletion")
+                return
+
+            # The RAG module is given the stringified database id.
+            # NOTE(review): presumably this matches the id used at indexing time - confirm.
+            removed = await rag_module.delete_document(str(document_id), collection_name)
+            if removed:
+                logger.info(f"Deleted vectors for document {document_id} from collection {collection_name}")
+            else:
+                logger.warning(f"No vectors found for document {document_id} in collection {collection_name}")
+
+        except Exception as e:
+            # Swallowed on purpose so that document deletion can continue.
+            logger.error(f"Error deleting document vectors for {document_id} from {collection_name}: {e}")
+    
+    async def _get_qdrant_collections(self) -> List[str]:
+        """Return the names of all Qdrant collections, or an empty list on any failure."""
+        try:
+            from app.services.module_manager import module_manager
+            rag_module = module_manager.get_module('rag')
+
+            if not (rag_module and hasattr(rag_module, '_get_collections_safely')):
+                logger.warning("RAG module or safe collections method not available")
+                return []
+
+            return await rag_module._get_collections_safely()
+
+        except Exception as e:
+            logger.error(f"Error getting Qdrant collections: {e}")
+            return []
+    
+    async def _get_qdrant_collection_point_count(self, collection_name: str) -> int:
+        """Return the ``points_count`` reported for a Qdrant collection, or 0 on any failure."""
+        try:
+            from app.services.module_manager import module_manager
+            rag_module = module_manager.get_module('rag')
+
+            if not (rag_module and hasattr(rag_module, '_get_collection_info_safely')):
+                logger.warning("RAG module or safe collection info method not available")
+                return 0
+
+            info = await rag_module._get_collection_info_safely(collection_name)
+            return info.get("points_count", 0)
+
+        except Exception as e:
+            logger.warning(f"Could not get point count for collection {collection_name}: {e}")
+            return 0
+    
+    async def _process_document(self, document_id: int):
+        """Convert a stored document with the RAG module and index it in Qdrant.
+
+        Status transitions written to the document row:
+        ``processed`` after content extraction, ``indexed`` after successful
+        indexing, ``error`` (with ``processing_error`` set) on any failure or
+        when the RAG module is unavailable. On successful indexing the parent
+        collection's counters are incremented. The method returns nothing and
+        does nothing when the document id is unknown.
+        """
+        try:
+            # Get fresh document from database
+            # NOTE(review): assuming self.db is an AsyncSession, leaving this
+            # "async with" closes the session - confirm that callers expect this.
+            async with self.db as session:
+                document = await session.get(RagDocument, document_id)
+                if not document:
+                    return
+
+                # Process with RAG module (now includes content processing)
+                try:
+                    from app.services.module_manager import module_manager
+                    rag_module = module_manager.get_module('rag')
+                except ImportError as e:
+                    logger.error(f"Failed to import module_manager: {e}")
+                    rag_module = None
+
+                if rag_module:
+                    # Read file content
+                    # (an I/O error here is not caught by the inner handlers; it
+                    # reaches the outermost except, which records it on the row)
+                    with open(document.file_path, 'rb') as f:
+                        file_content = f.read()
+
+                    # Process with RAG module
+                    try:
+                        processed_doc = await rag_module.process_document(
+                            file_content,
+                            document.original_filename,
+                            {}
+                        )
+
+                        # Success case - update document with processed content
+                        document.converted_content = processed_doc.content
+                        document.word_count = processed_doc.word_count
+                        document.character_count = len(processed_doc.content)
+                        document.document_metadata = processed_doc.metadata
+                        document.status = 'processed'
+                        document.processed_at = datetime.utcnow()
+
+                        # Index the processed document in the correct Qdrant collection
+                        try:
+                            # Get the collection's Qdrant collection name
+                            # (re-query with the relationship eagerly loaded)
+                            from sqlalchemy.orm import selectinload
+                            from sqlalchemy import select
+                            stmt = select(RagDocument).options(selectinload(RagDocument.collection)).where(RagDocument.id == document_id)
+                            result = await session.execute(stmt)
+                            doc_with_collection = result.scalar_one()
+
+                            qdrant_collection_name = doc_with_collection.collection.qdrant_collection_name
+
+                            # Index in Qdrant with the correct collection name
+                            await rag_module.index_processed_document(processed_doc, qdrant_collection_name)
+
+                            # Vector count is an estimate (one vector per ~500 characters,
+                            # at least 1), not a figure read back from Qdrant.
+                            document.vector_count = max(1, len(processed_doc.content) // 500)  # ~500 chars per chunk
+                            document.status = 'indexed'
+                            document.indexed_at = datetime.utcnow()
+
+                        except Exception as index_error:
+                            logger.error(f"Failed to index document {document_id} in Qdrant: {index_error}")
+                            document.status = 'error'
+                            document.processing_error = f"Indexing failed: {str(index_error)}"
+
+                        # Update collection stats
+                        # doc_with_collection is only bound inside the indexing try above;
+                        # status == 'indexed' implies that block ran to completion.
+                        if document.status == 'indexed':
+                            collection = doc_with_collection.collection
+                            collection.document_count += 1
+                            collection.size_bytes += document.file_size
+                            collection.vector_count += document.vector_count
+                            collection.updated_at = datetime.utcnow()
+
+                    except Exception as e:
+                        # Error case - mark document as failed
+                        # (also reached when the counter updates above raise,
+                        # overwriting an 'indexed' status with 'error')
+                        document.status = 'error'
+                        document.processing_error = str(e)
+
+                    # Persist whichever status the branches above ended with.
+                    await session.commit()
+                else:
+                    # No RAG module available
+                    document.status = 'error'
+                    document.processing_error = 'RAG module not available'
+                    await session.commit()
+
+        except Exception as e:
+            # Update document with error status
+            # (last-resort handler: re-enters the session to record the failure)
+            async with self.db as session:
+                document = await session.get(RagDocument, document_id)
+                if document:
+                    document.status = 'error'
+                    document.processing_error = str(e)
+                    await session.commit()
+    
+    async def reprocess_document(self, document_id: int) -> bool:
+        """Restart processing for a stuck or failed document"""
+        try:
+            # Get document from database
+            document = await self.get_document(document_id)
+            if not document:
+                logger.error(f"Document {document_id} not found for reprocessing")
+                return False
+            
+            # Check if document is in a state where reprocessing makes sense
+            if document.status not in ['processing', 'error']:
+                logger.warning(f"Document {document_id} status is '{document.status}', cannot reprocess")
+                return False
+            
+            logger.info(f"Restarting processing for document {document_id} (current status: {document.status})")
+            
+            # Reset document status and clear errors
+            document.status = 'pending'
+            document.processing_error = None
+            document.processed_at = None
+            document.indexed_at = None
+            document.updated_at = datetime.utcnow()
+            
+            await self.db.commit()
+            
+            # Re-queue document for processing
+            try:
+                from app.services.document_processor import document_processor
+                success = await document_processor.add_task(document_id, priority=1)
+                
+                if success:
+                    logger.info(f"Document {document_id} successfully re-queued for processing")
+                else:
+                    logger.error(f"Failed to re-queue document {document_id} for processing")
+                    # Revert status back to error
+                    document.status = 'error'
+                    document.processing_error = "Failed to re-queue for processing"
+                    await self.db.commit()
+                
+                return success
+                
+            except Exception as e:
+                logger.error(f"Error re-queuing document {document_id}: {e}")
+                # Revert status back to error
+                document.status = 'error'
+                document.processing_error = f"Failed to re-queue: {str(e)}"
+                await self.db.commit()
+                return False
+                
+        except Exception as e:
+            logger.error(f"Error reprocessing document {document_id}: {e}")
+            return False