"""
|
|
Document Processor Service
|
|
Handles async document processing with queue management
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
from typing import Dict, Any, Optional, List
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
from sqlalchemy import select, update
|
|
from sqlalchemy.orm import selectinload
|
|
|
|
from app.db.database import get_db
|
|
from app.models.rag_document import RagDocument
|
|
from app.models.rag_collection import RagCollection
|
|
from app.services.module_manager import module_manager
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|


class ProcessingStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    PROCESSED = "processed"
    INDEXED = "indexed"
    ERROR = "error"
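
# Lifecycle of the statuses above, as driven by _process_document() below: a worker
# moves a document to "processing", then "processed" once content extraction finishes,
# then "indexed" after the content lands in the Qdrant collection; failures set
# "error" plus RagDocument.processing_error. "pending" is the assumed initial state
# set outside this module when a document is first created.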


@dataclass
class ProcessingTask:
    """Document processing task"""

    document_id: int
    priority: int = 1
    retry_count: int = 0
    max_retries: int = 3
    created_at: Optional[datetime] = None

    def __post_init__(self):
        if self.created_at is None:
            self.created_at = datetime.utcnow()
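
# Note: `priority` is carried on the task for bookkeeping/logging, but the processor's
# asyncio.Queue below is FIFO, so it does not reorder work; swapping in an
# asyncio.PriorityQueue would be needed for real prioritization.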


class DocumentProcessor:
    """Async document processor with queue management"""

    def __init__(self, max_workers: int = 3, max_queue_size: int = 100):
        self.max_workers = max_workers
        self.max_queue_size = max_queue_size
        self.processing_queue: asyncio.Queue = asyncio.Queue(maxsize=max_queue_size)
        self.workers: List[asyncio.Task] = []
        self.running = False
        self.stats = {
            "processed_count": 0,
            "error_count": 0,
            "queue_size": 0,
            "active_workers": 0,
        }
        self._rag_module = None
        self._rag_module_lock = asyncio.Lock()

    async def start(self):
        """Start the document processor"""
        if self.running:
            return

        self.running = True
        logger.info(f"Starting document processor with {self.max_workers} workers")

        # Start worker tasks
        for i in range(self.max_workers):
            worker = asyncio.create_task(self._worker(f"worker-{i}"))
            self.workers.append(worker)

        logger.info("Document processor started")

    async def stop(self):
        """Stop the document processor"""
        if not self.running:
            return

        self.running = False
        logger.info("Stopping document processor...")

        # Cancel all workers
        for worker in self.workers:
            worker.cancel()

        # Wait for workers to finish
        await asyncio.gather(*self.workers, return_exceptions=True)
        self.workers.clear()

        logger.info("Document processor stopped")

    async def add_task(self, document_id: int, priority: int = 1) -> bool:
        """Add a document processing task to the queue"""
        try:
            task = ProcessingTask(document_id=document_id, priority=priority)

            try:
                await asyncio.wait_for(self.processing_queue.put(task), timeout=5.0)
            except asyncio.TimeoutError:
                logger.warning(
                    "Processing queue saturated, could not enqueue document %s within timeout",
                    document_id,
                )
                return False

            self.stats["queue_size"] = self.processing_queue.qsize()

            logger.info(
                f"Added processing task for document {document_id} (priority: {priority})"
            )
            return True

        except Exception as e:
            logger.error(
                f"Failed to add processing task for document {document_id}: {e}"
            )
            return False
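
    # Callers should treat a False return from add_task() as backpressure: the queue
    # stayed full for the 5-second put timeout, so the document was not enqueued and
    # the caller is responsible for retrying later or surfacing the failure.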

    async def _worker(self, worker_name: str):
        """Worker coroutine that processes documents"""
        logger.info(f"Started worker: {worker_name}")

        while self.running:
            task: Optional[ProcessingTask] = None
            try:
                # Get task from queue (wait up to 1 second)
                task = await asyncio.wait_for(self.processing_queue.get(), timeout=1.0)

                self.stats["active_workers"] += 1
                self.stats["queue_size"] = self.processing_queue.qsize()

                logger.info(f"{worker_name}: Processing document {task.document_id}")

                # Process the document
                success = await self._process_document(task)

                if success:
                    self.stats["processed_count"] += 1
                    logger.info(
                        f"{worker_name}: Successfully processed document {task.document_id}"
                    )
                else:
                    # Retry logic
                    if task.retry_count < task.max_retries:
                        task.retry_count += 1
                        await asyncio.sleep(2**task.retry_count)  # Exponential backoff
                        try:
                            await asyncio.wait_for(
                                self.processing_queue.put(task), timeout=5.0
                            )
                        except asyncio.TimeoutError:
                            logger.error(
                                "%s: Failed to requeue document %s due to saturated queue",
                                worker_name,
                                task.document_id,
                            )
                            self.stats["error_count"] += 1
                            continue
                        logger.warning(
                            f"{worker_name}: Retrying document {task.document_id} (attempt {task.retry_count})"
                        )
                    else:
                        self.stats["error_count"] += 1
                        logger.error(
                            f"{worker_name}: Failed to process document {task.document_id} after {task.max_retries} retries"
                        )

            except asyncio.TimeoutError:
                # No tasks in queue, continue
                continue
            except asyncio.CancelledError:
                # Worker cancelled, exit
                break
            except Exception as e:
                logger.error(f"{worker_name}: Unexpected error: {e}")
                await asyncio.sleep(1)  # Brief pause before continuing
            finally:
                if task is not None:
                    self.processing_queue.task_done()
                    if self.stats["active_workers"] > 0:
                        self.stats["active_workers"] -= 1
                    self.stats["queue_size"] = self.processing_queue.qsize()

        logger.info(f"Worker stopped: {worker_name}")
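
    # Retry behavior in _worker(): a failed task is re-queued with exponential backoff
    # (2, 4, then 8 seconds for max_retries=3) and counted as an error only once the
    # retries are exhausted or the requeue itself times out. task_done() is called in
    # the finally block for every successful get(), including tasks that were put back.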

    async def _get_rag_module(self):
        """Resolve and cache the RAG module instance"""
        async with self._rag_module_lock:
            if self._rag_module and getattr(self._rag_module, "enabled", False):
                return self._rag_module

            if not module_manager.initialized:
                await module_manager.initialize()

            rag_module = module_manager.get_module("rag")

            if not rag_module:
                enabled = await module_manager.enable_module("rag")
                if not enabled:
                    raise RuntimeError("Failed to enable RAG module")
                rag_module = module_manager.get_module("rag")

            if not rag_module:
                raise RuntimeError("RAG module not available after enable attempt")

            if not getattr(rag_module, "enabled", True):
                enabled = await module_manager.enable_module("rag")
                if not enabled:
                    raise RuntimeError(
                        "RAG module is disabled and could not be re-enabled"
                    )
                rag_module = module_manager.get_module("rag")
                if not rag_module or not getattr(rag_module, "enabled", True):
                    raise RuntimeError(
                        "RAG module is disabled and could not be re-enabled"
                    )

            self._rag_module = rag_module
            logger.info("DocumentProcessor cached RAG module instance for reuse")
            return self._rag_module
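
    # The asyncio.Lock above makes module resolution single-flight: when several
    # workers need the RAG module at once, one of them initializes/enables it through
    # module_manager while the rest wait and then reuse the cached, enabled instance.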

    async def _process_document(self, task: ProcessingTask) -> bool:
        """Process a single document"""
        from datetime import datetime
        from app.db.database import async_session_factory

        async with async_session_factory() as session:
            try:
                # Get document from database
                stmt = (
                    select(RagDocument)
                    .options(selectinload(RagDocument.collection))
                    .where(RagDocument.id == task.document_id)
                )
                result = await session.execute(stmt)
                document = result.scalar_one_or_none()

                if not document:
                    logger.error(f"Document {task.document_id} not found")
                    return False

                # Update status to processing
                document.status = ProcessingStatus.PROCESSING
                await session.commit()

                # Get RAG module for processing
                try:
                    rag_module = await self._get_rag_module()
                except Exception as e:
                    logger.error(f"Failed to get RAG module: {e}")
                    raise Exception(f"RAG module not available: {e}")

                if not rag_module or not rag_module.enabled:
                    raise Exception("RAG module not available or not enabled")

                logger.info(
                    f"RAG module loaded successfully for document {task.document_id}"
                )

                # Read file content
                logger.info(
                    f"Reading file content for document {task.document_id}: {document.file_path}"
                )
                file_path = Path(document.file_path)
                try:
                    file_content = await asyncio.to_thread(file_path.read_bytes)
                except FileNotFoundError:
                    logger.error(
                        f"File not found for document {task.document_id}: {document.file_path}"
                    )
                    document.status = ProcessingStatus.ERROR
                    document.processing_error = "Document file not found on disk"
                    await session.commit()
                    return False
                except Exception as exc:
                    logger.error(
                        f"Failed reading file for document {task.document_id}: {exc}"
                    )
                    document.status = ProcessingStatus.ERROR
                    document.processing_error = f"Failed to read file: {exc}"
                    await session.commit()
                    return False

                logger.info(
                    f"File content read successfully for document {task.document_id}, size: {len(file_content)} bytes"
                )

                # Process with RAG module
                logger.info(
                    f"Starting document processing for document {task.document_id} with RAG module"
                )

                # Special handling for JSONL files - skip processing phase
                if document.file_type == "jsonl":
                    # For JSONL files, we don't need to process content here
                    # The optimized JSONL processor will handle everything during indexing
                    document.converted_content = (
                        f"JSONL file with {len(file_content)} bytes"
                    )
                    document.word_count = 0  # Will be updated during indexing
                    document.character_count = len(file_content)
                    document.document_metadata = {
                        "file_path": document.file_path,
                        "processed": "jsonl",
                    }
                    document.status = ProcessingStatus.PROCESSED
                    document.processed_at = datetime.utcnow()
                    logger.info(
                        f"JSONL document {task.document_id} marked for optimized processing"
                    )
                else:
                    # Standard processing for other file types
                    try:
                        # Add timeout to prevent hanging
                        processed_doc = await asyncio.wait_for(
                            rag_module.process_document(
                                file_content,
                                document.original_filename,
                                {"file_path": document.file_path},
                            ),
                            timeout=300.0,  # 5 minute timeout
                        )
                        logger.info(
                            f"Document processing completed for document {task.document_id}"
                        )

                        # Update document with processed content
                        document.converted_content = processed_doc.content
                        document.word_count = processed_doc.word_count
                        document.character_count = len(processed_doc.content)
                        document.document_metadata = processed_doc.metadata
                        document.status = ProcessingStatus.PROCESSED
                        document.processed_at = datetime.utcnow()
                    except asyncio.TimeoutError:
                        logger.error(
                            f"Document processing timed out for document {task.document_id}"
                        )
                        raise Exception("Document processing timed out after 5 minutes")
                    except Exception as e:
                        logger.error(
                            f"Document processing failed for document {task.document_id}: {e}"
                        )
                        raise

                # Index in RAG system using same RAG module
                if rag_module and document.converted_content:
                    try:
                        logger.info(
                            f"Starting indexing for document {task.document_id} in collection {document.collection.qdrant_collection_name}"
                        )

                        # Index the document content in the correct Qdrant collection
                        doc_metadata = {
                            "collection_id": document.collection_id,
                            "document_id": document.id,
                            "filename": document.original_filename,
                            "file_type": document.file_type,
                            **document.document_metadata,
                        }

                        # Use the correct Qdrant collection name for this document
                        # For JSONL files, we need to use the processed document flow
if document.file_type == "jsonl":
|
|
# Create a ProcessedDocument for the JSONL processor
|
|
from app.modules.rag.main import ProcessedDocument
|
|
from datetime import datetime
|
|
import hashlib
|
|
|
|
# Calculate file hash
|
|
processed_at = datetime.utcnow()
|
|
file_hash = hashlib.md5(
|
|
str(document.id).encode()
|
|
).hexdigest()
|
|
|
|
processed_doc = ProcessedDocument(
|
|
id=str(document.id),
|
|
content="", # Will be filled by JSONL processor
|
|
extracted_text="", # Will be filled by JSONL processor
|
|
metadata={
|
|
**doc_metadata,
|
|
"file_path": document.file_path,
|
|
},
|
|
original_filename=document.original_filename,
|
|
file_type=document.file_type,
|
|
mime_type=document.mime_type,
|
|
language=document.document_metadata.get(
|
|
"language", "EN"
|
|
),
|
|
word_count=0, # Will be updated during processing
|
|
sentence_count=0, # Will be updated during processing
|
|
entities=[],
|
|
keywords=[],
|
|
processing_time=0.0,
|
|
processed_at=processed_at,
|
|
file_hash=file_hash,
|
|
file_size=document.file_size,
|
|
)
|
|
|
|
# The JSONL processor will read the original file
|
|
await asyncio.wait_for(
|
|
rag_module.index_processed_document(
|
|
processed_doc=processed_doc,
|
|
collection_name=document.collection.qdrant_collection_name,
|
|
),
|
|
timeout=300.0, # 5 minute timeout for JSONL processing
|
|
)
|
|
                        else:
                            # Use standard indexing for other file types
                            await asyncio.wait_for(
                                rag_module.index_document(
                                    content=document.converted_content,
                                    metadata=doc_metadata,
                                    collection_name=document.collection.qdrant_collection_name,
                                ),
                                timeout=120.0,  # 2 minute timeout for indexing
                            )

                        logger.info(
                            f"Document {task.document_id} indexed successfully in collection {document.collection.qdrant_collection_name}"
                        )

                        # Update vector count (approximate)
                        document.vector_count = max(
                            1, len(document.converted_content) // 1000
                        )
                        document.status = ProcessingStatus.INDEXED
                        document.indexed_at = datetime.utcnow()

                        # Update collection stats
                        collection = document.collection
                        if collection and document.status == ProcessingStatus.INDEXED:
                            collection.document_count += 1
                            collection.size_bytes += document.file_size
                            collection.vector_count += document.vector_count
                            collection.updated_at = datetime.utcnow()

                    except Exception as e:
                        logger.error(
                            f"Failed to index document {task.document_id} in RAG: {e}"
                        )
                        # Mark as error since indexing failed
                        document.status = ProcessingStatus.ERROR
                        document.processing_error = f"Indexing failed: {str(e)}"
                        # Don't raise the exception to avoid retries on indexing failures

                await session.commit()
                return True

            except Exception as e:
                # Mark document as error
                if "document" in locals() and document:
                    document.status = ProcessingStatus.ERROR
                    document.processing_error = str(e)
                    await session.commit()

                logger.error(f"Error processing document {task.document_id}: {e}")
                return False

    async def get_stats(self) -> Dict[str, Any]:
        """Get processor statistics"""
        return {
            **self.stats,
            "running": self.running,
            "worker_count": len(self.workers),
            "queue_size": self.processing_queue.qsize(),
        }

    async def get_queue_status(self) -> Dict[str, Any]:
        """Get detailed queue status"""
        return {
            "queue_size": self.processing_queue.qsize(),
            "max_queue_size": self.max_queue_size,
            "queue_full": self.processing_queue.full(),
            "active_workers": self.stats["active_workers"],
            "max_workers": self.max_workers,
        }


# Global document processor instance
document_processor = DocumentProcessor()
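
# Illustrative usage (a sketch, not part of this module's runtime path): the owning
# application is expected to drive the shared instance from its own startup/shutdown
# hooks, roughly:
#
#     await document_processor.start()                       # spawn worker tasks
#     queued = await document_processor.add_task(document_id=doc_id)
#     if not queued:
#         ...  # queue saturated; retry later or report the failure
#     stats = await document_processor.get_stats()
#     await document_processor.stop()                        # cancel workers on shutdown
#
# Where exactly these calls live (e.g., application lifespan events) is an assumption
# and depends on the surrounding service code.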