Mirror of https://github.com/aljazceru/enclava.git, synced 2025-12-17 07:24:34 +01:00
rag improvements 2
backend/app/services/jsonl_processor.py (new file, 211 lines)
@@ -0,0 +1,211 @@
"""
Optimized JSONL Processor for RAG Module

Handles JSONL files efficiently to prevent resource exhaustion.
"""

import json
import logging
import asyncio
from typing import Dict, Any, List
from datetime import datetime
import uuid

from qdrant_client.models import PointStruct, Filter, FieldCondition, MatchValue
from qdrant_client.http.models import Batch

from app.modules.rag.main import ProcessedDocument
# from app.core.analytics import log_module_event  # Analytics module not available

logger = logging.getLogger(__name__)


class JSONLProcessor:
    """Specialized processor for JSONL files"""

    def __init__(self, rag_module):
        self.rag_module = rag_module
        self.config = rag_module.config
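
    # Note on the rag_module dependency (descriptive comment, inferred from usage in
    # this file rather than from a formal interface): the processor relies on
    # rag_module.config, rag_module.qdrant_client, rag_module.stats, and the helpers
    # _generate_document_id(), _chunk_text(), and _generate_embeddings().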

    async def process_and_index_jsonl(self, collection_name: str, content: bytes,
                                      filename: str, metadata: Dict[str, Any]) -> str:
        """Process and index a JSONL file efficiently.

        Processes each JSON line as a separate document to avoid
        creating thousands of chunks from a single large document.
        """
        try:
            # Decode content
            jsonl_content = content.decode('utf-8', errors='replace')
            lines = jsonl_content.strip().split('\n')

            logger.info(f"Processing JSONL file {filename} with {len(lines)} lines")

            # Generate base document ID
            base_doc_id = self.rag_module._generate_document_id(jsonl_content, metadata)

            # Process lines in batches
            batch_size = 10  # Smaller batches for better memory management
            processed_count = 0

            for batch_start in range(0, len(lines), batch_size):
                batch_end = min(batch_start + batch_size, len(lines))
                batch_lines = lines[batch_start:batch_end]

                # Process batch
                await self._process_jsonl_batch(
                    collection_name,
                    batch_lines,
                    batch_start,
                    base_doc_id,
                    filename,
                    metadata
                )

                processed_count += len(batch_lines)

                # Log progress
                if processed_count % 50 == 0:
                    logger.info(f"Processed {processed_count}/{len(lines)} lines from {filename}")

                # Small delay to prevent resource exhaustion
                await asyncio.sleep(0.05)

            logger.info(f"Successfully processed JSONL file {filename} with {len(lines)} lines")
            return base_doc_id

        except Exception as e:
            logger.error(f"Error processing JSONL file {filename}: {e}")
            raise
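
    # Throttling note (arithmetic from the constants above, not a measured figure):
    # batch_size = 10 with a 0.05 s pause per batch adds roughly 5 ms of deliberate
    # delay per line, i.e. about 50 s of extra wall-clock time for a 10,000-line file.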

    async def _process_jsonl_batch(self, collection_name: str, lines: List[str],
                                   start_idx: int, base_doc_id: str,
                                   filename: str, metadata: Dict[str, Any]) -> None:
        """Process a batch of JSONL lines"""
        try:
            points = []

            for line_idx, line in enumerate(lines, start=start_idx + 1):
                if not line.strip():
                    continue

                try:
                    # Parse JSON line
                    data = json.loads(line)

                    # Debug: check if data is None
                    if data is None:
                        logger.warning(f"JSON line {line_idx} parsed as None")
                        continue

                    # Handle helpjuice export format
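                    # Illustrative example of the assumed Helpjuice export line shape
                    # (field names inferred from the lookups below, not from a spec):
                    #   {"id": "123", "payload": {"question": "...", "answer": "...", "language": "EN"}}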
                    if 'payload' in data and data['payload'] is not None:
                        payload = data['payload']
                        article_id = data.get('id', f'article_{line_idx}')

                        # Extract Q&A
                        question = payload.get('question', '')
                        answer = payload.get('answer', '')
                        language = payload.get('language', 'EN')

                        if question or answer:
                            # Create Q&A content
                            content = f"Question: {question}\n\nAnswer: {answer}"

                            # Create metadata
                            doc_metadata = {
                                **metadata,
                                "article_id": article_id,
                                "language": language,
                                "filename": filename,
                                "line_number": line_idx,
                                "content_type": "qa_pair",
                                "question": question[:100],  # Truncate for metadata
                                "processed_at": datetime.utcnow().isoformat()
                            }

                            # Generate single embedding for the Q&A pair
                            embeddings = await self.rag_module._generate_embeddings([content])

                            # Create point
                            point_id = str(uuid.uuid4())
                            points.append(PointStruct(
                                id=point_id,
                                vector=embeddings[0],
                                payload={
                                    **doc_metadata,
                                    "document_id": f"{base_doc_id}_{article_id}",
                                    "content": content,
                                    "chunk_index": 0,
                                    "chunk_count": 1
                                }
                            ))

                    # Handle generic JSON format
                    else:
                        content = json.dumps(data, indent=2, ensure_ascii=False)

                        # For larger JSON objects, we might need to chunk
                        if len(content) > 1000:
                            chunks = self.rag_module._chunk_text(content, chunk_size=500)
                            embeddings = await self.rag_module._generate_embeddings(chunks)

                            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
                                point_id = str(uuid.uuid4())
                                points.append(PointStruct(
                                    id=point_id,
                                    vector=embedding,
                                    payload={
                                        **metadata,
                                        "filename": filename,
                                        "line_number": line_idx,
                                        "content_type": "json_object",
                                        "document_id": f"{base_doc_id}_line_{line_idx}",
                                        "content": chunk,
                                        "chunk_index": i,
                                        "chunk_count": len(chunks)
                                    }
                                ))
                        else:
                            # Small JSON - no chunking needed
                            embeddings = await self.rag_module._generate_embeddings([content])
                            point_id = str(uuid.uuid4())
                            points.append(PointStruct(
                                id=point_id,
                                vector=embeddings[0],
                                payload={
                                    **metadata,
                                    "filename": filename,
                                    "line_number": line_idx,
                                    "content_type": "json_object",
                                    "document_id": f"{base_doc_id}_line_{line_idx}",
                                    "content": content,
                                    "chunk_index": 0,
                                    "chunk_count": 1
                                }
                            ))

                except json.JSONDecodeError as e:
                    logger.warning(f"Error parsing JSONL line {line_idx}: {e}")
                    continue
                except Exception as e:
                    logger.warning(f"Error processing JSONL line {line_idx}: {e}")
                    continue

            # Insert all points in this batch
            if points:
                self.rag_module.qdrant_client.upsert(
                    collection_name=collection_name,
                    points=points
                )

                # Update stats
                self.rag_module.stats["documents_indexed"] += len(points)
                # log_module_event("rag", "jsonl_batch_processed", {  # Analytics module not available
                #     "filename": filename,
                #     "lines_processed": len(lines),
                #     "points_created": len(points)
                # })

        except Exception as e:
            logger.error(f"Error processing JSONL batch: {e}")
            raise
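
For reference, a minimal usage sketch, not part of this commit: it assumes the new file is importable as app.services.jsonl_processor and that a configured rag_module instance exposing the attributes noted above is available; the file path, collection name, and metadata values are placeholders.

from app.services.jsonl_processor import JSONLProcessor  # import path assumed from the file location


async def index_export(rag_module) -> str:
    """Feed a Helpjuice-style JSONL export through the processor (illustrative)."""
    # Read the export from disk; path and collection name are placeholders.
    with open("helpjuice_export.jsonl", "rb") as f:
        raw = f.read()

    processor = JSONLProcessor(rag_module)
    return await processor.process_and_index_jsonl(
        collection_name="kb_articles",
        content=raw,
        filename="helpjuice_export.jsonl",
        metadata={"source": "helpjuice"},  # merged into every indexed point's payload
    )

# From synchronous code, something like asyncio.run(index_export(rag_module))
# would drive it, given a configured rag_module instance.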