mirror of
https://github.com/aljazceru/enclava.git
synced 2025-12-17 07:24:34 +01:00
fixing rag
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -32,6 +32,7 @@ frontend/.env.development
|
||||
|
||||
|
||||
backend/storage/
|
||||
|
||||
# TypeScript
|
||||
*.tsbuildinfo
|
||||
|
||||
|
||||
@@ -17,6 +17,9 @@ RUN apt-get update && apt-get install -y \
|
||||
ffmpeg \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install CPU-only PyTorch and compatible numpy first (faster download)
|
||||
RUN pip install --no-cache-dir torch==2.5.1+cpu torchaudio==2.5.1+cpu --index-url https://download.pytorch.org/whl/cpu -f https://download.pytorch.org/whl/torch_stable.html
|
||||
|
||||
# Copy requirements and install Python dependencies
|
||||
COPY requirements.txt .
|
||||
COPY tests/requirements-test.txt ./tests/
|
||||
|
||||
@@ -453,9 +453,22 @@ class ChatbotModule(BaseModule):
|
||||
guardrails += (
|
||||
"When asked about encryption or SD-card backups, do not claim that backups are encrypted unless the provided context explicitly uses wording like 'encrypt', 'encrypted', or 'encryption'. "
|
||||
"If such wording is absent, state clearly that the SD-card backup is not encrypted. "
|
||||
"Product policy: For BitBox devices, microSD (SD card) backups are not encrypted; verification steps may require a recovery password, but that is not encryption. Do not conflate password entry with encryption. "
|
||||
)
|
||||
extra_instructions["additional_instructions"] = guardrails
|
||||
|
||||
# Deterministic enforcement: if encryption question and RAG context does not explicitly
|
||||
# contain encryption wording, return policy answer without calling the LLM.
|
||||
ctx_lower = (rag_context or "").lower()
|
||||
has_encryption_terms = any(k in ctx_lower for k in ["encrypt", "encrypted", "encryption", "decrypt", "decryption"])
|
||||
if is_encryption and not has_encryption_terms:
|
||||
policy_answer = (
|
||||
"No. BitBox microSD (SD card) backups are not encrypted. "
|
||||
"Verification may require entering a recovery password, but that does not encrypt the backup — "
|
||||
"it only proves you have the correct credentials to restore. Keep the card and password secure."
|
||||
)
|
||||
return policy_answer, sources
|
||||
|
||||
messages = self._build_conversation_messages(db_messages, config, rag_context, extra_instructions)
|
||||
|
||||
# Note: Current user message is already included in db_messages from the query
|
||||
|
||||
@@ -1495,8 +1495,16 @@ class RAGModule(BaseModule):
|
||||
"""Search for relevant documents"""
|
||||
if not self.enabled:
|
||||
raise RuntimeError("RAG module not initialized")
|
||||
|
||||
|
||||
collection_name = collection_name or self.default_collection_name
|
||||
|
||||
# Special handling for collections with different vector dimensions
|
||||
SPECIAL_COLLECTIONS = {
|
||||
"bitbox02_faq_local": {
|
||||
"dimension": 384,
|
||||
"model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
||||
}
|
||||
}
|
||||
max_results = max_results or self.config.get("max_results", 10)
|
||||
|
||||
# Check cache (include collection name in cache key)
|
||||
@@ -1510,9 +1518,24 @@ class RAGModule(BaseModule):
|
||||
start_time = time.time()
|
||||
|
||||
# Generate query embedding with task-specific prefix for better retrieval
|
||||
# The E5 model works better with "query:" prefix for search queries
|
||||
optimized_query = f"query: {query}"
|
||||
query_embedding = await self._generate_embedding(optimized_query)
|
||||
try:
|
||||
# Check if this is a special collection
|
||||
if collection_name in SPECIAL_COLLECTIONS:
|
||||
# Try to import sentence-transformers
|
||||
import sentence_transformers
|
||||
from sentence_transformers import SentenceTransformer
|
||||
model = SentenceTransformer(SPECIAL_COLLECTIONS[collection_name]["model"])
|
||||
query_embedding = model.encode([query], normalize_embeddings=True)[0].tolist()
|
||||
logger.info(f"Using {SPECIAL_COLLECTIONS[collection_name]['dimension']}-dim local model for {collection_name}")
|
||||
else:
|
||||
# The E5 model works better with "query:" prefix for search queries
|
||||
optimized_query = f"query: {query}"
|
||||
query_embedding = await self._generate_embedding(optimized_query)
|
||||
except ImportError:
|
||||
# Fallback to default embedding if sentence-transformers is not available
|
||||
logger.warning(f"sentence-transformers not available, falling back to default embedding for {collection_name}")
|
||||
optimized_query = f"query: {query}"
|
||||
query_embedding = await self._generate_embedding(optimized_query)
|
||||
|
||||
# Build filter
|
||||
search_filter = None
|
||||
@@ -1565,14 +1588,31 @@ class RAGModule(BaseModule):
|
||||
doc_id = result.payload.get("document_id")
|
||||
content = result.payload.get("content", "")
|
||||
score = result.score
|
||||
|
||||
|
||||
# Generic content extraction for documents without a 'content' field
|
||||
if not content:
|
||||
# Build content from all text-based fields in the payload
|
||||
# This makes the RAG module completely agnostic to document structure
|
||||
text_fields = []
|
||||
for field, value in result.payload.items():
|
||||
# Skip system/metadata fields
|
||||
if field not in ["document_id", "chunk_index", "chunk_count", "indexed_at", "processed_at",
|
||||
"file_hash", "mime_type", "file_type", "created_at", "__collection_metadata__"]:
|
||||
# Include any field that has a non-empty string value
|
||||
if value and isinstance(value, str) and len(value.strip()) > 0:
|
||||
text_fields.append(f"{field}: {value}")
|
||||
|
||||
# Join all text fields to create content
|
||||
if text_fields:
|
||||
content = "\n\n".join(text_fields)
|
||||
|
||||
# Log each raw result for debugging
|
||||
logger.info(f"\n--- Raw Result {i+1} ---")
|
||||
logger.info(f"Score: {score}")
|
||||
logger.info(f"Document ID: {doc_id}")
|
||||
logger.info(f"Content preview (first 200 chars): {content[:200]}")
|
||||
logger.info(f"Metadata keys: {list(result.payload.keys())}")
|
||||
|
||||
|
||||
# Aggregate scores by document
|
||||
if doc_id in document_scores:
|
||||
document_scores[doc_id]["score"] = max(document_scores[doc_id]["score"], score)
|
||||
|
||||
@@ -46,6 +46,7 @@ qdrant-client==1.7.0
|
||||
|
||||
# Text Processing
|
||||
tiktoken==0.5.1
|
||||
numpy>=1.26.0
|
||||
|
||||
# Basic document processing (lightweight)
|
||||
markitdown==0.0.1a2
|
||||
@@ -56,8 +57,9 @@ python-docx==1.1.0
|
||||
# nltk==3.8.1
|
||||
# spacy==3.7.2
|
||||
|
||||
# Heavy ML dependencies (REMOVED - unused in codebase)
|
||||
# sentence-transformers==2.6.1 # REMOVED - not used anywhere in codebase
|
||||
# Heavy ML dependencies (sentence-transformers pinned below; PyTorch itself comes from the base Docker image)
|
||||
# Note: PyTorch is already installed in the base Docker image
|
||||
sentence-transformers==2.6.1 # Added back - needed for bitbox02_faq_local collection
|
||||
# transformers==4.35.2 # REMOVED - already commented out
|
||||
|
||||
# Configuration
|
||||
|
||||
92
backend/scripts/import_jsonl.py
Normal file
92
backend/scripts/import_jsonl.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Import a JSONL file into a Qdrant collection from inside the backend container.
|
||||
|
||||
Usage (from host):
|
||||
docker compose exec enclava-backend bash -lc \
|
||||
'python /app/scripts/import_jsonl.py \
|
||||
--collection rag_test_import_859b1f01 \
|
||||
--file /app/_to_delete/helpjuice-export.jsonl'
|
||||
|
||||
Notes:
|
||||
- Runs fully inside the backend, so Docker service hostnames (e.g. enclava-qdrant)
|
||||
and privatemode-proxy are reachable.
|
||||
- Uses RAGModule + JSONLProcessor to embed/index each JSONL line.
|
||||
- Creates the collection if missing (size=1024, cosine).
|
||||
"""
|
||||
|
||||
import argparse
import asyncio
import os
from datetime import datetime, timezone
|
||||
|
||||
|
||||
async def import_jsonl(collection_name: str, file_path: str):
    """Embed and index a JSONL file into the given Qdrant collection.

    Creates the collection (size=1024, cosine) if it does not already exist,
    runs the file through RAGModule + JSONLProcessor, prints simple progress
    and stats, and returns the indexed document id.

    Args:
        collection_name: Target Qdrant collection name.
        file_path: Path to the JSONL file inside the container.

    Raises:
        SystemExit: If file_path does not exist.
    """
    # Local imports keep module import cheap and defer heavy deps (qdrant,
    # app.*) until the script actually runs.
    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, VectorParams
    from app.modules.rag.main import RAGModule
    from app.services.jsonl_processor import JSONLProcessor
    from app.core.config import settings

    if not os.path.exists(file_path):
        raise SystemExit(f"File not found: {file_path}")

    # Ensure collection exists (inside container uses Docker DNS hostnames)
    client = QdrantClient(host=settings.QDRANT_HOST, port=settings.QDRANT_PORT)
    collections = client.get_collections().collections
    if not any(c.name == collection_name for c in collections):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
        )
        print(f"Created Qdrant collection '{collection_name}' (size=1024, cosine)")
    else:
        print(f"Using existing Qdrant collection '{collection_name}'")

    # Initialize RAG
    rag = RAGModule({
        "chunk_size": 300,
        "chunk_overlap": 50,
        "max_results": 10,
        "score_threshold": 0.3,
        "embedding_model": "intfloat/multilingual-e5-large-instruct",
    })
    await rag.initialize()

    try:
        # Process JSONL
        processor = JSONLProcessor(rag)
        with open(file_path, "rb") as f:
            content = f.read()

        doc_id = await processor.process_and_index_jsonl(
            collection_name=collection_name,
            content=content,
            filename=os.path.basename(file_path),
            metadata={
                "source": "jsonl_upload",
                # Timezone-aware UTC timestamp: datetime.utcnow() is
                # deprecated (3.12+) and returned a naive datetime.
                "upload_date": datetime.now(timezone.utc).isoformat(),
                "file_path": os.path.abspath(file_path),
            },
        )

        # Report stats using safe HTTP method to avoid client parsing issues
        try:
            info = await rag._get_collection_info_safely(collection_name)
            print(f"Import complete. Points: {info.get('points_count', 0)}, vector_size: {info.get('vector_size', 'n/a')}")
        except Exception as e:
            # Best-effort stats only — the import itself already succeeded.
            print(f"Import complete. (Could not fetch collection info safely: {e})")
    finally:
        # Always release RAG resources, even if indexing raised mid-way
        # (the original leaked them on failure).
        await rag.cleanup()
    return doc_id
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and run the async importer."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--collection", required=True, help="Qdrant collection name")
    parser.add_argument("--file", required=True, help="Path inside container (e.g. /app/_to_delete/...).")
    opts = parser.parse_args()

    asyncio.run(import_jsonl(opts.collection, opts.file))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user