diff --git a/.gitignore b/.gitignore index 6642c56..4abb4a1 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ frontend/.env.development backend/storage/ + # TypeScript *.tsbuildinfo diff --git a/backend/Dockerfile b/backend/Dockerfile index aaa4fe6..0cb709e 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -17,6 +17,9 @@ RUN apt-get update && apt-get install -y \ ffmpeg \ && rm -rf /var/lib/apt/lists/* +# Install CPU-only PyTorch first in its own layer (much smaller/faster download than the default CUDA build; numpy is pinned via requirements.txt) +RUN pip install --no-cache-dir torch==2.5.1+cpu torchaudio==2.5.1+cpu --index-url https://download.pytorch.org/whl/cpu -f https://download.pytorch.org/whl/torch_stable.html + # Copy requirements and install Python dependencies COPY requirements.txt . COPY tests/requirements-test.txt ./tests/ diff --git a/backend/app/modules/chatbot/main.py b/backend/app/modules/chatbot/main.py index 3f9b8dc..96ae1b2 100644 --- a/backend/app/modules/chatbot/main.py +++ b/backend/app/modules/chatbot/main.py @@ -453,9 +453,22 @@ class ChatbotModule(BaseModule): guardrails += ( "When asked about encryption or SD-card backups, do not claim that backups are encrypted unless the provided context explicitly uses wording like 'encrypt', 'encrypted', or 'encryption'. " "If such wording is absent, state clearly that the SD-card backup is not encrypted. " + "Product policy: For BitBox devices, microSD (SD card) backups are not encrypted; verification steps may require a recovery password, but that is not encryption. Do not conflate password entry with encryption. " ) extra_instructions["additional_instructions"] = guardrails + # Deterministic enforcement: if encryption question and RAG context does not explicitly + # contain encryption wording, return policy answer without calling the LLM. 
+ ctx_lower = (rag_context or "").lower() + has_encryption_terms = any(k in ctx_lower for k in ["encrypt", "encrypted", "encryption", "decrypt", "decryption"]) + if is_encryption and not has_encryption_terms: + policy_answer = ( + "No. BitBox microSD (SD card) backups are not encrypted. " + "Verification may require entering a recovery password, but that does not encrypt the backup — " + "it only proves you have the correct credentials to restore. Keep the card and password secure." + ) + return policy_answer, sources + messages = self._build_conversation_messages(db_messages, config, rag_context, extra_instructions) # Note: Current user message is already included in db_messages from the query diff --git a/backend/modules/rag/main.py b/backend/modules/rag/main.py index d56503c..92f43f6 100644 --- a/backend/modules/rag/main.py +++ b/backend/modules/rag/main.py @@ -1495,8 +1495,16 @@ class RAGModule(BaseModule): """Search for relevant documents""" if not self.enabled: raise RuntimeError("RAG module not initialized") - + collection_name = collection_name or self.default_collection_name + + # Special handling for collections with different vector dimensions + SPECIAL_COLLECTIONS = { + "bitbox02_faq_local": { + "dimension": 384, + "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" + } + } max_results = max_results or self.config.get("max_results", 10) # Check cache (include collection name in cache key) @@ -1510,9 +1518,24 @@ class RAGModule(BaseModule): start_time = time.time() # Generate query embedding with task-specific prefix for better retrieval - # The E5 model works better with "query:" prefix for search queries - optimized_query = f"query: {query}" - query_embedding = await self._generate_embedding(optimized_query) + try: + # Check if this is a special collection + if collection_name in SPECIAL_COLLECTIONS: + # Try to import sentence-transformers + import sentence_transformers + from sentence_transformers import SentenceTransformer + model 
= SentenceTransformer(SPECIAL_COLLECTIONS[collection_name]["model"]) + query_embedding = model.encode([query], normalize_embeddings=True)[0].tolist() + logger.info(f"Using {SPECIAL_COLLECTIONS[collection_name]['dimension']}-dim local model for {collection_name}") + else: + # The E5 model works better with "query:" prefix for search queries + optimized_query = f"query: {query}" + query_embedding = await self._generate_embedding(optimized_query) + except ImportError: + # Fallback to default embedding if sentence-transformers is not available + logger.warning(f"sentence-transformers not available, falling back to default embedding for {collection_name}") + optimized_query = f"query: {query}" + query_embedding = await self._generate_embedding(optimized_query) # Build filter search_filter = None @@ -1565,14 +1588,31 @@ class RAGModule(BaseModule): doc_id = result.payload.get("document_id") content = result.payload.get("content", "") score = result.score - + + # Generic content extraction for documents without a 'content' field + if not content: + # Build content from all text-based fields in the payload + # This makes the RAG module completely agnostic to document structure + text_fields = [] + for field, value in result.payload.items(): + # Skip system/metadata fields + if field not in ["document_id", "chunk_index", "chunk_count", "indexed_at", "processed_at", + "file_hash", "mime_type", "file_type", "created_at", "__collection_metadata__"]: + # Include any field that has a non-empty string value + if value and isinstance(value, str) and len(value.strip()) > 0: + text_fields.append(f"{field}: {value}") + + # Join all text fields to create content + if text_fields: + content = "\n\n".join(text_fields) + # Log each raw result for debugging logger.info(f"\n--- Raw Result {i+1} ---") logger.info(f"Score: {score}") logger.info(f"Document ID: {doc_id}") logger.info(f"Content preview (first 200 chars): {content[:200]}") logger.info(f"Metadata keys: 
{list(result.payload.keys())}") - + # Aggregate scores by document if doc_id in document_scores: document_scores[doc_id]["score"] = max(document_scores[doc_id]["score"], score) diff --git a/backend/requirements.txt b/backend/requirements.txt index c4ec167..b8fd274 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -46,6 +46,7 @@ qdrant-client==1.7.0 # Text Processing tiktoken==0.5.1 +numpy>=1.26.0 # Basic document processing (lightweight) markitdown==0.0.1a2 @@ -56,8 +57,9 @@ python-docx==1.1.0 # nltk==3.8.1 # spacy==3.7.2 -# Heavy ML dependencies (REMOVED - unused in codebase) -# sentence-transformers==2.6.1 # REMOVED - not used anywhere in codebase +# Heavy ML dependencies (sentence-transformers re-enabled below) +# Note: PyTorch is preinstalled in an earlier layer of backend/Dockerfile, so installing sentence-transformers here does not pull the CUDA build +sentence-transformers==2.6.1 # Added back - needed for bitbox02_faq_local collection # transformers==4.35.2 # REMOVED - already commented out # Configuration diff --git a/backend/scripts/import_jsonl.py b/backend/scripts/import_jsonl.py new file mode 100644 index 0000000..a932883 --- /dev/null +++ b/backend/scripts/import_jsonl.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +""" +Import a JSONL file into a Qdrant collection from inside the backend container. + +Usage (from host): + docker compose exec enclava-backend bash -lc \ 'python /app/scripts/import_jsonl.py \ --collection rag_test_import_859b1f01 \ --file /app/_to_delete/helpjuice-export.jsonl' + +Notes: + - Runs fully inside the backend, so Docker service hostnames (e.g. enclava-qdrant) + and privatemode-proxy are reachable. + - Uses RAGModule + JSONLProcessor to embed/index each JSONL line. + - Creates the collection if missing (size=1024, cosine). 
+""" + +import argparse +import asyncio +import os +from datetime import datetime + + +async def import_jsonl(collection_name: str, file_path: str): + from qdrant_client import QdrantClient + from qdrant_client.models import Distance, VectorParams + from app.modules.rag.main import RAGModule + from app.services.jsonl_processor import JSONLProcessor + from app.core.config import settings + + if not os.path.exists(file_path): + raise SystemExit(f"File not found: {file_path}") + + # Ensure collection exists (inside container uses Docker DNS hostnames) + client = QdrantClient(host=settings.QDRANT_HOST, port=settings.QDRANT_PORT) + collections = client.get_collections().collections + if not any(c.name == collection_name for c in collections): + client.create_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=1024, distance=Distance.COSINE), + ) + print(f"Created Qdrant collection '{collection_name}' (size=1024, cosine)") + else: + print(f"Using existing Qdrant collection '{collection_name}'") + + # Initialize RAG + rag = RAGModule({ + "chunk_size": 300, + "chunk_overlap": 50, + "max_results": 10, + "score_threshold": 0.3, + "embedding_model": "intfloat/multilingual-e5-large-instruct", + }) + await rag.initialize() + + # Process JSONL + processor = JSONLProcessor(rag) + with open(file_path, "rb") as f: + content = f.read() + + doc_id = await processor.process_and_index_jsonl( + collection_name=collection_name, + content=content, + filename=os.path.basename(file_path), + metadata={ + "source": "jsonl_upload", + "upload_date": datetime.utcnow().isoformat(), + "file_path": os.path.abspath(file_path), + }, + ) + + # Report stats using safe HTTP method to avoid client parsing issues + try: + info = await rag._get_collection_info_safely(collection_name) + print(f"Import complete. Points: {info.get('points_count', 0)}, vector_size: {info.get('vector_size', 'n/a')}") + except Exception as e: + print(f"Import complete. 
(Could not fetch collection info safely: {e})") + await rag.cleanup() + return doc_id + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--collection", required=True, help="Qdrant collection name") + ap.add_argument("--file", required=True, help="Path inside container (e.g. /app/_to_delete/...).") + args = ap.parse_args() + + asyncio.run(import_jsonl(args.collection, args.file)) + + +if __name__ == "__main__": + main()