mirror of
https://github.com/aljazceru/enclava.git
synced 2025-12-17 07:24:34 +01:00
fixing rag
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -32,6 +32,7 @@ frontend/.env.development
|
|||||||
|
|
||||||
|
|
||||||
backend/storage/
|
backend/storage/
|
||||||
|
|
||||||
# TypeScript
|
# TypeScript
|
||||||
*.tsbuildinfo
|
*.tsbuildinfo
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,9 @@ RUN apt-get update && apt-get install -y \
|
|||||||
ffmpeg \
|
ffmpeg \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install CPU-only PyTorch and torchaudio first (faster download)
|
||||||
|
RUN pip install --no-cache-dir torch==2.5.1+cpu torchaudio==2.5.1+cpu --index-url https://download.pytorch.org/whl/cpu -f https://download.pytorch.org/whl/torch_stable.html
|
||||||
|
|
||||||
# Copy requirements and install Python dependencies
|
# Copy requirements and install Python dependencies
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
COPY tests/requirements-test.txt ./tests/
|
COPY tests/requirements-test.txt ./tests/
|
||||||
|
|||||||
@@ -453,9 +453,22 @@ class ChatbotModule(BaseModule):
|
|||||||
guardrails += (
|
guardrails += (
|
||||||
"When asked about encryption or SD-card backups, do not claim that backups are encrypted unless the provided context explicitly uses wording like 'encrypt', 'encrypted', or 'encryption'. "
|
"When asked about encryption or SD-card backups, do not claim that backups are encrypted unless the provided context explicitly uses wording like 'encrypt', 'encrypted', or 'encryption'. "
|
||||||
"If such wording is absent, state clearly that the SD-card backup is not encrypted. "
|
"If such wording is absent, state clearly that the SD-card backup is not encrypted. "
|
||||||
|
"Product policy: For BitBox devices, microSD (SD card) backups are not encrypted; verification steps may require a recovery password, but that is not encryption. Do not conflate password entry with encryption. "
|
||||||
)
|
)
|
||||||
extra_instructions["additional_instructions"] = guardrails
|
extra_instructions["additional_instructions"] = guardrails
|
||||||
|
|
||||||
|
# Deterministic enforcement: if encryption question and RAG context does not explicitly
|
||||||
|
# contain encryption wording, return policy answer without calling the LLM.
|
||||||
|
ctx_lower = (rag_context or "").lower()
|
||||||
|
has_encryption_terms = any(k in ctx_lower for k in ["encrypt", "encrypted", "encryption", "decrypt", "decryption"])
|
||||||
|
if is_encryption and not has_encryption_terms:
|
||||||
|
policy_answer = (
|
||||||
|
"No. BitBox microSD (SD card) backups are not encrypted. "
|
||||||
|
"Verification may require entering a recovery password, but that does not encrypt the backup — "
|
||||||
|
"it only proves you have the correct credentials to restore. Keep the card and password secure."
|
||||||
|
)
|
||||||
|
return policy_answer, sources
|
||||||
|
|
||||||
messages = self._build_conversation_messages(db_messages, config, rag_context, extra_instructions)
|
messages = self._build_conversation_messages(db_messages, config, rag_context, extra_instructions)
|
||||||
|
|
||||||
# Note: Current user message is already included in db_messages from the query
|
# Note: Current user message is already included in db_messages from the query
|
||||||
|
|||||||
@@ -1497,6 +1497,14 @@ class RAGModule(BaseModule):
|
|||||||
raise RuntimeError("RAG module not initialized")
|
raise RuntimeError("RAG module not initialized")
|
||||||
|
|
||||||
collection_name = collection_name or self.default_collection_name
|
collection_name = collection_name or self.default_collection_name
|
||||||
|
|
||||||
|
# Special handling for collections with different vector dimensions
|
||||||
|
SPECIAL_COLLECTIONS = {
|
||||||
|
"bitbox02_faq_local": {
|
||||||
|
"dimension": 384,
|
||||||
|
"model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
||||||
|
}
|
||||||
|
}
|
||||||
max_results = max_results or self.config.get("max_results", 10)
|
max_results = max_results or self.config.get("max_results", 10)
|
||||||
|
|
||||||
# Check cache (include collection name in cache key)
|
# Check cache (include collection name in cache key)
|
||||||
@@ -1510,9 +1518,24 @@ class RAGModule(BaseModule):
|
|||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
# Generate query embedding with task-specific prefix for better retrieval
|
# Generate query embedding with task-specific prefix for better retrieval
|
||||||
# The E5 model works better with "query:" prefix for search queries
|
try:
|
||||||
optimized_query = f"query: {query}"
|
# Check if this is a special collection
|
||||||
query_embedding = await self._generate_embedding(optimized_query)
|
if collection_name in SPECIAL_COLLECTIONS:
|
||||||
|
# Try to import sentence-transformers
|
||||||
|
import sentence_transformers
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
model = SentenceTransformer(SPECIAL_COLLECTIONS[collection_name]["model"])
|
||||||
|
query_embedding = model.encode([query], normalize_embeddings=True)[0].tolist()
|
||||||
|
logger.info(f"Using {SPECIAL_COLLECTIONS[collection_name]['dimension']}-dim local model for {collection_name}")
|
||||||
|
else:
|
||||||
|
# The E5 model works better with "query:" prefix for search queries
|
||||||
|
optimized_query = f"query: {query}"
|
||||||
|
query_embedding = await self._generate_embedding(optimized_query)
|
||||||
|
except ImportError:
|
||||||
|
# Fallback to default embedding if sentence-transformers is not available
|
||||||
|
logger.warning(f"sentence-transformers not available, falling back to default embedding for {collection_name}")
|
||||||
|
optimized_query = f"query: {query}"
|
||||||
|
query_embedding = await self._generate_embedding(optimized_query)
|
||||||
|
|
||||||
# Build filter
|
# Build filter
|
||||||
search_filter = None
|
search_filter = None
|
||||||
@@ -1566,6 +1589,23 @@ class RAGModule(BaseModule):
|
|||||||
content = result.payload.get("content", "")
|
content = result.payload.get("content", "")
|
||||||
score = result.score
|
score = result.score
|
||||||
|
|
||||||
|
# Generic content extraction for documents without a 'content' field
|
||||||
|
if not content:
|
||||||
|
# Build content from all text-based fields in the payload
|
||||||
|
# This makes the RAG module completely agnostic to document structure
|
||||||
|
text_fields = []
|
||||||
|
for field, value in result.payload.items():
|
||||||
|
# Skip system/metadata fields
|
||||||
|
if field not in ["document_id", "chunk_index", "chunk_count", "indexed_at", "processed_at",
|
||||||
|
"file_hash", "mime_type", "file_type", "created_at", "__collection_metadata__"]:
|
||||||
|
# Include any field that has a non-empty string value
|
||||||
|
if value and isinstance(value, str) and len(value.strip()) > 0:
|
||||||
|
text_fields.append(f"{field}: {value}")
|
||||||
|
|
||||||
|
# Join all text fields to create content
|
||||||
|
if text_fields:
|
||||||
|
content = "\n\n".join(text_fields)
|
||||||
|
|
||||||
# Log each raw result for debugging
|
# Log each raw result for debugging
|
||||||
logger.info(f"\n--- Raw Result {i+1} ---")
|
logger.info(f"\n--- Raw Result {i+1} ---")
|
||||||
logger.info(f"Score: {score}")
|
logger.info(f"Score: {score}")
|
||||||
|
|||||||
@@ -46,6 +46,7 @@ qdrant-client==1.7.0
|
|||||||
|
|
||||||
# Text Processing
|
# Text Processing
|
||||||
tiktoken==0.5.1
|
tiktoken==0.5.1
|
||||||
|
numpy>=1.26.0
|
||||||
|
|
||||||
# Basic document processing (lightweight)
|
# Basic document processing (lightweight)
|
||||||
markitdown==0.0.1a2
|
markitdown==0.0.1a2
|
||||||
@@ -56,8 +57,9 @@ python-docx==1.1.0
|
|||||||
# nltk==3.8.1
|
# nltk==3.8.1
|
||||||
# spacy==3.7.2
|
# spacy==3.7.2
|
||||||
|
|
||||||
# Heavy ML dependencies (REMOVED - unused in codebase)
|
# Heavy ML dependencies (sentence-transformers is pinned below for the bitbox02_faq_local collection)
|
||||||
# sentence-transformers==2.6.1 # REMOVED - not used anywhere in codebase
|
# Note: PyTorch is already installed in the base Docker image
|
||||||
|
sentence-transformers==2.6.1 # Added back - needed for bitbox02_faq_local collection
|
||||||
# transformers==4.35.2 # REMOVED - already commented out
|
# transformers==4.35.2 # REMOVED - already commented out
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
|
|||||||
92
backend/scripts/import_jsonl.py
Normal file
92
backend/scripts/import_jsonl.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Import a JSONL file into a Qdrant collection from inside the backend container.
|
||||||
|
|
||||||
|
Usage (from host):
|
||||||
|
docker compose exec enclava-backend bash -lc \
|
||||||
|
'python /app/scripts/import_jsonl.py \
|
||||||
|
--collection rag_test_import_859b1f01 \
|
||||||
|
--file /app/_to_delete/helpjuice-export.jsonl'
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Runs fully inside the backend, so Docker service hostnames (e.g. enclava-qdrant)
|
||||||
|
and privatemode-proxy are reachable.
|
||||||
|
- Uses RAGModule + JSONLProcessor to embed/index each JSONL line.
|
||||||
|
- Creates the collection if missing (size=1024, cosine).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
|
async def import_jsonl(collection_name: str, file_path: str):
    """Embed and index a JSONL file into a Qdrant collection.

    The collection is created (1024-dim vectors, cosine distance) when it does
    not exist yet. The raw file bytes are then handed to ``JSONLProcessor``,
    which is backed by a freshly initialised ``RAGModule``.

    Args:
        collection_name: Name of the target Qdrant collection.
        file_path: Path of the JSONL file to import.

    Returns:
        The value returned by ``JSONLProcessor.process_and_index_jsonl``.

    Raises:
        SystemExit: If ``file_path`` does not exist.
    """
    # Fail fast on a bad path before paying for the heavy application imports.
    if not os.path.exists(file_path):
        raise SystemExit(f"File not found: {file_path}")

    # Imported lazily so the module can be loaded (e.g. for --help) without
    # the backend package and Qdrant client being importable.
    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, VectorParams
    from app.modules.rag.main import RAGModule
    from app.services.jsonl_processor import JSONLProcessor
    from app.core.config import settings

    # Ensure collection exists (inside container uses Docker DNS hostnames)
    client = QdrantClient(host=settings.QDRANT_HOST, port=settings.QDRANT_PORT)
    collections = client.get_collections().collections
    if not any(c.name == collection_name for c in collections):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
        )
        print(f"Created Qdrant collection '{collection_name}' (size=1024, cosine)")
    else:
        print(f"Using existing Qdrant collection '{collection_name}'")

    # Initialize RAG
    rag = RAGModule({
        "chunk_size": 300,
        "chunk_overlap": 50,
        "max_results": 10,
        "score_threshold": 0.3,
        "embedding_model": "intfloat/multilingual-e5-large-instruct",
    })
    await rag.initialize()

    # Everything after a successful initialize() runs under try/finally so the
    # RAG module is always cleaned up, even when reading or indexing fails.
    try:
        # Process JSONL
        processor = JSONLProcessor(rag)
        with open(file_path, "rb") as f:
            content = f.read()

        doc_id = await processor.process_and_index_jsonl(
            collection_name=collection_name,
            content=content,
            filename=os.path.basename(file_path),
            metadata={
                "source": "jsonl_upload",
                # Naive UTC timestamp; kept unchanged so the stored metadata
                # format stays the same.
                "upload_date": datetime.utcnow().isoformat(),
                "file_path": os.path.abspath(file_path),
            },
        )

        # Report stats using safe HTTP method to avoid client parsing issues.
        # Best effort only: a failure here must not fail the import itself.
        try:
            info = await rag._get_collection_info_safely(collection_name)
            print(f"Import complete. Points: {info.get('points_count', 0)}, vector_size: {info.get('vector_size', 'n/a')}")
        except Exception as e:
            print(f"Import complete. (Could not fetch collection info safely: {e})")

        return doc_id
    finally:
        await rag.cleanup()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse the command line and run the JSONL import to completion.

    Requires two options, ``--collection`` and ``--file``, which are passed
    to ``import_jsonl`` in that order.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--collection", required=True, help="Qdrant collection name")
    parser.add_argument("--file", required=True, help="Path inside container (e.g. /app/_to_delete/...).")
    cli = parser.parse_args()

    # Build the coroutine first, then drive it on a fresh event loop.
    import_job = import_jsonl(cli.collection, cli.file)
    asyncio.run(import_job)


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user