fixing rag

This commit is contained in:
2025-10-01 15:50:34 +02:00
parent 755ea4c585
commit f3f5cca50b
6 changed files with 159 additions and 8 deletions

View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""
Import a JSONL file into a Qdrant collection from inside the backend container.
Usage (from host):
    docker compose exec enclava-backend bash -lc \
      'python /app/scripts/import_jsonl.py \
        --collection rag_test_import_859b1f01 \
        --file /app/_to_delete/helpjuice-export.jsonl'
Notes:
- Runs fully inside the backend, so Docker service hostnames (e.g. enclava-qdrant)
and privatemode-proxy are reachable.
- Uses RAGModule + JSONLProcessor to embed/index each JSONL line.
- Creates the collection if missing (size=1024, cosine).
"""
import argparse
import asyncio
import os
from datetime import datetime
async def import_jsonl(collection_name: str, file_path: str):
    """Embed and index one JSONL file into a Qdrant collection.

    The collection is created (1024-dimensional vectors, cosine distance)
    when it does not exist yet. The raw file bytes are then handed to
    ``JSONLProcessor.process_and_index_jsonl`` and, once indexing is done,
    a short summary of the collection is printed.

    Args:
        collection_name: Name of the target Qdrant collection.
        file_path: Path of the JSONL file to import.

    Returns:
        The value returned by ``process_and_index_jsonl``.

    Raises:
        SystemExit: If ``file_path`` is not an existing regular file.
    """
    # Validate the input first: a bad path should fail before the project
    # imports run and before any collection is created. ``isfile`` (rather
    # than ``exists``) also rejects directories, which ``open`` cannot read.
    if not os.path.isfile(file_path):
        raise SystemExit(f"File not found: {file_path}")

    from datetime import timezone

    # Project and client imports stay function-local, as in the original
    # script, so they are only resolved once an import is actually requested.
    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, VectorParams
    from app.modules.rag.main import RAGModule
    from app.services.jsonl_processor import JSONLProcessor
    from app.core.config import settings

    # Ensure collection exists (inside container uses Docker DNS hostnames)
    client = QdrantClient(host=settings.QDRANT_HOST, port=settings.QDRANT_PORT)
    collections = client.get_collections().collections
    if not any(c.name == collection_name for c in collections):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
        )
        print(f"Created Qdrant collection '{collection_name}' (size=1024, cosine)")
    else:
        print(f"Using existing Qdrant collection '{collection_name}'")

    # Initialize RAG
    rag = RAGModule({
        "chunk_size": 300,
        "chunk_overlap": 50,
        "max_results": 10,
        "score_threshold": 0.3,
        "embedding_model": "intfloat/multilingual-e5-large-instruct",
    })
    await rag.initialize()

    # Everything after initialize() runs under try/finally so the module is
    # always cleaned up, even when reading or indexing the file fails.
    try:
        # Process JSONL
        processor = JSONLProcessor(rag)
        with open(file_path, "rb") as f:
            content = f.read()

        # Naive UTC timestamp in the same ISO format ``datetime.utcnow()``
        # produced, without calling the deprecated API.
        upload_date = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        doc_id = await processor.process_and_index_jsonl(
            collection_name=collection_name,
            content=content,
            filename=os.path.basename(file_path),
            metadata={
                "source": "jsonl_upload",
                "upload_date": upload_date,
                "file_path": os.path.abspath(file_path),
            },
        )

        # Report stats using safe HTTP method to avoid client parsing issues.
        # Reporting is best-effort: a failure here must not fail the import.
        try:
            info = await rag._get_collection_info_safely(collection_name)
            print(f"Import complete. Points: {info.get('points_count', 0)}, vector_size: {info.get('vector_size', 'n/a')}")
        except Exception as e:
            print(f"Import complete. (Could not fetch collection info safely: {e})")
    finally:
        await rag.cleanup()

    return doc_id
def main():
    """Read ``--collection`` and ``--file`` from the command line and run the import."""
    parser = argparse.ArgumentParser()
    # Both options are mandatory; declare them from one table.
    required_options = (
        ("--collection", "Qdrant collection name"),
        ("--file", "Path inside container (e.g. /app/_to_delete/...)."),
    )
    for flag, description in required_options:
        parser.add_argument(flag, required=True, help=description)
    options = parser.parse_args()

    # Drive the coroutine to completion on a fresh event loop.
    job = import_jsonl(options.collection, options.file)
    asyncio.run(job)


# Run the importer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()