mirror of
https://github.com/aljazceru/enclava.git
synced 2025-12-17 07:24:34 +01:00
fixing rag
This commit is contained in:
92
backend/scripts/import_jsonl.py
Normal file
92
backend/scripts/import_jsonl.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Import a JSONL file into a Qdrant collection from inside the backend container.

Usage (from host):
  docker compose exec enclava-backend bash -lc \
    'python /app/scripts/import_jsonl.py \
       --collection rag_test_import_859b1f01 \
       --file /app/_to_delete/helpjuice-export.jsonl'

Notes:
  - Runs fully inside the backend, so Docker service hostnames (e.g. enclava-qdrant)
    and privatemode-proxy are reachable.
  - Uses RAGModule + JSONLProcessor to embed/index each JSONL line.
  - Creates the collection if missing (size=1024, cosine).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
async def import_jsonl(collection_name: str, file_path: str):
    """Embed and index a JSONL file into a Qdrant collection.

    Makes sure the target collection exists (creating it with
    1024-dimensional cosine vectors when it is missing), then passes the raw
    file bytes to ``JSONLProcessor`` (backed by a ``RAGModule``) for indexing
    and finally prints the collection statistics.

    Args:
        collection_name: Name of the Qdrant collection to import into.
        file_path: Path of the JSONL file, as seen by this process.

    Returns:
        The value returned by ``JSONLProcessor.process_and_index_jsonl``
        (kept here as the document id).

    Raises:
        SystemExit: If ``file_path`` is not an existing regular file.
    """
    # Validate the path before importing the backend packages so that a typo
    # fails immediately. isfile() (rather than exists()) also rejects
    # directories, which would otherwise only blow up later in open().
    if not os.path.isfile(file_path):
        raise SystemExit(f"File not found: {file_path}")

    from datetime import timezone

    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, VectorParams
    from app.modules.rag.main import RAGModule
    from app.services.jsonl_processor import JSONLProcessor
    from app.core.config import settings

    # Ensure collection exists (inside container uses Docker DNS hostnames)
    client = QdrantClient(host=settings.QDRANT_HOST, port=settings.QDRANT_PORT)
    collections = client.get_collections().collections
    if not any(c.name == collection_name for c in collections):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
        )
        print(f"Created Qdrant collection '{collection_name}' (size=1024, cosine)")
    else:
        print(f"Using existing Qdrant collection '{collection_name}'")

    # Initialize RAG
    rag = RAGModule({
        "chunk_size": 300,
        "chunk_overlap": 50,
        "max_results": 10,
        "score_threshold": 0.3,
        "embedding_model": "intfloat/multilingual-e5-large-instruct",
    })
    await rag.initialize()

    # Everything after a successful initialize() runs under try/finally so
    # the RAG module is cleaned up even when reading or indexing fails.
    try:
        # Process JSONL
        processor = JSONLProcessor(rag)
        with open(file_path, "rb") as f:
            content = f.read()

        # datetime.utcnow() is deprecated since Python 3.12; take an aware
        # UTC timestamp and drop tzinfo so the stored ISO string keeps the
        # same naive format as before.
        upload_date = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        doc_id = await processor.process_and_index_jsonl(
            collection_name=collection_name,
            content=content,
            filename=os.path.basename(file_path),
            metadata={
                "source": "jsonl_upload",
                "upload_date": upload_date,
                "file_path": os.path.abspath(file_path),
            },
        )

        # Report stats using safe HTTP method to avoid client parsing issues.
        # Best effort: a failure here must not fail an import that succeeded.
        try:
            info = await rag._get_collection_info_safely(collection_name)
            print(f"Import complete. Points: {info.get('points_count', 0)}, vector_size: {info.get('vector_size', 'n/a')}")
        except Exception as e:
            print(f"Import complete. (Could not fetch collection info safely: {e})")
    finally:
        await rag.cleanup()

    return doc_id
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse the arguments and run the import.

    Requires ``--collection`` and ``--file``; both values are forwarded to
    ``import_jsonl``, which is driven to completion with ``asyncio.run``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--collection",
        required=True,
        help="Qdrant collection name",
    )
    parser.add_argument(
        "--file",
        required=True,
        help="Path inside container (e.g. /app/_to_delete/...).",
    )
    options = parser.parse_args()

    job = import_jsonl(options.collection, options.file)
    asyncio.run(job)
|
||||
|
||||
|
||||
# Run the importer only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user