From 25778ab94e836446ff48d80957827273857711d4 Mon Sep 17 00:00:00 2001 From: Aljaz Ceru Date: Thu, 18 Sep 2025 15:37:00 +0200 Subject: [PATCH 01/13] fix --- .gitignore | 108 ------------------- backend/app/api/v1/prompt_templates.py | 49 ++++++--- frontend/src/lib/api-client.ts | 96 +++++++++++++++++ frontend/src/lib/config.ts | 15 +++ frontend/src/lib/file-download.ts | 51 +++++++++ frontend/src/lib/id-utils.ts | 16 +++ frontend/src/lib/proxy-auth.ts | 31 ++++++ frontend/src/lib/token-manager.ts | 141 +++++++++++++++++++++++++ frontend/src/lib/utils.ts | 8 ++ 9 files changed, 392 insertions(+), 123 deletions(-) create mode 100644 frontend/src/lib/api-client.ts create mode 100644 frontend/src/lib/config.ts create mode 100644 frontend/src/lib/file-download.ts create mode 100644 frontend/src/lib/id-utils.ts create mode 100644 frontend/src/lib/proxy-auth.ts create mode 100644 frontend/src/lib/token-manager.ts create mode 100644 frontend/src/lib/utils.ts diff --git a/.gitignore b/.gitignore index 1719a7d..e69de29 100644 --- a/.gitignore +++ b/.gitignore @@ -1,108 +0,0 @@ -*.backup -backend/storage/rag_documents/* -# Python -__pycache__/ -*.py[cod] -*$py.class -*.so -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# Virtual environments -venv/ -env/ -ENV/ -env.bak/ -venv.bak/ -.venv/ - -# IDE -.vscode/ -.idea/ -*.swp -*.swo -*~ - -# OS -.DS_Store -.DS_Store? -._* -.Spotlight-V100 -.Trashes -ehthumbs.db -Thumbs.db - -# Node.js -node_modules/ -npm-debug.log* -yarn-debug.log* -yarn-error.log* -.npm -.eslintcache -.next/ -.nuxt/ -out/ -dist/ - -# Environment variables -.env -.env.local -.env.development.local -.env.test.local -.env.production.local - -# Docker -*.log -docker-compose.override.yml - -# Database -*.db -*.sqlite - -# Redis -dump.rdb - -# Logs -logs/ -*.log - -# Coverage -coverage/ -.coverage -.nyc_output - -# Cache -.cache/ -.pytest_cache/ -.mypy_cache/ -.ruff_cache/ - -# Temporary files -*.tmp -*.temp -.tmp/ - -# Security - sensitive files -backend/.config_encryption_key -*.key -*.pem -*.crt - -# Generated files -backend/performance_report.json -performance_report*.json diff --git a/backend/app/api/v1/prompt_templates.py b/backend/app/api/v1/prompt_templates.py index 2f58a65..6612149 100644 --- a/backend/app/api/v1/prompt_templates.py +++ b/backend/app/api/v1/prompt_templates.py @@ -484,7 +484,7 @@ async def seed_default_templates( select(PromptTemplate).where(PromptTemplate.type_key == type_key) ) existing_template = existing.scalar_one_or_none() - + if existing_template: # Only update if it's still the default (version 1) if existing_template.version == 1 and existing_template.is_default: @@ -494,21 +494,40 @@ async def seed_default_templates( existing_template.updated_at = datetime.utcnow() updated_templates.append(type_key) else: - # Create new template - new_template = PromptTemplate( - id=str(uuid.uuid4()), - name=template_data["name"], - type_key=type_key, - description=template_data["description"], - system_prompt=template_data["prompt"], - is_default=True, - is_active=True, - version=1, - created_at=datetime.utcnow(), - updated_at=datetime.utcnow() + # Check if any inactive template exists with this type_key + inactive_result = await db.execute( + select(PromptTemplate) + .where(PromptTemplate.type_key == type_key) + .where(PromptTemplate.is_active == False) ) - db.add(new_template) - created_templates.append(type_key) + inactive_template = 
inactive_result.scalar_one_or_none() + + if inactive_template: + # Reactivate the inactive template + inactive_template.is_active = True + inactive_template.name = template_data["name"] + inactive_template.description = template_data["description"] + inactive_template.system_prompt = template_data["prompt"] + inactive_template.is_default = True + inactive_template.version = 1 + inactive_template.updated_at = datetime.utcnow() + updated_templates.append(type_key) + else: + # Create new template + new_template = PromptTemplate( + id=str(uuid.uuid4()), + name=template_data["name"], + type_key=type_key, + description=template_data["description"], + system_prompt=template_data["prompt"], + is_default=True, + is_active=True, + version=1, + created_at=datetime.utcnow(), + updated_at=datetime.utcnow() + ) + db.add(new_template) + created_templates.append(type_key) await db.commit() diff --git a/frontend/src/lib/api-client.ts b/frontend/src/lib/api-client.ts new file mode 100644 index 0000000..126f733 --- /dev/null +++ b/frontend/src/lib/api-client.ts @@ -0,0 +1,96 @@ +export interface AppError extends Error { + code: 'UNAUTHORIZED' | 'NETWORK_ERROR' | 'VALIDATION_ERROR' | 'NOT_FOUND' | 'FORBIDDEN' | 'TIMEOUT' | 'UNKNOWN' + status?: number + details?: any +} + +function makeError(message: string, code: AppError['code'], status?: number, details?: any): AppError { + const err = new Error(message) as AppError + err.code = code + err.status = status + err.details = details + return err +} + +async function getAuthHeader(): Promise> { + try { + const { tokenManager } = await import('./token-manager') + const token = await tokenManager.getAccessToken() + return token ? { Authorization: `Bearer ${token}` } : {} + } catch { + return {} + } +} + +async function request(method: string, url: string, body?: any, extraInit?: RequestInit): Promise { + try { + const headers: Record = { + 'Accept': 'application/json', + ...(method !== 'GET' && method !== 'HEAD' ? { 'Content-Type': 'application/json' } : {}), + ...(await getAuthHeader()), + ...(extraInit?.headers as Record | undefined), + } + + const res = await fetch(url, { + method, + headers, + body: body != null && method !== 'GET' && method !== 'HEAD' ? 
JSON.stringify(body) : undefined, + ...extraInit, + }) + + if (!res.ok) { + let details: any = undefined + try { details = await res.json() } catch { details = await res.text() } + const status = res.status + if (status === 401) throw makeError('Unauthorized', 'UNAUTHORIZED', status, details) + if (status === 403) throw makeError('Forbidden', 'FORBIDDEN', status, details) + if (status === 404) throw makeError('Not found', 'NOT_FOUND', status, details) + if (status === 400) throw makeError('Validation error', 'VALIDATION_ERROR', status, details) + throw makeError('Request failed', 'UNKNOWN', status, details) + } + + const contentType = res.headers.get('content-type') || '' + if (contentType.includes('application/json')) { + return (await res.json()) as T + } + // @ts-expect-error allow non-json generic + return (await res.text()) as T + } catch (e: any) { + if (e?.code) throw e + if (e?.name === 'AbortError') throw makeError('Request timed out', 'TIMEOUT') + throw makeError(e?.message || 'Network error', 'NETWORK_ERROR') + } +} + +export const apiClient = { + get: (url: string, init?: RequestInit) => request('GET', url, undefined, init), + post: (url: string, body?: any, init?: RequestInit) => request('POST', url, body, init), + put: (url: string, body?: any, init?: RequestInit) => request('PUT', url, body, init), + delete: (url: string, init?: RequestInit) => request('DELETE', url, undefined, init), +} + +export const chatbotApi = { + async listChatbots() { + try { + return await apiClient.get('/api-internal/v1/chatbot/list') + } catch { + return await apiClient.get('/api-internal/v1/chatbot/instances') + } + }, + createChatbot(config: any) { + return apiClient.post('/api-internal/v1/chatbot/create', config) + }, + updateChatbot(id: string, config: any) { + return apiClient.put(`/api-internal/v1/chatbot/update/${encodeURIComponent(id)}`, config) + }, + deleteChatbot(id: string) { + return apiClient.delete(`/api-internal/v1/chatbot/delete/${encodeURIComponent(id)}`) + }, + sendMessage(chatbotId: string, message: string, conversationId?: string, history?: Array<{role: string; content: string}>) { + const body: any = { chatbot_id: chatbotId, message } + if (conversationId) body.conversation_id = conversationId + if (history) body.history = history + return apiClient.post('/api-internal/v1/chatbot/chat', body) + }, +} + diff --git a/frontend/src/lib/config.ts b/frontend/src/lib/config.ts new file mode 100644 index 0000000..e194a3f --- /dev/null +++ b/frontend/src/lib/config.ts @@ -0,0 +1,15 @@ +export const config = { + getPublicApiUrl(): string { + if (typeof process !== 'undefined' && process.env.NEXT_PUBLIC_BASE_URL) { + return process.env.NEXT_PUBLIC_BASE_URL + } + if (typeof window !== 'undefined') { + return window.location.origin + } + return 'http://localhost:3000' + }, + getAppName(): string { + return process.env.NEXT_PUBLIC_APP_NAME || 'Enclava' + }, +} + diff --git a/frontend/src/lib/file-download.ts b/frontend/src/lib/file-download.ts new file mode 100644 index 0000000..137b274 --- /dev/null +++ b/frontend/src/lib/file-download.ts @@ -0,0 +1,51 @@ +import { tokenManager } from './token-manager' + +export async function downloadFile(path: string, filename: string, params?: URLSearchParams | Record) { + const url = new URL(path, typeof window !== 'undefined' ? window.location.origin : 'http://localhost:3000') + if (params) { + const p = params instanceof URLSearchParams ? 
params : new URLSearchParams(params) + p.forEach((v, k) => url.searchParams.set(k, v)) + } + + const token = await tokenManager.getAccessToken() + const res = await fetch(url.toString(), { + headers: { + ...(token ? { Authorization: `Bearer ${token}` } : {}), + }, + }) + if (!res.ok) throw new Error(`Failed to download file (${res.status})`) + const blob = await res.blob() + + if (typeof window !== 'undefined') { + const link = document.createElement('a') + const href = URL.createObjectURL(blob) + link.href = href + link.download = filename + document.body.appendChild(link) + link.click() + link.remove() + URL.revokeObjectURL(href) + } +} + +export async function uploadFile(path: string, file: File, extraFields?: Record) { + const form = new FormData() + form.append('file', file) + if (extraFields) Object.entries(extraFields).forEach(([k, v]) => form.append(k, v)) + + const token = await tokenManager.getAccessToken() + const res = await fetch(path, { + method: 'POST', + headers: { + ...(token ? { Authorization: `Bearer ${token}` } : {}), + }, + body: form, + }) + if (!res.ok) { + let details: any + try { details = await res.json() } catch { details = await res.text() } + throw new Error(typeof details === 'string' ? details : (details?.error || 'Upload failed')) + } + return await res.json() +} + diff --git a/frontend/src/lib/id-utils.ts b/frontend/src/lib/id-utils.ts new file mode 100644 index 0000000..d511e03 --- /dev/null +++ b/frontend/src/lib/id-utils.ts @@ -0,0 +1,16 @@ +export function generateId(prefix = "id"): string { + const rand = Math.random().toString(36).slice(2, 10) + return `${prefix}_${rand}` +} + +export function generateShortId(prefix = "id"): string { + const rand = Math.random().toString(36).slice(2, 7) + return `${prefix}_${rand}` +} + +export function generateTimestampId(prefix = "id"): string { + const ts = Date.now() + const rand = Math.floor(Math.random() * 1000).toString().padStart(3, '0') + return `${prefix}_${ts}_${rand}` +} + diff --git a/frontend/src/lib/proxy-auth.ts b/frontend/src/lib/proxy-auth.ts new file mode 100644 index 0000000..bbf7109 --- /dev/null +++ b/frontend/src/lib/proxy-auth.ts @@ -0,0 +1,31 @@ +const BACKEND_URL = process.env.INTERNAL_API_URL || `http://enclava-backend:${process.env.BACKEND_INTERNAL_PORT || '8000'}` + +function mapPath(path: string): string { + // Convert '/api-internal/..' to backend '/api/..' + if (path.startsWith('/api-internal/')) { + return path.replace('/api-internal/', '/api/') + } + return path +} + +export async function proxyRequest(path: string, init?: RequestInit): Promise { + const url = `${BACKEND_URL}${mapPath(path)}` + const headers: Record = { + 'Content-Type': 'application/json', + ...(init?.headers as Record | undefined), + } + return fetch(url, { ...init, headers }) +} + +export async function handleProxyResponse(response: Response, defaultMessage = 'Request failed'): Promise { + if (!response.ok) { + let details: any + try { details = await response.json() } catch { details = await response.text() } + throw new Error(typeof details === 'string' ? 
`${defaultMessage}: ${details}` : (details?.error || defaultMessage)) + } + const contentType = response.headers.get('content-type') || '' + if (contentType.includes('application/json')) return (await response.json()) as T + // @ts-ignore allow non-json + return (await response.text()) as T +} + diff --git a/frontend/src/lib/token-manager.ts b/frontend/src/lib/token-manager.ts new file mode 100644 index 0000000..b4699df --- /dev/null +++ b/frontend/src/lib/token-manager.ts @@ -0,0 +1,141 @@ +type Listener = (...args: any[]) => void + +class SimpleEmitter { + private listeners = new Map>() + + on(event: string, listener: Listener) { + if (!this.listeners.has(event)) this.listeners.set(event, new Set()) + this.listeners.get(event)!.add(listener) + } + + off(event: string, listener: Listener) { + this.listeners.get(event)?.delete(listener) + } + + emit(event: string, ...args: any[]) { + this.listeners.get(event)?.forEach(l => l(...args)) + } +} + +interface StoredTokens { + access_token: string + refresh_token: string + access_expires_at: number // epoch ms + refresh_expires_at?: number // epoch ms +} + +const ACCESS_LIFETIME_FALLBACK_MS = 30 * 60 * 1000 // 30 minutes +const REFRESH_LIFETIME_FALLBACK_MS = 7 * 24 * 60 * 60 * 1000 // 7 days + +function now() { return Date.now() } + +function readTokens(): StoredTokens | null { + if (typeof window === 'undefined') return null + try { + const raw = window.localStorage.getItem('auth_tokens') + return raw ? JSON.parse(raw) as StoredTokens : null + } catch { + return null + } +} + +function writeTokens(tokens: StoredTokens | null) { + if (typeof window === 'undefined') return + if (tokens) { + window.localStorage.setItem('auth_tokens', JSON.stringify(tokens)) + } else { + window.localStorage.removeItem('auth_tokens') + } +} + +class TokenManager extends SimpleEmitter { + private refreshTimer: ReturnType | null = null + + isAuthenticated(): boolean { + const t = readTokens() + return !!t && t.access_expires_at > now() + } + + getTokenExpiry(): Date | null { + const t = readTokens() + return t ? new Date(t.access_expires_at) : null + } + + getRefreshTokenExpiry(): Date | null { + const t = readTokens() + return t?.refresh_expires_at ? new Date(t.refresh_expires_at) : null + } + + setTokens(accessToken: string, refreshToken: string, expiresInSeconds?: number) { + const access_expires_at = now() + (expiresInSeconds ? 
expiresInSeconds * 1000 : ACCESS_LIFETIME_FALLBACK_MS) + const refresh_expires_at = now() + REFRESH_LIFETIME_FALLBACK_MS + const tokens: StoredTokens = { + access_token: accessToken, + refresh_token: refreshToken, + access_expires_at, + refresh_expires_at, + } + writeTokens(tokens) + this.scheduleRefresh() + this.emit('tokensUpdated') + } + + clearTokens() { + if (this.refreshTimer) { + clearTimeout(this.refreshTimer) + this.refreshTimer = null + } + writeTokens(null) + this.emit('tokensCleared') + } + + logout() { + this.clearTokens() + this.emit('logout') + } + + private scheduleRefresh() { + if (typeof window === 'undefined') return + const t = readTokens() + if (!t) return + if (this.refreshTimer) clearTimeout(this.refreshTimer) + const msUntilRefresh = Math.max(5_000, t.access_expires_at - now() - 60_000) // 1 minute before expiry + this.refreshTimer = setTimeout(() => { + this.refreshAccessToken().catch(() => { + this.emit('sessionExpired', 'refresh_failed') + this.clearTokens() + }) + }, msUntilRefresh) + } + + async getAccessToken(): Promise { + const t = readTokens() + if (!t) return null + if (t.access_expires_at - now() > 10_000) return t.access_token + try { + await this.refreshAccessToken() + return readTokens()?.access_token || null + } catch { + this.emit('sessionExpired', 'expired') + this.clearTokens() + return null + } + } + + private async refreshAccessToken(): Promise { + const t = readTokens() + if (!t?.refresh_token) throw new Error('No refresh token') + const res = await fetch('/api-internal/v1/auth/refresh', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ refresh_token: t.refresh_token }), + }) + if (!res.ok) throw new Error('Refresh failed') + const data = await res.json() + const expiresIn = data.expires_in as number | undefined + this.setTokens(data.access_token, data.refresh_token || t.refresh_token, expiresIn) + } +} + +export const tokenManager = new TokenManager() + diff --git a/frontend/src/lib/utils.ts b/frontend/src/lib/utils.ts new file mode 100644 index 0000000..02aca5c --- /dev/null +++ b/frontend/src/lib/utils.ts @@ -0,0 +1,8 @@ +import { type ClassValue } from 'clsx' +import { clsx } from 'clsx' +import { twMerge } from 'tailwind-merge' + +export function cn(...inputs: ClassValue[]) { + return twMerge(clsx(inputs)) +} + From 0c20de4ca1e9948f7e91be1b4765bcdedde9451f Mon Sep 17 00:00:00 2001 From: Aljaz Ceru Date: Fri, 19 Sep 2025 20:34:51 +0200 Subject: [PATCH 02/13] working chatbot, rag weird --- backend/app/api/v1/chatbot.py | 60 ++------ backend/app/services/llm/security.py | 142 ++++++++++++------ backend/app/services/llm/service.py | 89 ++++------- backend/modules/chatbot/main.py | 57 +++++-- frontend/package-lock.json | 6 +- frontend/src/app/api-keys/page.tsx | 11 +- .../src/components/chatbot/ChatInterface.tsx | 32 ++-- .../src/components/chatbot/ChatbotManager.tsx | 1 + frontend/src/lib/api-client.ts | 24 ++- 9 files changed, 230 insertions(+), 192 deletions(-) diff --git a/backend/app/api/v1/chatbot.py b/backend/app/api/v1/chatbot.py index 1f2f579..20f03dc 100644 --- a/backend/app/api/v1/chatbot.py +++ b/backend/app/api/v1/chatbot.py @@ -275,14 +275,14 @@ async def chat_with_chatbot( current_user: User = Depends(get_current_user), db: AsyncSession = Depends(get_db) ): - """Send a message to a chatbot and get a response""" + """Send a message to a chatbot and get a response (without persisting conversation)""" user_id = current_user.get("id") if isinstance(current_user, dict) else current_user.id 
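    # This endpoint is now stateless: the ConversationService lookup and
    # persistence that used to run here are removed below, so each call
    # returns a one-off reply without writing the conversation to the database.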
log_api_request("chat_with_chatbot", { "user_id": user_id, "chatbot_id": chatbot_id, "message_length": len(request.message) }) - + try: # Get the chatbot instance result = await db.execute( @@ -291,74 +291,40 @@ async def chat_with_chatbot( .where(ChatbotInstance.created_by == str(user_id)) ) chatbot = result.scalar_one_or_none() - + if not chatbot: raise HTTPException(status_code=404, detail="Chatbot not found") - + if not chatbot.is_active: raise HTTPException(status_code=400, detail="Chatbot is not active") - - # Initialize conversation service - conversation_service = ConversationService(db) - - # Get or create conversation - conversation = await conversation_service.get_or_create_conversation( - chatbot_id=chatbot_id, - user_id=str(user_id), - conversation_id=request.conversation_id - ) - - # Add user message to conversation - await conversation_service.add_message( - conversation_id=conversation.id, - role="user", - content=request.message, - metadata={} - ) - + # Get chatbot module and generate response try: chatbot_module = module_manager.modules.get("chatbot") if not chatbot_module: raise HTTPException(status_code=500, detail="Chatbot module not available") - - # Load conversation history for context - conversation_history = await conversation_service.get_conversation_history( - conversation_id=conversation.id, - limit=chatbot.config.get('memory_length', 10), - include_system=False - ) - - # Use the chatbot module to generate a response + + # Use the chatbot module to generate a response (without persisting) response_data = await chatbot_module.chat( chatbot_config=chatbot.config, message=request.message, - conversation_history=conversation_history, + conversation_history=[], # Empty history for test chat user_id=str(user_id) ) - + response_content = response_data.get("response", "I'm sorry, I couldn't generate a response.") - + except Exception as e: # Use fallback response fallback_responses = chatbot.config.get("fallback_responses", [ "I'm sorry, I'm having trouble processing your request right now." ]) response_content = fallback_responses[0] if fallback_responses else "I'm sorry, I couldn't process your request." 
- - # Save assistant message using conversation service - assistant_message = await conversation_service.add_message( - conversation_id=conversation.id, - role="assistant", - content=response_content, - metadata={}, - sources=response_data.get("sources") - ) - + + # Return response without conversation ID (since we're not persisting) return { - "conversation_id": conversation.id, "response": response_content, - "timestamp": assistant_message.timestamp.isoformat() + "sources": response_data.get("sources") } except HTTPException: diff --git a/backend/app/services/llm/security.py b/backend/app/services/llm/security.py index 0d24f62..8aa37be 100644 --- a/backend/app/services/llm/security.py +++ b/backend/app/services/llm/security.py @@ -29,7 +29,7 @@ class SecurityManager: """Setup patterns for prompt injection detection""" self.injection_patterns = [ # Direct instruction injection - r"(?i)(ignore|forget|disregard|override)\s+(previous|all|above|prior)\s+(instructions|rules|prompts)", + r"(?i)(ignore|forget|disregard|override).{0,20}(instructions|rules|prompts)", r"(?i)(new|updated|different)\s+(instructions|rules|system)", r"(?i)act\s+as\s+(if|though)\s+you\s+(are|were)", r"(?i)pretend\s+(to\s+be|you\s+are)", @@ -61,12 +61,12 @@ class SecurityManager: r"(?i)base64\s*:", r"(?i)hex\s*:", r"(?i)unicode\s*:", - r"[A-Za-z0-9+/]{20,}={0,2}", # Potential base64 + r"(?i)\b[A-Za-z0-9+/]{40,}={0,2}\b", # More specific base64 pattern (longer sequences) - # SQL injection patterns (for system prompts) - r"(?i)(union|select|insert|update|delete|drop|create)\s+", - r"(?i)(or|and)\s+1\s*=\s*1", - r"(?i)';?\s*(drop|delete|insert)", + # SQL injection patterns (more specific to reduce false positives) + r"(?i)(union\s+select|select\s+\*|insert\s+into|update\s+\w+\s+set|delete\s+from|drop\s+table|create\s+table)\s", + r"(?i)(or|and)\s+\d+\s*=\s*\d+", + r"(?i)';?\s*(drop\s+table|delete\s+from|insert\s+into)", # Command injection patterns r"(?i)(exec|eval|system|shell|cmd)\s*\(", @@ -88,23 +88,27 @@ class SecurityManager: def validate_prompt_security(self, messages: List[Dict[str, str]]) -> Tuple[bool, float, List[str]]: """ Validate messages for prompt injection attempts - + Returns: Tuple[bool, float, List[str]]: (is_safe, risk_score, detected_patterns) """ detected_patterns = [] total_risk = 0.0 - + + # Check if this is a system/RAG request + is_system_request = self._is_system_request(messages) + for message in messages: content = message.get("content", "") if not content: continue - - # Check against injection patterns + + # Check against injection patterns with context awareness for i, pattern in enumerate(self.compiled_patterns): matches = pattern.findall(content) if matches: - pattern_risk = self._calculate_pattern_risk(i, matches) + # Apply context-aware risk calculation + pattern_risk = self._calculate_pattern_risk(i, matches, message.get("role", "user"), is_system_request) total_risk += pattern_risk detected_patterns.append({ "pattern_index": i, @@ -112,57 +116,97 @@ class SecurityManager: "matches": matches, "risk": pattern_risk }) - - # Additional security checks - total_risk += self._check_message_characteristics(content) - + + # Additional security checks with context awareness + total_risk += self._check_message_characteristics(content, message.get("role", "user"), is_system_request) + # Normalize risk score (0.0 to 1.0) risk_score = min(total_risk / len(messages) if messages else 0.0, 1.0) - is_safe = risk_score < settings.API_SECURITY_RISK_THRESHOLD - + # Never block - always return True for 
is_safe + is_safe = True + if detected_patterns: - logger.warning(f"Detected {len(detected_patterns)} potential injection patterns, risk score: {risk_score}") - + logger.info(f"Detected {len(detected_patterns)} potential injection patterns, risk score: {risk_score} (system_request: {is_system_request})") + return is_safe, risk_score, detected_patterns - def _calculate_pattern_risk(self, pattern_index: int, matches: List) -> float: - """Calculate risk score for a detected pattern""" + def _calculate_pattern_risk(self, pattern_index: int, matches: List, role: str, is_system_request: bool) -> float: + """Calculate risk score for a detected pattern with context awareness""" # Different patterns have different risk levels - high_risk_patterns = [0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 16, 22, 23, 24] # System manipulation, jailbreak + high_risk_patterns = [0, 1, 2, 3, 4, 5, 6, 7, 22, 23, 24] # System manipulation, jailbreak medium_risk_patterns = [8, 9, 10, 11, 12, 13, 17, 18, 19, 20, 21] # Escape attempts, info extraction - + + # Base risk score base_risk = 0.8 if pattern_index in high_risk_patterns else 0.5 if pattern_index in medium_risk_patterns else 0.3 - - # Increase risk based on number of matches - match_multiplier = min(1.0 + (len(matches) - 1) * 0.2, 2.0) - + + # Apply context-specific risk reduction + if is_system_request or role == "system": + # Reduce risk for system messages and RAG content + if pattern_index in [14, 15, 16]: # Encoding patterns (base64, hex, unicode) + base_risk *= 0.2 # Reduce encoding risk by 80% for system content + elif pattern_index in [17, 18, 19]: # SQL patterns + base_risk *= 0.3 # Reduce SQL risk by 70% for system content + else: + base_risk *= 0.6 # Reduce other risks by 40% for system content + + # Increase risk based on number of matches, but cap it + match_multiplier = min(1.0 + (len(matches) - 1) * 0.1, 1.5) # Reduced multiplier + return base_risk * match_multiplier - def _check_message_characteristics(self, content: str) -> float: - """Check message characteristics for additional risk factors""" + def _check_message_characteristics(self, content: str, role: str, is_system_request: bool) -> float: + """Check message characteristics for additional risk factors with context awareness""" risk = 0.0 - - # Excessive length (potential stuffing attack) - if len(content) > 10000: - risk += 0.3 - - # High ratio of special characters + + # Excessive length (potential stuffing attack) - less restrictive for system content + length_threshold = 50000 if is_system_request else 10000 # Much higher threshold for system content + if len(content) > length_threshold: + risk += 0.1 if is_system_request else 0.3 + + # High ratio of special characters - more lenient for system content special_chars = sum(1 for c in content if not c.isalnum() and not c.isspace()) - if len(content) > 0 and special_chars / len(content) > 0.5: - risk += 0.4 - - # Multiple encoding indicators + if len(content) > 0: + char_ratio = special_chars / len(content) + threshold = 0.8 if is_system_request else 0.5 + if char_ratio > threshold: + risk += 0.2 if is_system_request else 0.4 + + # Multiple encoding indicators - reduced risk for system content encoding_indicators = ["base64", "hex", "unicode", "url", "ascii"] found_encodings = sum(1 for indicator in encoding_indicators if indicator.lower() in content.lower()) if found_encodings > 1: - risk += 0.3 - - # Excessive newlines or formatting (potential formatting attacks) - if content.count('\n') > 50 or content.count('\\n') > 50: - risk += 0.2 - + risk += 
0.1 if is_system_request else 0.3 + + # Excessive newlines or formatting - more lenient for system content + newline_threshold = 200 if is_system_request else 50 + if content.count('\n') > newline_threshold or content.count('\\n') > newline_threshold: + risk += 0.1 if is_system_request else 0.2 + return risk - + + def _is_system_request(self, messages: List[Dict[str, str]]) -> bool: + """Determine if this is a system/RAG request""" + if not messages: + return False + + # Check for system messages + for message in messages: + if message.get("role") == "system": + return True + + # Check message content for RAG indicators + for message in messages: + content = message.get("content", "") + if ("document:" in content.lower() or + "context:" in content.lower() or + "source:" in content.lower() or + "retrieved:" in content.lower() or + "citation:" in content.lower() or + "reference:" in content.lower()): + return True + + return False + def create_audit_log( self, user_id: str, @@ -195,11 +239,11 @@ class SecurityManager: audit_hash = self._create_audit_hash(audit_entry) audit_entry["audit_hash"] = audit_hash - # Log based on risk level + # Log based on risk level (never block, only log) if risk_score >= settings.API_SECURITY_RISK_THRESHOLD: - logger.error(f"HIGH RISK LLM REQUEST BLOCKED: {json.dumps(audit_entry)}") + logger.warning(f"HIGH RISK LLM REQUEST DETECTED (NOT BLOCKED): {json.dumps(audit_entry)}") elif risk_score >= settings.API_SECURITY_WARNING_THRESHOLD: - logger.warning(f"MEDIUM RISK LLM REQUEST: {json.dumps(audit_entry)}") + logger.info(f"MEDIUM RISK LLM REQUEST: {json.dumps(audit_entry)}") else: logger.info(f"LLM REQUEST AUDIT: user={user_id}, model={model}, risk={risk_score:.3f}") diff --git a/backend/app/services/llm/service.py b/backend/app/services/llm/service.py index fae28fd..bb8e683 100644 --- a/backend/app/services/llm/service.py +++ b/backend/app/services/llm/service.py @@ -16,6 +16,7 @@ from .models import ( ModelInfo, ProviderStatus, LLMMetrics ) from .config import config_manager, ProviderConfig +from ...core.config import settings from .security import security_manager from .resilience import ResilienceManagerFactory from .metrics import metrics_collector @@ -149,19 +150,17 @@ class LLMService: if not request.messages: raise ValidationError("Messages cannot be empty", field="messages") - # Security validation - # Chatbot and RAG system requests should have relaxed security validation - is_system_request = ( - request.user_id == "rag_system" or - request.user_id == "chatbot_user" or - str(request.user_id).startswith("chatbot_") - ) - + # Security validation (only if enabled) messages_dict = [{"role": msg.role, "content": msg.content} for msg in request.messages] - is_safe, risk_score, detected_patterns = security_manager.validate_prompt_security(messages_dict) - - if not is_safe and not is_system_request: - # Log security violation for regular user requests + + if settings.API_SECURITY_ENABLED: + is_safe, risk_score, detected_patterns = security_manager.validate_prompt_security(messages_dict) + else: + # Security disabled - always safe + is_safe, risk_score, detected_patterns = True, 0.0, [] + + if not is_safe: + # Log security violation security_manager.create_audit_log( user_id=request.user_id, api_key_id=request.api_key_id, @@ -171,7 +170,7 @@ class LLMService: risk_score=risk_score, detected_patterns=[p.get("pattern", "") for p in detected_patterns] ) - + # Record blocked request metrics_collector.record_request( provider="security", @@ -184,18 +183,12 @@ class 
LLMService: user_id=request.user_id, api_key_id=request.api_key_id ) - + raise SecurityError( "Request blocked due to security concerns", risk_score=risk_score, details={"detected_patterns": detected_patterns} ) - elif not is_safe and is_system_request: - # For system requests (chatbot/RAG), log but don't block - logger.info(f"System request contains security patterns (risk_score={risk_score:.2f}) but allowing due to system context") - if detected_patterns: - logger.info(f"Detected patterns: {[p.get('pattern', 'unknown') for p in detected_patterns]}") - # Allow system requests regardless of security patterns # Get provider for model provider_name = self._get_provider_for_model(request.model) @@ -317,25 +310,20 @@ class LLMService: await self.initialize() # Security validation (same as non-streaming) - # Chatbot and RAG system requests should have relaxed security validation - is_system_request = ( - request.user_id == "rag_system" or - request.user_id == "chatbot_user" or - str(request.user_id).startswith("chatbot_") - ) - messages_dict = [{"role": msg.role, "content": msg.content} for msg in request.messages] - is_safe, risk_score, detected_patterns = security_manager.validate_prompt_security(messages_dict) - - if not is_safe and not is_system_request: + + if settings.API_SECURITY_ENABLED: + is_safe, risk_score, detected_patterns = security_manager.validate_prompt_security(messages_dict) + else: + # Security disabled - always safe + is_safe, risk_score, detected_patterns = True, 0.0, [] + + if not is_safe: raise SecurityError( "Streaming request blocked due to security concerns", risk_score=risk_score, details={"detected_patterns": detected_patterns} ) - elif not is_safe and is_system_request: - # For system requests (chatbot/RAG), log but don't block - logger.info(f"System streaming request contains security patterns (risk_score={risk_score:.2f}) but allowing due to system context") # Get provider provider_name = self._get_provider_for_model(request.model) @@ -378,33 +366,22 @@ class LLMService: await self.initialize() # Security validation for embedding input - # RAG system requests (document embedding) should use relaxed security validation - is_rag_system = request.user_id == "rag_system" - - if not is_rag_system: - # Apply normal security validation for user-generated embedding requests - input_text = request.input if isinstance(request.input, str) else " ".join(request.input) + input_text = request.input if isinstance(request.input, str) else " ".join(request.input) + + if settings.API_SECURITY_ENABLED: is_safe, risk_score, detected_patterns = security_manager.validate_prompt_security([ {"role": "user", "content": input_text} ]) - - if not is_safe: - raise SecurityError( - "Embedding request blocked due to security concerns", - risk_score=risk_score, - details={"detected_patterns": detected_patterns} - ) else: - # For RAG system requests, log but don't block (document content can contain legitimate text that triggers patterns) - input_text = request.input if isinstance(request.input, str) else " ".join(request.input) - is_safe, risk_score, detected_patterns = security_manager.validate_prompt_security([ - {"role": "user", "content": input_text} - ]) - - if detected_patterns: - logger.info(f"RAG document embedding contains security patterns (risk_score={risk_score:.2f}) but allowing due to document context") - - # Allow RAG system requests regardless of security patterns + # Security disabled - always safe + is_safe, risk_score, detected_patterns = True, 0.0, [] + + if not is_safe: + 
raise SecurityError( + "Embedding request blocked due to security concerns", + risk_score=risk_score, + details={"detected_patterns": detected_patterns} + ) # Get provider provider_name = self._get_provider_for_model(request.model) diff --git a/backend/modules/chatbot/main.py b/backend/modules/chatbot/main.py index 38ee9ec..6f42f09 100644 --- a/backend/modules/chatbot/main.py +++ b/backend/modules/chatbot/main.py @@ -265,6 +265,7 @@ class ChatbotModule(BaseModule): async def chat_completion(self, request: ChatRequest, user_id: str, db: Session) -> ChatResponse: """Generate chat completion response""" + logger.info("=== CHAT COMPLETION METHOD CALLED ===") # Get chatbot configuration from database db_chatbot = db.query(DBChatbotInstance).filter(DBChatbotInstance.id == request.chatbot_id).first() @@ -363,10 +364,11 @@ class ChatbotModule(BaseModule): metadata={"error": str(e), "fallback": True} ) - async def _generate_response(self, message: str, db_messages: List[DBMessage], + async def _generate_response(self, message: str, db_messages: List[DBMessage], config: ChatbotConfig, context: Optional[Dict] = None, db: Session = None) -> tuple[str, Optional[List]]: """Generate response using LLM with optional RAG""" - + logger.info("=== _generate_response METHOD CALLED ===") + # Lazy load dependencies if not available await self._ensure_dependencies() @@ -426,6 +428,11 @@ class ChatbotModule(BaseModule): logger.warning(f"RAG search traceback: {traceback.format_exc()}") # Build conversation context (includes the current message from db_messages) + logger.info(f"=== CRITICAL DEBUG ===") + logger.info(f"rag_context length: {len(rag_context)}") + logger.info(f"rag_context empty: {not rag_context}") + logger.info(f"rag_context preview: {rag_context[:200] if rag_context else 'EMPTY'}") + logger.info(f"=== END CRITICAL DEBUG ===") messages = self._build_conversation_messages(db_messages, config, rag_context, context) # Note: Current user message is already included in db_messages from the query @@ -511,32 +518,38 @@ class ChatbotModule(BaseModule): # Return fallback if available return "I'm currently unable to process your request. Please try again later.", None - def _build_conversation_messages(self, db_messages: List[DBMessage], config: ChatbotConfig, + def _build_conversation_messages(self, db_messages: List[DBMessage], config: ChatbotConfig, rag_context: str = "", context: Optional[Dict] = None) -> List[Dict]: """Build messages array for LLM completion""" - + messages = [] - - # System prompt + logger.info(f"DEBUG: _build_conversation_messages called. rag_context length: {len(rag_context)}") + + # System prompt - keep it clean without RAG context system_prompt = config.system_prompt - if rag_context: - system_prompt += rag_context if context and context.get('additional_instructions'): - system_prompt += f"\\n\\nAdditional instructions: {context['additional_instructions']}" - + system_prompt += f"\n\nAdditional instructions: {context['additional_instructions']}" + messages.append({"role": "system", "content": system_prompt}) - + logger.info(f"Building messages from {len(db_messages)} database messages") - + # Conversation history (messages are already limited by memory_length in the query) # Reverse to get chronological order # Include ALL messages - the current user message is needed for the LLM to respond! 
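        # Note the RAG change below: retrieved context is no longer appended to
        # the system prompt; it is prepended to the user message at idx == 0 of
        # the reversed list (treated here as the current message) under the
        # heading "Relevant information from knowledge base:".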
for idx, msg in enumerate(reversed(db_messages)): logger.info(f"Processing message {idx}: role={msg.role}, content_preview={msg.content[:50] if msg.content else 'None'}...") if msg.role in ["user", "assistant"]: + # For user messages, prepend RAG context if available + content = msg.content + if msg.role == "user" and rag_context and idx == 0: + # Add RAG context to the current user message (first in reversed order) + content = f"Relevant information from knowledge base:\n{rag_context}\n\nQuestion: {msg.content}" + logger.info("Added RAG context to user message") + messages.append({ "role": msg.role, - "content": msg.content + "content": content }) logger.info(f"Added message with role {msg.role} to LLM messages") else: @@ -677,9 +690,10 @@ class ChatbotModule(BaseModule): return router # API Compatibility Methods - async def chat(self, chatbot_config: Dict[str, Any], message: str, + async def chat(self, chatbot_config: Dict[str, Any], message: str, conversation_history: List = None, user_id: str = "anonymous") -> Dict[str, Any]: """Chat method for API compatibility""" + logger.info("=== CHAT METHOD (API COMPATIBILITY) CALLED ===") logger.info(f"Chat method called with message: {message[:50]}... by user: {user_id}") # Lazy load dependencies @@ -709,9 +723,20 @@ class ChatbotModule(BaseModule): fallback_responses=chatbot_config.get("fallback_responses", []) ) - # Generate response using internal method with empty message history + # For API compatibility, create a temporary DBMessage for the current message + # so RAG context can be properly added + from app.models.chatbot import ChatbotMessage as DBMessage + + # Create a temporary user message with the current message + temp_user_message = DBMessage( + conversation_id="temp_conversation", + role=MessageRole.USER.value, + content=message + ) + + # Generate response using internal method with the current message included response_content, sources = await self._generate_response( - message, [], config, None, db + message, [temp_user_message], config, None, db ) return { diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 5f8f91b..b5af4b6 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -2613,9 +2613,9 @@ } }, "node_modules/axios": { - "version": "1.11.0", - "resolved": "https://registry.npmjs.org/axios/-/axios-1.11.0.tgz", - "integrity": "sha512-1Lx3WLFQWm3ooKDYZD1eXmoGO9fxYQjrycfHFC8P0sCfQVXyROp0p9PFWBehewBOdCwHc+f/b8I0fMto5eSfwA==", + "version": "1.12.2", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.12.2.tgz", + "integrity": "sha512-vMJzPewAlRyOgxV2dU0Cuz2O8zzzx9VYtbJOaBgXFeLc4IV/Eg50n4LowmehOOR61S8ZMpc2K5Sa7g6A4jfkUw==", "license": "MIT", "dependencies": { "follow-redirects": "^1.15.6", diff --git a/frontend/src/app/api-keys/page.tsx b/frontend/src/app/api-keys/page.tsx index e62e2c7..da5bc62 100644 --- a/frontend/src/app/api-keys/page.tsx +++ b/frontend/src/app/api-keys/page.tsx @@ -2,6 +2,7 @@ import { useState, useEffect } from "react"; import { useSearchParams } from "next/navigation"; +import { Suspense } from "react"; import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"; import { Button } from "@/components/ui/button"; import { Input } from "@/components/ui/input"; @@ -93,7 +94,7 @@ const PERMISSION_OPTIONS = [ { value: "llm:embeddings", label: "LLM Embeddings" }, ]; -export default function ApiKeysPage() { +function ApiKeysContent() { const { toast } = useToast(); const searchParams = useSearchParams(); const [apiKeys, 
setApiKeys] = useState([]); @@ -905,4 +906,12 @@ export default function ApiKeysPage() { ); +} + +export default function ApiKeysPage() { + return ( + Loading API keys...}> + + + ); } \ No newline at end of file diff --git a/frontend/src/components/chatbot/ChatInterface.tsx b/frontend/src/components/chatbot/ChatInterface.tsx index fb65658..4e1fe8d 100644 --- a/frontend/src/components/chatbot/ChatInterface.tsx +++ b/frontend/src/components/chatbot/ChatInterface.tsx @@ -87,9 +87,8 @@ export function ChatInterface({ chatbotId, chatbotName, onClose }: ChatInterface const [messages, setMessages] = useState([]) const [input, setInput] = useState("") const [isLoading, setIsLoading] = useState(false) - const [conversationId, setConversationId] = useState(null) const scrollAreaRef = useRef(null) - const { toast } = useToast() + const { success: toastSuccess, error: toastError } = useToast() const scrollToBottom = useCallback(() => { if (scrollAreaRef.current) { @@ -120,23 +119,20 @@ export function ChatInterface({ chatbotId, chatbotName, onClose }: ChatInterface setIsLoading(true) try { - // Build conversation history in OpenAI format + let data: any + + // Use internal API const conversationHistory = messages.map(msg => ({ role: msg.role, content: msg.content })) - - const data = await chatbotApi.sendMessage( + + data = await chatbotApi.sendMessage( chatbotId, messageToSend, - conversationId || undefined, + undefined, // No conversation ID conversationHistory ) - - // Update conversation ID if it's a new conversation - if (!conversationId && data.conversation_id) { - setConversationId(data.conversation_id) - } const assistantMessage: ChatMessage = { id: data.message_id || generateTimestampId('msg'), @@ -153,16 +149,16 @@ export function ChatInterface({ chatbotId, chatbotName, onClose }: ChatInterface // More specific error handling if (appError.code === 'UNAUTHORIZED') { - toast.error("Authentication Required", "Please log in to continue chatting.") + toastError("Authentication Required", "Please log in to continue chatting.") } else if (appError.code === 'NETWORK_ERROR') { - toast.error("Connection Error", "Please check your internet connection and try again.") + toastError("Connection Error", "Please check your internet connection and try again.") } else { - toast.error("Message Failed", appError.message || "Failed to send message. Please try again.") + toastError("Message Failed", appError.message || "Failed to send message. 
Please try again.") } } finally { setIsLoading(false) } - }, [input, isLoading, chatbotId, conversationId, messages, toast]) + }, [input, isLoading, chatbotId, messages, toastError]) const handleKeyPress = useCallback((e: React.KeyboardEvent) => { if (e.key === 'Enter' && !e.shiftKey) { @@ -174,11 +170,11 @@ export function ChatInterface({ chatbotId, chatbotName, onClose }: ChatInterface const copyMessage = useCallback(async (content: string) => { try { await navigator.clipboard.writeText(content) - toast.success("Copied", "Message copied to clipboard") + toastSuccess("Copied", "Message copied to clipboard") } catch (error) { - toast.error("Copy Failed", "Unable to copy message to clipboard") + toastError("Copy Failed", "Unable to copy message to clipboard") } - }, [toast]) + }, [toastSuccess, toastError]) const formatTime = useCallback((date: Date) => { return date.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' }) diff --git a/frontend/src/components/chatbot/ChatbotManager.tsx b/frontend/src/components/chatbot/ChatbotManager.tsx index 9178c66..6221d4e 100644 --- a/frontend/src/components/chatbot/ChatbotManager.tsx +++ b/frontend/src/components/chatbot/ChatbotManager.tsx @@ -138,6 +138,7 @@ export function ChatbotManager() { const [editingChatbot, setEditingChatbot] = useState(null) const [showChatInterface, setShowChatInterface] = useState(false) const [testingChatbot, setTestingChatbot] = useState(null) + const [chatbotApiKeys, setChatbotApiKeys] = useState>({}) const { toast } = useToast() // New chatbot form state diff --git a/frontend/src/lib/api-client.ts b/frontend/src/lib/api-client.ts index 126f733..4df0089 100644 --- a/frontend/src/lib/api-client.ts +++ b/frontend/src/lib/api-client.ts @@ -86,11 +86,31 @@ export const chatbotApi = { deleteChatbot(id: string) { return apiClient.delete(`/api-internal/v1/chatbot/delete/${encodeURIComponent(id)}`) }, + // Legacy method with JWT auth (to be deprecated) sendMessage(chatbotId: string, message: string, conversationId?: string, history?: Array<{role: string; content: string}>) { - const body: any = { chatbot_id: chatbotId, message } + const body: any = { message } if (conversationId) body.conversation_id = conversationId if (history) body.history = history - return apiClient.post('/api-internal/v1/chatbot/chat', body) + return apiClient.post(`/api-internal/v1/chatbot/chat/${encodeURIComponent(chatbotId)}`, body) }, + // OpenAI-compatible chatbot API with API key auth + sendOpenAIChatMessage(chatbotId: string, messages: Array<{role: string; content: string}>, apiKey: string, options?: { + temperature?: number + max_tokens?: number + stream?: boolean + }) { + const body: any = { + messages, + ...options + } + return fetch(`/api/v1/chatbot/external/${encodeURIComponent(chatbotId)}/chat/completions`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${apiKey}` + }, + body: JSON.stringify(body) + }).then(res => res.json()) + } } From f58a76ac596439bf6f69963fa278fd7d47f2d288 Mon Sep 17 00:00:00 2001 From: Aljaz Ceru Date: Sun, 21 Sep 2025 06:49:55 +0200 Subject: [PATCH 03/13] ratelimiting and rag --- backend/app/core/config.py | 27 +- backend/app/main.py | 4 + backend/app/middleware/rate_limiting.py | 234 +++++++++++------- backend/app/middleware/security.py | 30 +-- .../services/enhanced_embedding_service.py | 201 +++++++++++++++ backend/app/services/llm/config.py | 15 +- backend/modules/rag/main.py | 29 ++- 7 files changed, 410 insertions(+), 130 deletions(-) create mode 100644 
backend/app/services/enhanced_embedding_service.py diff --git a/backend/app/core/config.py b/backend/app/core/config.py index c5cb8c3..f3ac614 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -82,18 +82,25 @@ class Settings(BaseSettings): # Rate Limiting Configuration API_RATE_LIMITING_ENABLED: bool = os.getenv("API_RATE_LIMITING_ENABLED", "True").lower() == "true" - - # Authenticated users (JWT token) - API_RATE_LIMIT_AUTHENTICATED_PER_MINUTE: int = int(os.getenv("API_RATE_LIMIT_AUTHENTICATED_PER_MINUTE", "300")) - API_RATE_LIMIT_AUTHENTICATED_PER_HOUR: int = int(os.getenv("API_RATE_LIMIT_AUTHENTICATED_PER_HOUR", "5000")) - + + # PrivateMode Standard tier limits (organization-level, not per user) + # These are shared across all API keys and users in the organization + PRIVATEMODE_REQUESTS_PER_MINUTE: int = int(os.getenv("PRIVATEMODE_REQUESTS_PER_MINUTE", "20")) + PRIVATEMODE_REQUESTS_PER_HOUR: int = int(os.getenv("PRIVATEMODE_REQUESTS_PER_HOUR", "1200")) + PRIVATEMODE_PROMPT_TOKENS_PER_MINUTE: int = int(os.getenv("PRIVATEMODE_PROMPT_TOKENS_PER_MINUTE", "20000")) + PRIVATEMODE_COMPLETION_TOKENS_PER_MINUTE: int = int(os.getenv("PRIVATEMODE_COMPLETION_TOKENS_PER_MINUTE", "10000")) + + # Per-user limits (additional protection on top of organization limits) + API_RATE_LIMIT_AUTHENTICATED_PER_MINUTE: int = int(os.getenv("API_RATE_LIMIT_AUTHENTICATED_PER_MINUTE", "20")) # Match PrivateMode + API_RATE_LIMIT_AUTHENTICATED_PER_HOUR: int = int(os.getenv("API_RATE_LIMIT_AUTHENTICATED_PER_HOUR", "1200")) + # API key users (programmatic access) - API_RATE_LIMIT_API_KEY_PER_MINUTE: int = int(os.getenv("API_RATE_LIMIT_API_KEY_PER_MINUTE", "1000")) - API_RATE_LIMIT_API_KEY_PER_HOUR: int = int(os.getenv("API_RATE_LIMIT_API_KEY_PER_HOUR", "20000")) - + API_RATE_LIMIT_API_KEY_PER_MINUTE: int = int(os.getenv("API_RATE_LIMIT_API_KEY_PER_MINUTE", "20")) # Match PrivateMode + API_RATE_LIMIT_API_KEY_PER_HOUR: int = int(os.getenv("API_RATE_LIMIT_API_KEY_PER_HOUR", "1200")) + # Premium/Enterprise API keys - API_RATE_LIMIT_PREMIUM_PER_MINUTE: int = int(os.getenv("API_RATE_LIMIT_PREMIUM_PER_MINUTE", "5000")) - API_RATE_LIMIT_PREMIUM_PER_HOUR: int = int(os.getenv("API_RATE_LIMIT_PREMIUM_PER_HOUR", "100000")) + API_RATE_LIMIT_PREMIUM_PER_MINUTE: int = int(os.getenv("API_RATE_LIMIT_PREMIUM_PER_MINUTE", "20")) # Match PrivateMode + API_RATE_LIMIT_PREMIUM_PER_HOUR: int = int(os.getenv("API_RATE_LIMIT_PREMIUM_PER_HOUR", "1200")) # Security Thresholds API_SECURITY_RISK_THRESHOLD: float = float(os.getenv("API_SECURITY_RISK_THRESHOLD", "0.8")) # Block requests above this risk score diff --git a/backend/app/main.py b/backend/app/main.py index e0466e6..40d51a3 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -139,6 +139,10 @@ setup_analytics_middleware(app) from app.middleware.security import setup_security_middleware setup_security_middleware(app, enabled=settings.API_SECURITY_ENABLED) +# Add rate limiting middleware only for specific endpoints +from app.middleware.rate_limiting import RateLimitMiddleware +app.add_middleware(RateLimitMiddleware) + # Exception handlers @app.exception_handler(CustomHTTPException) diff --git a/backend/app/middleware/rate_limiting.py b/backend/app/middleware/rate_limiting.py index 611a67a..f6e1901 100644 --- a/backend/app/middleware/rate_limiting.py +++ b/backend/app/middleware/rate_limiting.py @@ -7,6 +7,7 @@ import redis from typing import Dict, Optional from fastapi import Request, HTTPException, status from fastapi.responses import JSONResponse +from 
starlette.middleware.base import BaseHTTPMiddleware import asyncio from datetime import datetime, timedelta @@ -155,96 +156,153 @@ class RateLimiter: rate_limiter = RateLimiter() -async def rate_limit_middleware(request: Request, call_next): - """ - Rate limiting middleware for FastAPI - """ - - # Skip rate limiting for health checks and static files - if request.url.path in ["/health", "/", "/api/v1/docs", "/api/v1/openapi.json"]: +class RateLimitMiddleware(BaseHTTPMiddleware): + """Rate limiting middleware for FastAPI""" + + def __init__(self, app): + super().__init__(app) + self.rate_limiter = RateLimiter() + logger.info("RateLimitMiddleware initialized") + + async def dispatch(self, request: Request, call_next): + """Process request through rate limiting""" + + # Skip rate limiting if disabled in settings + if not settings.API_RATE_LIMITING_ENABLED: + response = await call_next(request) + return response + + # Skip rate limiting for all internal API endpoints (platform operations) + if request.url.path.startswith("/api-internal/v1/"): + response = await call_next(request) + return response + + # Only apply rate limiting to privatemode.ai proxy endpoints (OpenAI-compatible API and LLM service) + # Skip for all other endpoints + if not (request.url.path.startswith("/api/v1/chat/completions") or + request.url.path.startswith("/api/v1/embeddings") or + request.url.path.startswith("/api/v1/models") or + request.url.path.startswith("/api/v1/llm/")): + response = await call_next(request) + return response + + # Skip rate limiting for health checks and static files + if request.url.path in ["/health", "/", "/api/v1/docs", "/api/v1/openapi.json"]: + response = await call_next(request) + return response + + # Get client IP + client_ip = request.client.host + forwarded_for = request.headers.get("X-Forwarded-For") + if forwarded_for: + client_ip = forwarded_for.split(",")[0].strip() + + # Check for API key in headers + api_key = None + auth_header = request.headers.get("Authorization") + if auth_header and auth_header.startswith("Bearer "): + api_key = auth_header[7:] + elif request.headers.get("X-API-Key"): + api_key = request.headers.get("X-API-Key") + + # Determine rate limiting strategy + headers = {} + is_allowed = True + + if api_key: + # API key-based rate limiting + api_key_key = f"api_key:{api_key}" + + # First check organization-wide limits (PrivateMode limits are org-wide) + org_key = "organization:privatemode" + + # Check organization per-minute limit + org_allowed_minute, org_headers_minute = await self.rate_limiter.check_rate_limit( + org_key, settings.PRIVATEMODE_REQUESTS_PER_MINUTE, 60, "minute" + ) + + # Check organization per-hour limit + org_allowed_hour, org_headers_hour = await self.rate_limiter.check_rate_limit( + org_key, settings.PRIVATEMODE_REQUESTS_PER_HOUR, 3600, "hour" + ) + + # If organization limits are exceeded, return 429 + if not (org_allowed_minute and org_allowed_hour): + logger.warning(f"Organization rate limit exceeded for {org_key}") + return JSONResponse( + status_code=status.HTTP_429_TOO_MANY_REQUESTS, + content={"detail": "Organization rate limit exceeded"}, + headers=org_headers_minute + ) + + # Then check per-API key limits + limit_per_minute = settings.API_RATE_LIMIT_API_KEY_PER_MINUTE + limit_per_hour = settings.API_RATE_LIMIT_API_KEY_PER_HOUR + + # Check per-minute limit + is_allowed_minute, headers_minute = await self.rate_limiter.check_rate_limit( + api_key_key, limit_per_minute, 60, "minute" + ) + + # Check per-hour limit + is_allowed_hour, 
headers_hour = await self.rate_limiter.check_rate_limit( + api_key_key, limit_per_hour, 3600, "hour" + ) + + is_allowed = is_allowed_minute and is_allowed_hour + headers = headers_minute # Use minute headers for response + + else: + # IP-based rate limiting for unauthenticated requests + rate_limit_key = f"ip:{client_ip}" + + # More restrictive limits for unauthenticated requests + limit_per_minute = 20 # Hardcoded for unauthenticated users + limit_per_hour = 100 + + # Check per-minute limit + is_allowed_minute, headers_minute = await self.rate_limiter.check_rate_limit( + rate_limit_key, limit_per_minute, 60, "minute" + ) + + # Check per-hour limit + is_allowed_hour, headers_hour = await self.rate_limiter.check_rate_limit( + rate_limit_key, limit_per_hour, 3600, "hour" + ) + + is_allowed = is_allowed_minute and is_allowed_hour + headers = headers_minute # Use minute headers for response + + # If rate limit exceeded, return 429 + if not is_allowed: + return JSONResponse( + status_code=status.HTTP_429_TOO_MANY_REQUESTS, + content={ + "error": "RATE_LIMIT_EXCEEDED", + "message": "Rate limit exceeded. Please try again later.", + "details": { + "limit": headers["X-RateLimit-Limit"], + "reset_time": headers["X-RateLimit-Reset"] + } + }, + headers={k: str(v) for k, v in headers.items()} + ) + + # Continue with request response = await call_next(request) + + # Add rate limit headers to response + for key, value in headers.items(): + response.headers[key] = str(value) + return response - - # Get client IP - client_ip = request.client.host - forwarded_for = request.headers.get("X-Forwarded-For") - if forwarded_for: - client_ip = forwarded_for.split(",")[0].strip() - - # Check for API key in headers - api_key = None - auth_header = request.headers.get("Authorization") - if auth_header and auth_header.startswith("Bearer "): - api_key = auth_header[7:] - elif request.headers.get("X-API-Key"): - api_key = request.headers.get("X-API-Key") - - # Determine rate limiting strategy - if api_key: - # API key-based rate limiting - rate_limit_key = f"api_key:{api_key}" - - # Get API key limits from database (simplified - would implement proper lookup) - limit_per_minute = 100 # Default limit - limit_per_hour = 1000 # Default limit - - # Check per-minute limit - is_allowed_minute, headers_minute = await rate_limiter.check_rate_limit( - rate_limit_key, limit_per_minute, 60, "minute" - ) - - # Check per-hour limit - is_allowed_hour, headers_hour = await rate_limiter.check_rate_limit( - rate_limit_key, limit_per_hour, 3600, "hour" - ) - - is_allowed = is_allowed_minute and is_allowed_hour - headers = headers_minute # Use minute headers for response - - else: - # IP-based rate limiting for unauthenticated requests - rate_limit_key = f"ip:{client_ip}" - - # More restrictive limits for unauthenticated requests - limit_per_minute = 20 - limit_per_hour = 100 - - # Check per-minute limit - is_allowed_minute, headers_minute = await rate_limiter.check_rate_limit( - rate_limit_key, limit_per_minute, 60, "minute" - ) - - # Check per-hour limit - is_allowed_hour, headers_hour = await rate_limiter.check_rate_limit( - rate_limit_key, limit_per_hour, 3600, "hour" - ) - - is_allowed = is_allowed_minute and is_allowed_hour - headers = headers_minute # Use minute headers for response - - # If rate limit exceeded, return 429 - if not is_allowed: - return JSONResponse( - status_code=status.HTTP_429_TOO_MANY_REQUESTS, - content={ - "error": "RATE_LIMIT_EXCEEDED", - "message": "Rate limit exceeded. 
Please try again later.", - "details": { - "limit": headers["X-RateLimit-Limit"], - "reset_time": headers["X-RateLimit-Reset"] - } - }, - headers={k: str(v) for k, v in headers.items()} - ) - - # Continue with request - response = await call_next(request) - - # Add rate limit headers to response - for key, value in headers.items(): - response.headers[key] = str(value) - - return response + + +# Keep the old function for backward compatibility +async def rate_limit_middleware(request: Request, call_next): + """Legacy function - use RateLimitMiddleware class instead""" + middleware = RateLimitMiddleware(None) + return await middleware.dispatch(request, call_next) class RateLimitExceeded(HTTPException): diff --git a/backend/app/middleware/security.py b/backend/app/middleware/security.py index 6efc1f4..57d2ebe 100644 --- a/backend/app/middleware/security.py +++ b/backend/app/middleware/security.py @@ -61,12 +61,12 @@ class SecurityMiddleware(BaseHTTPMiddleware): if analysis.is_threat and (analysis.should_block or analysis.risk_score >= settings.API_SECURITY_WARNING_THRESHOLD): await self._log_security_event(request, analysis) - # Check if request should be blocked - if analysis.should_block: + # Check if request should be blocked (excluding rate limiting) + if analysis.should_block and not analysis.rate_limit_exceeded: threat_detection_service.stats['threats_blocked'] += 1 logger.warning(f"Blocked request from {request.client.host if request.client else 'unknown'}: " f"risk_score={analysis.risk_score:.3f}, threats={len(analysis.threats)}") - + # Return security block response return self._create_block_response(analysis) @@ -136,17 +136,13 @@ class SecurityMiddleware(BaseHTTPMiddleware): """Create response for blocked requests""" # Determine status code based on threat type status_code = 403 # Forbidden by default - - # Rate limiting gets 429 - if analysis.rate_limit_exceeded: - status_code = 429 - + # Critical threats get 403 for threat in analysis.threats: if threat.threat_type in ["command_injection", "sql_injection"]: status_code = 403 break - + response_data = { "error": "Security Policy Violation", "message": "Request blocked due to security policy violation", @@ -155,24 +151,12 @@ class SecurityMiddleware(BaseHTTPMiddleware): "threat_count": len(analysis.threats), "recommendations": analysis.recommendations[:3] # Limit to first 3 recommendations } - - # Add rate limiting info if applicable - if analysis.rate_limit_exceeded: - response_data["error"] = "Rate Limit Exceeded" - response_data["message"] = f"Rate limit exceeded for {analysis.auth_level.value} user" - response_data["retry_after"] = "60" # Suggest retry after 60 seconds - + response = JSONResponse( content=response_data, status_code=status_code ) - - # Add rate limiting headers - if analysis.rate_limit_exceeded: - response.headers["Retry-After"] = "60" - response.headers["X-RateLimit-Limit"] = "See API documentation" - response.headers["X-RateLimit-Reset"] = str(int(time.time() + 60)) - + return response def _add_security_headers(self, response: Response) -> Response: diff --git a/backend/app/services/enhanced_embedding_service.py b/backend/app/services/enhanced_embedding_service.py new file mode 100644 index 0000000..284773f --- /dev/null +++ b/backend/app/services/enhanced_embedding_service.py @@ -0,0 +1,201 @@ +# Enhanced Embedding Service with Rate Limiting Handling +""" +Enhanced embedding service with robust rate limiting and retry logic +""" + +import asyncio +import logging +import time +from typing import List, Dict, 
Any, Optional +import numpy as np +from datetime import datetime, timedelta + +from .embedding_service import EmbeddingService +from app.core.config import settings + +logger = logging.getLogger(__name__) + + +class EnhancedEmbeddingService(EmbeddingService): + """Enhanced embedding service with rate limiting handling""" + + def __init__(self, model_name: str = "intfloat/multilingual-e5-large-instruct"): + super().__init__(model_name) + self.rate_limit_tracker = { + 'requests_count': 0, + 'window_start': time.time(), + 'window_size': 60, # 1 minute window + 'max_requests_per_minute': int(getattr(settings, 'RAG_EMBEDDING_MAX_REQUESTS_PER_MINUTE', 60)), # Configurable + 'retry_delays': [int(x) for x in getattr(settings, 'RAG_EMBEDDING_RETRY_DELAYS', '1,2,4,8,16').split(',')], # Exponential backoff + 'delay_between_batches': float(getattr(settings, 'RAG_EMBEDDING_DELAY_BETWEEN_BATCHES', 0.5)), + 'last_rate_limit_error': None + } + + async def get_embeddings_with_retry(self, texts: List[str], max_retries: int = None) -> tuple[List[List[float]], bool]: + """ + Get embeddings with rate limiting and retry logic + """ + if max_retries is None: + max_retries = int(getattr(settings, 'RAG_EMBEDDING_RETRY_COUNT', 3)) + + batch_size = int(getattr(settings, 'RAG_EMBEDDING_BATCH_SIZE', 5)) + + if not self.initialized: + logger.warning("Embedding service not initialized, using fallback") + return self._generate_fallback_embeddings(texts), False + + embeddings = [] + success = True + + for i in range(0, len(texts), batch_size): + batch = texts[i:i+batch_size] + batch_embeddings, batch_success = await self._get_batch_embeddings_with_retry(batch, max_retries) + embeddings.extend(batch_embeddings) + success = success and batch_success + + # Add delay between batches to avoid rate limiting + if i + batch_size < len(texts): + delay = self.rate_limit_tracker['delay_between_batches'] + await asyncio.sleep(delay) # Configurable delay between batches + + return embeddings, success + + async def _get_batch_embeddings_with_retry(self, texts: List[str], max_retries: int) -> tuple[List[List[float]], bool]: + """Get embeddings for a batch with retry logic""" + last_error = None + + for attempt in range(max_retries + 1): + try: + # Check rate limit before making request + if self._is_rate_limited(): + delay = self._get_rate_limit_delay() + logger.warning(f"Rate limit detected, waiting {delay} seconds") + await asyncio.sleep(delay) + continue + + # Make the request + embeddings = await self._get_embeddings_batch_impl(texts) + + # Update rate limit tracker on success + self._update_rate_limit_tracker(success=True) + + return embeddings, True + + except Exception as e: + last_error = e + error_msg = str(e).lower() + + # Check if it's a rate limit error + if any(indicator in error_msg for indicator in ['429', 'rate limit', 'too many requests', 'quota exceeded']): + logger.warning(f"Rate limit error (attempt {attempt + 1}/{max_retries + 1}): {e}") + self._update_rate_limit_tracker(success=False) + + if attempt < max_retries: + delay = self.rate_limit_tracker['retry_delays'][min(attempt, len(self.rate_limit_tracker['retry_delays']) - 1)] + logger.info(f"Retrying in {delay} seconds...") + await asyncio.sleep(delay) + continue + else: + logger.error(f"Max retries exceeded for rate limit, using fallback embeddings") + return self._generate_fallback_embeddings(texts), False + else: + # Non-rate-limit error + logger.error(f"Error generating embeddings: {e}") + if attempt < max_retries: + delay = 
self.rate_limit_tracker['retry_delays'][min(attempt, len(self.rate_limit_tracker['retry_delays']) - 1)] + await asyncio.sleep(delay) + else: + logger.error("Max retries exceeded, using fallback embeddings") + return self._generate_fallback_embeddings(texts), False + + # If we get here, all retries failed + logger.error(f"All retries failed, last error: {last_error}") + return self._generate_fallback_embeddings(texts), False + + async def _get_embeddings_batch_impl(self, texts: List[str]) -> List[List[float]]: + """Implementation of getting embeddings for a batch""" + from app.services.llm.service import llm_service + from app.services.llm.models import EmbeddingRequest + + embeddings = [] + + for text in texts: + # Truncate text if needed + max_chars = 1600 + truncated_text = text[:max_chars] if len(text) > max_chars else text + + llm_request = EmbeddingRequest( + model=self.model_name, + input=truncated_text, + user_id="rag_system", + api_key_id=0 + ) + + response = await llm_service.create_embedding(llm_request) + + if response.data and len(response.data) > 0: + embedding = response.data[0].embedding + if embedding: + embeddings.append(embedding) + if not hasattr(self, '_dimension_confirmed'): + self.dimension = len(embedding) + self._dimension_confirmed = True + else: + raise ValueError("Empty embedding in response") + else: + raise ValueError("Invalid response structure") + + return embeddings + + def _is_rate_limited(self) -> bool: + """Check if we're currently rate limited""" + now = time.time() + window_start = self.rate_limit_tracker['window_start'] + + # Reset window if it's expired + if now - window_start > self.rate_limit_tracker['window_size']: + self.rate_limit_tracker['requests_count'] = 0 + self.rate_limit_tracker['window_start'] = now + return False + + # Check if we've exceeded the limit + return self.rate_limit_tracker['requests_count'] >= self.rate_limit_tracker['max_requests_per_minute'] + + def _get_rate_limit_delay(self) -> float: + """Get delay to wait for rate limit reset""" + now = time.time() + window_end = self.rate_limit_tracker['window_start'] + self.rate_limit_tracker['window_size'] + return max(0, window_end - now) + + def _update_rate_limit_tracker(self, success: bool): + """Update the rate limit tracker""" + now = time.time() + + # Reset window if it's expired + if now - self.rate_limit_tracker['window_start'] > self.rate_limit_tracker['window_size']: + self.rate_limit_tracker['requests_count'] = 0 + self.rate_limit_tracker['window_start'] = now + + # Increment counter on successful requests + if success: + self.rate_limit_tracker['requests_count'] += 1 + + async def get_embedding_stats(self) -> Dict[str, Any]: + """Get embedding service statistics including rate limiting info""" + base_stats = await self.get_stats() + + return { + **base_stats, + "rate_limit_info": { + "requests_in_current_window": self.rate_limit_tracker['requests_count'], + "max_requests_per_minute": self.rate_limit_tracker['max_requests_per_minute'], + "window_reset_in_seconds": max(0, + self.rate_limit_tracker['window_start'] + self.rate_limit_tracker['window_size'] - time.time() + ), + "last_rate_limit_error": self.rate_limit_tracker['last_rate_limit_error'] + } + } + + +# Global enhanced embedding service instance +enhanced_embedding_service = EnhancedEmbeddingService() \ No newline at end of file diff --git a/backend/app/services/llm/config.py b/backend/app/services/llm/config.py index 8ac8fb8..61a8576 100644 --- a/backend/app/services/llm/config.py +++ 
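The retry ladder above reduces to a small reusable pattern: detect a rate-limit failure by sniffing the error message, then sleep through a capped exponential schedule. A sketch of just that pattern (the 1, 2, 4, 8, 16 schedule mirrors the RAG_EMBEDDING_RETRY_DELAYS default wired in from settings):

import asyncio

RETRY_DELAYS = [1, 2, 4, 8, 16]  # seconds, mirroring the default retry_delays above


async def call_with_backoff(call, max_retries: int = 3):
    """Retry an async callable on rate-limit errors with capped exponential backoff."""
    for attempt in range(max_retries + 1):
        try:
            return await call()
        except Exception as exc:
            msg = str(exc).lower()
            if not any(tag in msg for tag in ("429", "rate limit", "too many requests")):
                raise                                   # only back off on rate-limit errors
            if attempt == max_retries:
                raise                                   # caller falls back (e.g. fallback embeddings)
            delay = RETRY_DELAYS[min(attempt, len(RETRY_DELAYS) - 1)]
            await asyncio.sleep(delay)                  # 1s, 2s, 4s, 8s, 16s
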
b/backend/app/services/llm/config.py @@ -65,7 +65,16 @@ class LLMServiceConfig(BaseModel): # Provider configurations providers: Dict[str, ProviderConfig] = Field(default_factory=dict, description="Provider configurations") - + + # Token rate limiting (organization-wide) + token_limits_per_minute: Dict[str, int] = Field( + default_factory=lambda: { + "prompt_tokens": 20000, # PrivateMode Standard tier + "completion_tokens": 10000 # PrivateMode Standard tier + }, + description="Token rate limits per minute (organization-wide)" + ) + # Model routing (model_name -> provider_name) model_routing: Dict[str, str] = Field(default_factory=dict, description="Model to provider routing") @@ -91,8 +100,8 @@ def create_default_config() -> LLMServiceConfig: supported_models=[], # Will be populated dynamically from proxy capabilities=["chat", "embeddings", "tee"], priority=1, - max_requests_per_minute=100, - max_requests_per_hour=2000, + max_requests_per_minute=20, # PrivateMode Standard tier limit: 20 req/min + max_requests_per_hour=1200, # 20 req/min * 60 min supports_streaming=True, supports_function_calling=True, max_context_window=128000, diff --git a/backend/modules/rag/main.py b/backend/modules/rag/main.py index 1871b0d..b6c90b7 100644 --- a/backend/modules/rag/main.py +++ b/backend/modules/rag/main.py @@ -60,6 +60,7 @@ import tiktoken from app.core.config import settings from app.core.logging import log_module_event from app.services.base_module import BaseModule, Permission +from app.services.enhanced_embedding_service import enhanced_embedding_service @dataclass @@ -1125,9 +1126,17 @@ class RAGModule(BaseModule): # Chunk the document chunks = self._chunk_text(content) - # Generate embeddings for all chunks in batch (more efficient) - embeddings = await self._generate_embeddings(chunks) - + # Generate embeddings with enhanced rate limiting handling + embeddings, success = await enhanced_embedding_service.get_embeddings_with_retry(chunks) + + # Log if fallback embeddings were used + if not success: + logger.warning(f"Used fallback embeddings for document {doc_id} - search quality may be degraded") + log_module_event("rag", "fallback_embeddings_used", { + "document_id": doc_id, + "content_preview": content[:100] + "..." 
if len(content) > 100 else content + }) + # Create document points points = [] for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)): @@ -1188,9 +1197,17 @@ class RAGModule(BaseModule): # Chunk the document chunks = self._chunk_text(processed_doc.content) - # Generate embeddings for all chunks in batch (more efficient) - embeddings = await self._generate_embeddings(chunks) - + # Generate embeddings with enhanced rate limiting handling + embeddings, success = await enhanced_embedding_service.get_embeddings_with_retry(chunks) + + # Log if fallback embeddings were used + if not success: + logger.warning(f"Used fallback embeddings for document {processed_doc.id} - search quality may be degraded") + log_module_event("rag", "fallback_embeddings_used", { + "document_id": processed_doc.id, + "filename": processed_doc.original_filename + }) + # Create document points with enhanced metadata points = [] for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)): From a2ee959ec951b0ecedd5112b7e467dc237b7a504 Mon Sep 17 00:00:00 2001 From: Aljaz Ceru Date: Sun, 21 Sep 2025 18:44:02 +0200 Subject: [PATCH 04/13] rag improvements --- .../app/services/llm/token_rate_limiter.py | 153 +++++++++++ backend/modules/chatbot/main.py | 79 +++--- backend/modules/rag/main.py | 255 +++++++++++++++--- 3 files changed, 401 insertions(+), 86 deletions(-) create mode 100644 backend/app/services/llm/token_rate_limiter.py diff --git a/backend/app/services/llm/token_rate_limiter.py b/backend/app/services/llm/token_rate_limiter.py new file mode 100644 index 0000000..2338a03 --- /dev/null +++ b/backend/app/services/llm/token_rate_limiter.py @@ -0,0 +1,153 @@ +""" +Token-based rate limiting for LLM service +""" + +import time +import redis +from typing import Dict, Optional, Tuple +from datetime import datetime, timedelta +from ..core.config import settings +from ..core.logging import get_logger + +logger = get_logger(__name__) + + +class TokenRateLimiter: + """Token-based rate limiting implementation""" + + def __init__(self): + try: + self.redis_client = redis.from_url(settings.REDIS_URL, decode_responses=True) + self.redis_client.ping() + logger.info("Token rate limiter initialized with Redis backend") + except Exception as e: + logger.warning(f"Redis not available for token rate limiting: {e}") + self.redis_client = None + # Fall back to in-memory rate limiting + self.in_memory_store = {} + logger.info("Token rate limiter using in-memory fallback") + + async def check_token_limits( + self, + provider: str, + prompt_tokens: int, + completion_tokens: int = 0 + ) -> Tuple[bool, Dict[str, str]]: + """ + Check if token usage is within limits + + Args: + provider: Provider name (e.g., "privatemode") + prompt_tokens: Number of prompt tokens to use + completion_tokens: Number of completion tokens to use + + Returns: + Tuple of (is_allowed, headers) + """ + # Get token limits from configuration + from .config import get_config + config = get_config() + token_limits = config.token_limits_per_minute + + # Check organization-wide limits + org_key = f"tokens:org:{provider}" + + # Get current usage + current_usage = await self._get_token_usage(org_key) + + # Calculate new usage + new_prompt_tokens = current_usage.get("prompt_tokens", 0) + prompt_tokens + new_completion_tokens = current_usage.get("completion_tokens", 0) + completion_tokens + + # Check limits + prompt_limit = token_limits.get("prompt_tokens", 20000) + completion_limit = token_limits.get("completion_tokens", 10000) + + is_allowed = ( + new_prompt_tokens <= 
prompt_limit and + new_completion_tokens <= completion_limit + ) + + if is_allowed: + # Update usage + await self._update_token_usage(org_key, prompt_tokens, completion_tokens) + logger.debug(f"Token usage updated: {new_prompt_tokens}/{prompt_limit} prompt, " + f"{new_completion_tokens}/{completion_limit} completion") + + # Calculate remaining tokens + remaining_prompt = max(0, prompt_limit - new_prompt_tokens) + remaining_completion = max(0, completion_limit - new_completion_tokens) + + # Create headers + headers = { + "X-TokenLimit-Prompt-Remaining": str(remaining_prompt), + "X-TokenLimit-Completion-Remaining": str(remaining_completion), + "X-TokenLimit-Prompt-Limit": str(prompt_limit), + "X-TokenLimit-Completion-Limit": str(completion_limit), + "X-TokenLimit-Reset": str(int(time.time() + 60)) # Reset in 1 minute + } + + if not is_allowed: + logger.warning(f"Token rate limit exceeded for {provider}. " + f"Requested: {prompt_tokens} prompt, {completion_tokens} completion. " + f"Current: {current_usage}") + + return is_allowed, headers + + async def _get_token_usage(self, key: str) -> Dict[str, int]: + """Get current token usage""" + if self.redis_client: + try: + data = self.redis_client.hgetall(key) + if data: + return { + "prompt_tokens": int(data.get("prompt_tokens", 0)), + "completion_tokens": int(data.get("completion_tokens", 0)), + "updated_at": float(data.get("updated_at", time.time())) + } + except Exception as e: + logger.error(f"Error getting token usage from Redis: {e}") + + # Fallback to in-memory + return self.in_memory_store.get(key, {"prompt_tokens": 0, "completion_tokens": 0}) + + async def _update_token_usage(self, key: str, prompt_tokens: int, completion_tokens: int): + """Update token usage""" + if self.redis_client: + try: + pipe = self.redis_client.pipeline() + pipe.hincrby(key, "prompt_tokens", prompt_tokens) + pipe.hincrby(key, "completion_tokens", completion_tokens) + pipe.hset(key, "updated_at", time.time()) + pipe.expire(key, 60) # Expire after 1 minute + pipe.execute() + except Exception as e: + logger.error(f"Error updating token usage in Redis: {e}") + # Fallback to in-memory + self._update_in_memory(key, prompt_tokens, completion_tokens) + else: + self._update_in_memory(key, prompt_tokens, completion_tokens) + + def _update_in_memory(self, key: str, prompt_tokens: int, completion_tokens: int): + """Update in-memory token usage""" + if key not in self.in_memory_store: + self.in_memory_store[key] = {"prompt_tokens": 0, "completion_tokens": 0} + + self.in_memory_store[key]["prompt_tokens"] += prompt_tokens + self.in_memory_store[key]["completion_tokens"] += completion_tokens + self.in_memory_store[key]["updated_at"] = time.time() + + def cleanup_expired(self): + """Clean up expired entries (for in-memory store)""" + if not self.redis_client: + current_time = time.time() + expired_keys = [ + key for key, data in self.in_memory_store.items() + if current_time - data.get("updated_at", 0) > 60 + ] + for key in expired_keys: + del self.in_memory_store[key] + + +# Global token rate limiter instance +token_rate_limiter = TokenRateLimiter() \ No newline at end of file diff --git a/backend/modules/chatbot/main.py b/backend/modules/chatbot/main.py index 6f42f09..5ab62c7 100644 --- a/backend/modules/chatbot/main.py +++ b/backend/modules/chatbot/main.py @@ -265,7 +265,6 @@ class ChatbotModule(BaseModule): async def chat_completion(self, request: ChatRequest, user_id: str, db: Session) -> ChatResponse: """Generate chat completion response""" - logger.info("=== CHAT 
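A call site for the limiter defined above would reserve the organization-wide token budget before dispatching to the provider. A hedged sketch — guarded_completion, dispatch and RateLimitError are illustrative names, not part of this patch; note the key is org-wide, so it does not include the caller:

from app.services.llm.token_rate_limiter import token_rate_limiter


class RateLimitError(Exception):
    """Illustrative error type carrying the X-TokenLimit-* headers."""
    def __init__(self, message: str, headers: dict):
        super().__init__(message)
        self.headers = headers


async def guarded_completion(dispatch, request, estimated_prompt_tokens: int):
    """Reserve org-wide token budget, then hand off to the provider call."""
    allowed, headers = await token_rate_limiter.check_token_limits(
        provider="privatemode",
        prompt_tokens=estimated_prompt_tokens,      # e.g. counted with tiktoken up front
        completion_tokens=request.max_tokens or 0,  # worst-case completion budget
    )
    if not allowed:
        # Propagate the X-TokenLimit-* headers so clients can back off intelligently
        raise RateLimitError("organization token budget exhausted", headers)
    return await dispatch(request)
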
COMPLETION METHOD CALLED ===") # Get chatbot configuration from database db_chatbot = db.query(DBChatbotInstance).filter(DBChatbotInstance.id == request.chatbot_id).first() @@ -364,11 +363,10 @@ class ChatbotModule(BaseModule): metadata={"error": str(e), "fallback": True} ) - async def _generate_response(self, message: str, db_messages: List[DBMessage], + async def _generate_response(self, message: str, db_messages: List[DBMessage], config: ChatbotConfig, context: Optional[Dict] = None, db: Session = None) -> tuple[str, Optional[List]]: """Generate response using LLM with optional RAG""" - logger.info("=== _generate_response METHOD CALLED ===") - + # Lazy load dependencies if not available await self._ensure_dependencies() @@ -397,8 +395,8 @@ class ChatbotModule(BaseModule): for i, result in enumerate(rag_results)] # Build full RAG context from all results - rag_context = "\\n\\nRelevant information from knowledge base:\\n" + "\\n\\n".join([ - f"[Document {i+1}]:\\n{result.document.content}" for i, result in enumerate(rag_results) + rag_context = "\n\nRelevant information from knowledge base:\n" + "\n\n".join([ + f"[Document {i+1}]:\n{result.document.content}" for i, result in enumerate(rag_results) ]) # Detailed RAG logging - ALWAYS log for debugging @@ -407,14 +405,14 @@ class ChatbotModule(BaseModule): logger.info(f"Collection: {qdrant_collection_name}") logger.info(f"Number of results: {len(rag_results)}") for i, result in enumerate(rag_results): - logger.info(f"\\n--- RAG Result {i+1} ---") + logger.info(f"\n--- RAG Result {i+1} ---") logger.info(f"Score: {getattr(result, 'score', 'N/A')}") logger.info(f"Document ID: {getattr(result.document, 'id', 'N/A')}") logger.info(f"Full Content ({len(result.document.content)} chars):") logger.info(f"{result.document.content}") if hasattr(result.document, 'metadata'): logger.info(f"Metadata: {result.document.metadata}") - logger.info(f"\\n=== RAG CONTEXT BEING ADDED TO PROMPT ({len(rag_context)} chars) ===") + logger.info(f"\n=== RAG CONTEXT BEING ADDED TO PROMPT ({len(rag_context)} chars) ===") logger.info(rag_context) logger.info("=== END RAG SEARCH RESULTS ===") else: @@ -428,11 +426,6 @@ class ChatbotModule(BaseModule): logger.warning(f"RAG search traceback: {traceback.format_exc()}") # Build conversation context (includes the current message from db_messages) - logger.info(f"=== CRITICAL DEBUG ===") - logger.info(f"rag_context length: {len(rag_context)}") - logger.info(f"rag_context empty: {not rag_context}") - logger.info(f"rag_context preview: {rag_context[:200] if rag_context else 'EMPTY'}") - logger.info(f"=== END CRITICAL DEBUG ===") messages = self._build_conversation_messages(db_messages, config, rag_context, context) # Note: Current user message is already included in db_messages from the query @@ -452,9 +445,9 @@ class ChatbotModule(BaseModule): if config.use_rag and rag_context: logger.info(f"RAG context added: {len(rag_context)} characters") logger.info(f"RAG sources: {len(sources) if sources else 0} documents") - logger.info("\\n=== COMPLETE MESSAGES SENT TO LLM ===") + logger.info("\n=== COMPLETE MESSAGES SENT TO LLM ===") for i, msg in enumerate(messages): - logger.info(f"\\n--- Message {i+1} ---") + logger.info(f"\n--- Message {i+1} ---") logger.info(f"Role: {msg['role']}") logger.info(f"Content ({len(msg['content'])} chars):") # Truncate long content for logging (full RAG context can be very long) @@ -518,38 +511,34 @@ class ChatbotModule(BaseModule): # Return fallback if available return "I'm currently unable to process 
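The quoting changes above ("\\n" to "\n") are behavioral, not cosmetic: in a normal (non-raw) f-string, "\\n" is an escaped backslash followed by the letter n, so the RAG context was previously glued onto the prompt as a single line containing literal \n markers. A two-line demonstration:

rag_context = "ctx"
print(f"\\n\\nRelevant information from knowledge base:\\n{rag_context}")
# -> \n\nRelevant information from knowledge base:\nctx   (one line, literal backslashes)
print(f"\n\nRelevant information from knowledge base:\n{rag_context}")
# -> a real blank line, then the header, then ctx on its own line
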
your request. Please try again later.", None - def _build_conversation_messages(self, db_messages: List[DBMessage], config: ChatbotConfig, + def _build_conversation_messages(self, db_messages: List[DBMessage], config: ChatbotConfig, rag_context: str = "", context: Optional[Dict] = None) -> List[Dict]: """Build messages array for LLM completion""" - + messages = [] - logger.info(f"DEBUG: _build_conversation_messages called. rag_context length: {len(rag_context)}") - - # System prompt - keep it clean without RAG context + + # System prompt system_prompt = config.system_prompt + if rag_context: + # Add explicit instruction to use RAG context + system_prompt += "\n\nIMPORTANT: Use the following information from the knowledge base to answer the user's question. " \ + "This information is directly relevant to their query and should be your primary source:\n" + rag_context if context and context.get('additional_instructions'): system_prompt += f"\n\nAdditional instructions: {context['additional_instructions']}" - + messages.append({"role": "system", "content": system_prompt}) - + logger.info(f"Building messages from {len(db_messages)} database messages") - + # Conversation history (messages are already limited by memory_length in the query) # Reverse to get chronological order # Include ALL messages - the current user message is needed for the LLM to respond! for idx, msg in enumerate(reversed(db_messages)): logger.info(f"Processing message {idx}: role={msg.role}, content_preview={msg.content[:50] if msg.content else 'None'}...") if msg.role in ["user", "assistant"]: - # For user messages, prepend RAG context if available - content = msg.content - if msg.role == "user" and rag_context and idx == 0: - # Add RAG context to the current user message (first in reversed order) - content = f"Relevant information from knowledge base:\n{rag_context}\n\nQuestion: {msg.content}" - logger.info("Added RAG context to user message") - messages.append({ "role": msg.role, - "content": content + "content": msg.content }) logger.info(f"Added message with role {msg.role} to LLM messages") else: @@ -690,10 +679,9 @@ class ChatbotModule(BaseModule): return router # API Compatibility Methods - async def chat(self, chatbot_config: Dict[str, Any], message: str, + async def chat(self, chatbot_config: Dict[str, Any], message: str, conversation_history: List = None, user_id: str = "anonymous") -> Dict[str, Any]: """Chat method for API compatibility""" - logger.info("=== CHAT METHOD (API COMPATIBILITY) CALLED ===") logger.info(f"Chat method called with message: {message[:50]}... 
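After this change the retrieved context travels once, in the system turn, instead of being prepended to the user's message. Roughly, the array handed to the LLM now looks like the following (contents illustrative, not taken from a real run):

messages = [
    {
        "role": "system",
        "content": (
            "You are a helpful assistant."                                    # config.system_prompt
            "\n\nIMPORTANT: Use the following information from the knowledge"
            " base to answer the user's question. ..."
            "\n\nRelevant information from knowledge base:\n[Document 1]:\n..."
        ),
    },
    # up to memory_length prior turns, oldest first
    {"role": "user", "content": "an earlier question"},
    {"role": "assistant", "content": "an earlier answer"},
    {"role": "user", "content": "the current question"},  # already present in db_messages
]
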
by user: {user_id}") # Lazy load dependencies @@ -723,20 +711,21 @@ class ChatbotModule(BaseModule): fallback_responses=chatbot_config.get("fallback_responses", []) ) - # For API compatibility, create a temporary DBMessage for the current message - # so RAG context can be properly added - from app.models.chatbot import ChatbotMessage as DBMessage + # Generate response using internal method + # Create a temporary message object for the current user message + temp_messages = [ + DBMessage( + id=0, + conversation_id=0, + role="user", + content=message, + timestamp=datetime.utcnow(), + metadata={} + ) + ] - # Create a temporary user message with the current message - temp_user_message = DBMessage( - conversation_id="temp_conversation", - role=MessageRole.USER.value, - content=message - ) - - # Generate response using internal method with the current message included response_content, sources = await self._generate_response( - message, [temp_user_message], config, None, db + message, temp_messages, config, None, db ) return { diff --git a/backend/modules/rag/main.py b/backend/modules/rag/main.py index b6c90b7..7d75fbd 100644 --- a/backend/modules/rag/main.py +++ b/backend/modules/rag/main.py @@ -53,14 +53,13 @@ except ImportError: PYTHON_DOCX_AVAILABLE = False from qdrant_client import QdrantClient -from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue +from qdrant_client.models import Distance, VectorParams, PointStruct, ScoredPoint, Filter, FieldCondition, MatchValue from qdrant_client.http import models import tiktoken from app.core.config import settings from app.core.logging import log_module_event from app.services.base_module import BaseModule, Permission -from app.services.enhanced_embedding_service import enhanced_embedding_service @dataclass @@ -134,6 +133,19 @@ class RAGModule(BaseModule): self.embedding_model = None self.embedding_service = None self.tokenizer = None + + # Set improved default configuration + self.config = { + "chunk_size": 300, # Reduced from 400 for better precision + "chunk_overlap": 50, # Added overlap for context preservation + "max_results": 10, + "score_threshold": 0.3, # Increased from 0.0 to filter low-quality results + "enable_hybrid": True, # Enable hybrid search (vector + BM25) + "hybrid_weights": {"vector": 0.7, "bm25": 0.3} # Weight for hybrid scoring + } + # Update with any provided config + if config: + self.config.update(config) # Content processing components self.nlp_model = None @@ -640,19 +652,33 @@ class RAGModule(BaseModule): return embeddings def _chunk_text(self, text: str, chunk_size: int = None) -> List[str]: - """Split text into chunks""" - chunk_size = chunk_size or self.config.get("chunk_size", 400) - + """Split text into overlapping chunks for better context preservation""" + chunk_size = chunk_size or self.config.get("chunk_size", 300) + chunk_overlap = self.config.get("chunk_overlap", 50) + # Tokenize text tokens = self.tokenizer.encode(text) - - # Split into chunks + + # Split into chunks with overlap chunks = [] - for i in range(0, len(tokens), chunk_size): - chunk_tokens = tokens[i:i + chunk_size] + start_idx = 0 + + while start_idx < len(tokens): + end_idx = min(start_idx + chunk_size, len(tokens)) + chunk_tokens = tokens[start_idx:end_idx] chunk_text = self.tokenizer.decode(chunk_tokens) - chunks.append(chunk_text) - + + # Only add non-empty chunks + if chunk_text.strip(): + chunks.append(chunk_text) + + # Move to next chunk with overlap + start_idx = end_idx - chunk_overlap + + 
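The same windowing idea as a self-contained sketch over token ids (tiktoken-style): each window re-reads the last `overlap` tokens of its predecessor, and iteration has to stop explicitly once a window reaches the end of the text, because `end - overlap` no longer advances past that point — see the terminating guard just below in the diff as well:

def chunk_tokens(tokens: list, size: int = 300, overlap: int = 50) -> list:
    """Sliding token window with overlap; stops once the end is reached."""
    chunks, start = [], 0
    while start < len(tokens):
        end = min(start + size, len(tokens))
        chunks.append(tokens[start:end])
        if end == len(tokens):                 # final window emitted; stop here
            break
        start = max(end - overlap, start + 1)  # guarantee forward progress
    return chunks

print(len(chunk_tokens(list(range(400)))))     # 2 windows: tokens 0-300 and 250-400
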
+            # Stop at the text end or when overlap prevents forward progress;
+            if end_idx >= len(tokens) or chunk_overlap >= chunk_size:
+                break
+        
         return chunks
     
     async def _process_text(self, content: bytes, filename: str) -> str:
@@ -1126,17 +1152,9 @@ class RAGModule(BaseModule):
             # Chunk the document
             chunks = self._chunk_text(content)
             
-            # Generate embeddings with enhanced rate limiting handling
-            embeddings, success = await enhanced_embedding_service.get_embeddings_with_retry(chunks)
-            
-            # Log if fallback embeddings were used
-            if not success:
-                logger.warning(f"Used fallback embeddings for document {doc_id} - search quality may be degraded")
-                log_module_event("rag", "fallback_embeddings_used", {
-                    "document_id": doc_id,
-                    "content_preview": content[:100] + "..." if len(content) > 100 else content
-                })
-            
+            # Generate embeddings for all chunks in batch (more efficient)
+            embeddings = await self._generate_embeddings(chunks)
+            
             # Create document points
             points = []
             for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
@@ -1197,9 +1215,9 @@ class RAGModule(BaseModule):
             # Chunk the document
             chunks = self._chunk_text(processed_doc.content)
             
-            # Generate embeddings with enhanced rate limiting handling
-            embeddings, success = await enhanced_embedding_service.get_embeddings_with_retry(chunks)
-            
-            # Log if fallback embeddings were used
-            if not success:
-                logger.warning(f"Used fallback embeddings for document {processed_doc.id} - search quality may be degraded")
-                log_module_event("rag", "fallback_embeddings_used", {
-                    "document_id": processed_doc.id,
-                    "filename": processed_doc.original_filename
-                })
-            
+            # Generate embeddings for all chunks in batch (more efficient)
+            embeddings = await self._generate_embeddings(chunks)
+            
             # Create document points with enhanced metadata
             points = []
             for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
@@ -1277,6 +1287,154 @@ class RAGModule(BaseModule):
         except Exception:
             return False
     
+    async def _hybrid_search(self, collection_name: str, query: str, query_vector: List[float],
+                             query_filter: Optional[Filter], limit: int, score_threshold: float) -> List[Any]:
+        """Perform hybrid search combining vector similarity and BM25 scoring"""
+        
+        # Preprocess query for BM25
+        query_terms = self._preprocess_text_for_bm25(query)
+        
+        # Get all documents from the collection (for BM25 scoring)
+        # Note: In production, you'd want to optimize this with a proper BM25 index
+        scroll_filter = query_filter or Filter()
+        all_points = []
+        
+        # Use scroll to get all points
+        offset = None
+        batch_size = 100
+        while True:
+            search_result = self.qdrant_client.scroll(
+                collection_name=collection_name,
+                scroll_filter=scroll_filter,
+                limit=batch_size,
+                offset=offset,
+                with_payload=True,
+                with_vectors=False
+            )
+            
+            points = search_result[0]
+            all_points.extend(points)
+            
+            if len(points) < batch_size:
+                break
+            
+            offset = points[-1].id
+        
+        # Calculate BM25 scores for each document
+        bm25_scores = {}
+        for point in all_points:
+            doc_id = point.payload.get("document_id", "")
+            content = point.payload.get("content", "")
+            
+            # Calculate BM25 score
+            bm25_score = self._calculate_bm25_score(query_terms, content)
+            bm25_scores[doc_id] = bm25_score
+        
+        # Perform vector search
+        vector_results = self.qdrant_client.search(
+            collection_name=collection_name,
+            query_vector=query_vector,
+            query_filter=query_filter,
+            limit=limit * 2,  # Get more results for re-ranking
+            score_threshold=score_threshold / 2  # Lower threshold for initial search
+        )
+        
+        # Combine scores
+        hybrid_weights = self.config.get("hybrid_weights", 
{"vector": 0.7, "bm25": 0.3}) + vector_weight = hybrid_weights.get("vector", 0.7) + bm25_weight = hybrid_weights.get("bm25", 0.3) + + # Create hybrid results + hybrid_results = [] + for result in vector_results: + doc_id = result.payload.get("document_id", "") + vector_score = result.score + bm25_score = bm25_scores.get(doc_id, 0.0) + + # Normalize scores (simple min-max normalization) + vector_norm = (vector_score - score_threshold) / (1.0 - score_threshold) if vector_score > score_threshold else 0 + bm25_norm = min(bm25_score, 1.0) # BM25 scores are typically 0-1 + + # Calculate hybrid score + hybrid_score = (vector_weight * vector_norm) + (bm25_weight * bm25_norm) + + # Create new point with hybrid score + hybrid_point = ScoredPoint( + id=result.id, + payload=result.payload, + score=hybrid_score, + vector=result.vector, + shard_key=None, + order_value=None + ) + hybrid_results.append(hybrid_point) + + # Sort by hybrid score and apply final threshold + hybrid_results.sort(key=lambda x: x.score, reverse=True) + final_results = [r for r in hybrid_results if r.score >= score_threshold][:limit] + + logger.info(f"Hybrid search: {len(vector_results)} vector results, {len(final_results)} final results") + return final_results + + def _preprocess_text_for_bm25(self, text: str) -> List[str]: + """Preprocess text for BM25 scoring""" + if not NLTK_AVAILABLE: + return text.lower().split() + + try: + # Tokenize + tokens = word_tokenize(text.lower()) + + # Remove stopwords and non-alphabetic tokens + stop_words = set(stopwords.words('english')) + filtered_tokens = [ + token for token in tokens + if token.isalpha() and token not in stop_words and len(token) > 2 + ] + + return filtered_tokens + except: + # Fallback to simple splitting + return text.lower().split() + + def _calculate_bm25_score(self, query_terms: List[str], document: str) -> float: + """Calculate BM25 score for a document against query terms""" + if not query_terms: + return 0.0 + + # Preprocess document + doc_terms = self._preprocess_text_for_bm25(document) + if not doc_terms: + return 0.0 + + # Calculate term frequencies + doc_len = len(doc_terms) + avg_doc_len = 300 # Average document length (configurable) + + # BM25 parameters + k1 = 1.2 # Controls term frequency saturation + b = 0.75 # Controls document length normalization + + score = 0.0 + + # Calculate IDF for each query term + for term in set(query_terms): + # Term frequency in document + tf = doc_terms.count(term) + + # Simple IDF (log(N/n) + 1) + # In production, you'd use the actual document frequency + idf = 2.0 # Simplified IDF + + # BM25 formula + numerator = tf * (k1 + 1) + denominator = tf + k1 * (1 - b + b * (doc_len / avg_doc_len)) + + score += idf * (numerator / denominator) + + # Normalize score to 0-1 range + return min(score / 10.0, 1.0) # Simple normalization + async def search_documents(self, query: str, max_results: int = None, filters: Dict[str, Any] = None, collection_name: str = None) -> List[SearchResult]: """Search for relevant documents""" if not self.enabled: @@ -1314,14 +1472,29 @@ class RAGModule(BaseModule): logger.info(f"Query embedding (first 10 values): {query_embedding[:10] if query_embedding else 'None'}") logger.info(f"Embedding service available: {self.embedding_service is not None}") - # Search in Qdrant - search_results = self.qdrant_client.search( - collection_name=collection_name, - query_vector=query_embedding, - query_filter=search_filter, - limit=max_results, - score_threshold=0.0 # Lowered from 0.5 to see all results including low scores 
- ) + # Check if hybrid search is enabled + enable_hybrid = self.config.get("enable_hybrid", False) + score_threshold = self.config.get("score_threshold", 0.3) + + if enable_hybrid and NLTK_AVAILABLE: + # Perform hybrid search (vector + BM25) + search_results = await self._hybrid_search( + collection_name=collection_name, + query=query, + query_vector=query_embedding, + query_filter=search_filter, + limit=max_results, + score_threshold=score_threshold + ) + else: + # Pure vector search with improved threshold + search_results = self.qdrant_client.search( + collection_name=collection_name, + query_vector=query_embedding, + query_filter=search_filter, + limit=max_results, + score_threshold=score_threshold + ) logger.info(f"Raw search results count: {len(search_results)}") From 361c016da4467b2b1bda424e1fe1523c499f072a Mon Sep 17 00:00:00 2001 From: Aljaz Ceru Date: Mon, 22 Sep 2025 11:42:40 +0200 Subject: [PATCH 05/13] chatbot rag testing --- backend/app/api/v1/chatbot.py | 47 ++++++++++++++++++++------------- backend/modules/chatbot/main.py | 4 ++- docker-compose.yml | 11 +++++--- 3 files changed, 39 insertions(+), 23 deletions(-) diff --git a/backend/app/api/v1/chatbot.py b/backend/app/api/v1/chatbot.py index 20f03dc..f738b05 100644 --- a/backend/app/api/v1/chatbot.py +++ b/backend/app/api/v1/chatbot.py @@ -32,12 +32,28 @@ class ChatbotCreateRequest(BaseModel): use_rag: bool = False rag_collection: Optional[str] = None rag_top_k: int = 5 + rag_score_threshold: float = 0.02 # Lowered from default 0.3 to allow more results temperature: float = 0.7 max_tokens: int = 1000 memory_length: int = 10 fallback_responses: List[str] = [] +class ChatbotUpdateRequest(BaseModel): + name: Optional[str] = None + chatbot_type: Optional[str] = None + model: Optional[str] = None + system_prompt: Optional[str] = None + use_rag: Optional[bool] = None + rag_collection: Optional[str] = None + rag_top_k: Optional[int] = None + rag_score_threshold: Optional[float] = None + temperature: Optional[float] = None + max_tokens: Optional[int] = None + memory_length: Optional[int] = None + fallback_responses: Optional[List[str]] = None + + class ChatRequest(BaseModel): message: str conversation_id: Optional[str] = None @@ -190,7 +206,7 @@ async def create_chatbot( @router.put("/update/{chatbot_id}") async def update_chatbot( chatbot_id: str, - request: ChatbotCreateRequest, + request: ChatbotUpdateRequest, current_user: User = Depends(get_current_user), db: AsyncSession = Depends(get_db) ): @@ -214,28 +230,23 @@ async def update_chatbot( if not chatbot: raise HTTPException(status_code=404, detail="Chatbot not found or access denied") - # Update chatbot configuration - config = { - "name": request.name, - "chatbot_type": request.chatbot_type, - "model": request.model, - "system_prompt": request.system_prompt, - "use_rag": request.use_rag, - "rag_collection": request.rag_collection, - "rag_top_k": request.rag_top_k, - "temperature": request.temperature, - "max_tokens": request.max_tokens, - "memory_length": request.memory_length, - "fallback_responses": request.fallback_responses - } - + # Get existing config + existing_config = chatbot.config.copy() if chatbot.config else {} + + # Update only the fields that are provided in the request + update_data = request.dict(exclude_unset=True) + + # Merge with existing config, preserving unset values + for key, value in update_data.items(): + existing_config[key] = value + # Update the chatbot await db.execute( update(ChatbotInstance) .where(ChatbotInstance.id == chatbot_id) .values( 
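The switch to ChatbotUpdateRequest plus dict(exclude_unset=True) is what makes the endpoint a true partial update: only fields the client actually sent reach the merge, so omitted settings keep their stored values. In miniature (pydantic v1 semantics, matching the request.dict() call above):

from typing import Optional
from pydantic import BaseModel


class ChatbotUpdateRequest(BaseModel):
    name: Optional[str] = None
    rag_top_k: Optional[int] = None


stored = {"name": "Support bot", "rag_top_k": 5, "temperature": 0.7}
patch = ChatbotUpdateRequest(rag_top_k=8)          # client sent only rag_top_k
stored.update(patch.dict(exclude_unset=True))      # merges {'rag_top_k': 8} and nothing else
assert stored == {"name": "Support bot", "rag_top_k": 8, "temperature": 0.7}
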
- name=request.name, - config=config, + name=existing_config.get("name", chatbot.name), + config=existing_config, updated_at=datetime.utcnow() ) ) diff --git a/backend/modules/chatbot/main.py b/backend/modules/chatbot/main.py index 5ab62c7..e378414 100644 --- a/backend/modules/chatbot/main.py +++ b/backend/modules/chatbot/main.py @@ -69,6 +69,7 @@ class ChatbotConfig: memory_length: int = 10 # Number of previous messages to remember use_rag: bool = False rag_top_k: int = 5 + rag_score_threshold: float = 0.02 # Lowered from default 0.3 to allow more results fallback_responses: List[str] = None def __post_init__(self): @@ -386,7 +387,8 @@ class ChatbotModule(BaseModule): rag_results = await self.rag_module.search_documents( query=message, max_results=config.rag_top_k, - collection_name=qdrant_collection_name + collection_name=qdrant_collection_name, + score_threshold=config.rag_score_threshold ) if rag_results: diff --git a/docker-compose.yml b/docker-compose.yml index 8210ba9..badc278 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,16 +15,18 @@ services: # Database migration service - runs once to apply migrations enclava-migrate: - build: + build: context: ./backend dockerfile: Dockerfile environment: - DATABASE_URL=postgresql://enclava_user:enclava_pass@enclava-postgres:5432/enclava_db + - JWT_SECRET=${JWT_SECRET:-your-jwt-secret-here} depends_on: - enclava-postgres command: ["/usr/local/bin/migrate.sh"] volumes: - ./backend:/app + - ./.env:/app/.env networks: - enclava-net restart: "no" # Run once and exit @@ -63,9 +65,9 @@ services: enclava-frontend: image: node:18-alpine working_dir: /app - command: sh -c "npm install && npm run dev" + command: sh -c "npm ci --ignore-scripts && npm run dev" environment: - # Required base URL (derives APP/API/WS URLs) + # Required base URL (derives APP/API/WS URLs) - BASE_URL=${BASE_URL} - NEXT_PUBLIC_BASE_URL=${BASE_URL} # Docker internal ports @@ -79,7 +81,7 @@ services: - "3002:3000" # Direct frontend access for development volumes: - ./frontend:/app - - /app/node_modules + - enclava-frontend-node-modules:/app/node_modules networks: - enclava-net restart: unless-stopped @@ -148,6 +150,7 @@ volumes: enclava-postgres-data: enclava-redis-data: enclava-qdrant-data: + enclava-frontend-node-modules: # enclava-ollama-data: networks: From a8fe7d6d29bef10f74e9a201c64224afc9e39d0b Mon Sep 17 00:00:00 2001 From: Aljaz Ceru Date: Mon, 22 Sep 2025 11:47:09 +0200 Subject: [PATCH 06/13] Backup before security middleware removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- .env | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++ backend/.env | 0 2 files changed, 152 insertions(+) create mode 100644 .env create mode 100644 backend/.env diff --git a/.env b/.env new file mode 100644 index 0000000..9e074ae --- /dev/null +++ b/.env @@ -0,0 +1,152 @@ +# =================================== +# ENCLAVA MINIMAL CONFIGURATION +# =================================== +# Only essential environment variables that CANNOT have defaults +# Other settings should be configurable through the app UI + +# =================================== +# INFRASTRUCTURE (Required) +# =================================== +DATABASE_URL=postgresql://enclava_user:enclava_pass@enclava-postgres:5432/enclava_db +REDIS_URL=redis://enclava-redis:6379 + +# =================================== +# SECURITY CRITICAL (Required) +# =================================== 
+JWT_SECRET=your-super-secret-jwt-key-here-change-in-production +PRIVATEMODE_API_KEY=dfaea90e-df15-48d4-94ff-5ee243b846bb + +# Admin user (created on first startup only) +ADMIN_EMAIL=admin@example.com +ADMIN_PASSWORD=admin123 +API_RATE_LIMITING_ENABLED=false +# =================================== +# ADDITIONAL SECURITY SETTINGS (Optional but recommended) +# =================================== +# JWT Algorithm (default: HS256) +# JWT_ALGORITHM=HS256 + +# Token expiration times (in minutes) +# ACCESS_TOKEN_EXPIRE_MINUTES=30 +# REFRESH_TOKEN_EXPIRE_MINUTES=10080 +# SESSION_EXPIRE_MINUTES=1440 + +# API Key prefix (default: en_) +# API_KEY_PREFIX=en_ + +# Security thresholds (0.0-1.0) +# API_SECURITY_RISK_THRESHOLD=0.8 +# API_SECURITY_WARNING_THRESHOLD=0.6 +# API_SECURITY_ANOMALY_THRESHOLD=0.7 + +# IP security (comma-separated for multiple IPs) +# API_BLOCKED_IPS= +# API_ALLOWED_IPS= + +# =================================== +# APPLICATION BASE URL (Required - derives all URLs and CORS) +# =================================== +BASE_URL=localhost +# Frontend derives: APP_URL=http://localhost, API_URL=http://localhost, WS_URL=ws://localhost +# Backend derives: CORS_ORIGINS=["http://localhost"] + +# =================================== +# DOCKER NETWORKING (Required for containers) +# =================================== +BACKEND_INTERNAL_PORT=8000 +FRONTEND_INTERNAL_PORT=3000 +# Hosts are fixed: enclava-backend, enclava-frontend +# Upstreams derive: enclava-backend:8000, enclava-frontend:3000 + +# =================================== +# QDRANT (Required for RAG) +# =================================== +QDRANT_HOST=enclava-qdrant +QDRANT_PORT=6333 +QDRANT_URL=http://enclava-qdrant:6333 + +# =================================== +# OPTIONAL PRIVATEMODE SETTINGS (Have defaults) +# =================================== +# PRIVATEMODE_CACHE_MODE=none # Optional: defaults to 'none' +# PRIVATEMODE_CACHE_SALT= # Optional: defaults to empty + +# =================================== +# OPTIONAL CONFIGURATION (All have sensible defaults) +# =================================== + +# Application Settings +# APP_NAME=Enclava +# APP_DEBUG=false +# APP_LOG_LEVEL=INFO +# APP_HOST=0.0.0.0 +# APP_PORT=8000 + +# Security Features +API_SECURITY_ENABLED=false +# API_THREAT_DETECTION_ENABLED=true +# API_IP_REPUTATION_ENABLED=true +# API_ANOMALY_DETECTION_ENABLED=true +API_RATE_LIMITING_ENABLED=false +# API_SECURITY_HEADERS_ENABLED=true + +# Content Security Policy +# API_CSP_HEADER=default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline' + +# Rate Limiting (requests per minute/hour) +# API_RATE_LIMIT_AUTHENTICATED_PER_MINUTE=300 +# API_RATE_LIMIT_AUTHENTICATED_PER_HOUR=5000 +# API_RATE_LIMIT_API_KEY_PER_MINUTE=1000 +# API_RATE_LIMIT_API_KEY_PER_HOUR=20000 +# API_RATE_LIMIT_PREMIUM_PER_MINUTE=5000 +# API_RATE_LIMIT_PREMIUM_PER_HOUR=100000 + +# Request Size Limits (in bytes) +# API_MAX_REQUEST_BODY_SIZE=10485760 # 10MB +# API_MAX_REQUEST_BODY_SIZE_PREMIUM=52428800 # 50MB +# MAX_UPLOAD_SIZE=10485760 # 10MB + +# Monitoring +# PROMETHEUS_ENABLED=true +# PROMETHEUS_PORT=9090 + +# Logging +# LOG_FORMAT=json +# LOG_LEVEL=INFO +# LOG_LLM_PROMPTS=false + +# Module Configuration +# MODULES_CONFIG_PATH=config/modules.yaml + +# Plugin Configuration +# PLUGINS_DIR=/plugins +# PLUGINS_CONFIG_PATH=config/plugins.yaml +# PLUGIN_REPOSITORY_URL=https://plugins.enclava.com +# PLUGIN_ENCRYPTION_KEY= + +# =================================== +# RAG EMBEDDING ENHANCED SETTINGS +# =================================== +# 
Enhanced embedding service configuration +RAG_EMBEDDING_MAX_REQUESTS_PER_MINUTE=60 +RAG_EMBEDDING_BATCH_SIZE=5 +RAG_EMBEDDING_RETRY_COUNT=3 +RAG_EMBEDDING_RETRY_DELAYS=1,2,4,8,16 +RAG_EMBEDDING_DELAY_BETWEEN_BATCHES=0.5 + +# Fallback embedding behavior +RAG_ALLOW_FALLBACK_EMBEDDINGS=true +RAG_WARN_ON_FALLBACK=true + +# Processing timeouts (in seconds) +RAG_DOCUMENT_PROCESSING_TIMEOUT=300 +RAG_EMBEDDING_GENERATION_TIMEOUT=120 +RAG_INDEXING_TIMEOUT=120 + +# =================================== +# SUMMARY +# =================================== +# Required: DATABASE_URL, REDIS_URL, JWT_SECRET, ADMIN_EMAIL, ADMIN_PASSWORD, BASE_URL +# Recommended: PRIVATEMODE_API_KEY, QDRANT_HOST, QDRANT_PORT +# Optional: All other settings have secure defaults +# =================================== diff --git a/backend/.env b/backend/.env new file mode 100644 index 0000000..e69de29 From 95d5b3a443cad87b23c3cd1da93387f0c7e20d3a Mon Sep 17 00:00:00 2001 From: Aljaz Ceru Date: Mon, 22 Sep 2025 11:48:11 +0200 Subject: [PATCH 07/13] Remove security and rate limiting middleware from backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Removed security middleware setup from main.py - Disabled security middleware functionality - Removed rate limiting middleware setup 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- backend/app/main.py | 8 +--- backend/app/middleware/security.py | 74 +++--------------------------- 2 files changed, 9 insertions(+), 73 deletions(-) diff --git a/backend/app/main.py b/backend/app/main.py index 40d51a3..8bea827 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -135,13 +135,9 @@ app.add_middleware( # Add analytics middleware setup_analytics_middleware(app) -# Add security middleware -from app.middleware.security import setup_security_middleware -setup_security_middleware(app, enabled=settings.API_SECURITY_ENABLED) +# Security middleware disabled - handled externally -# Add rate limiting middleware only for specific endpoints -from app.middleware.rate_limiting import RateLimitMiddleware -app.add_middleware(RateLimitMiddleware) +# Rate limiting middleware disabled - handled externally # Exception handlers diff --git a/backend/app/middleware/security.py b/backend/app/middleware/security.py index 57d2ebe..c7b7952 100644 --- a/backend/app/middleware/security.py +++ b/backend/app/middleware/security.py @@ -18,77 +18,17 @@ logger = get_logger(__name__) class SecurityMiddleware(BaseHTTPMiddleware): - """Security middleware for threat detection and request filtering""" - + """Security middleware for threat detection and request filtering - DISABLED""" + def __init__(self, app, enabled: bool = True): super().__init__(app) - self.enabled = enabled and settings.API_SECURITY_ENABLED - logger.info(f"SecurityMiddleware initialized, enabled: {self.enabled}") + self.enabled = False # Force disable regardless of settings + logger.info("SecurityMiddleware initialized, enabled: False (DISABLED)") async def dispatch(self, request: Request, call_next: Callable) -> Response: - """Process request through security analysis""" - if not self.enabled: - # Security disabled, pass through - return await call_next(request) - - # Skip security analysis for certain endpoints - if self._should_skip_security(request): - response = await call_next(request) - return self._add_security_headers(response) - - # Simple authentication check - drop requests without valid auth - if not self._has_valid_auth(request): - return JSONResponse( - 
content={"error": "Authentication required", "message": "Valid API key or authentication token required"}, - status_code=401, - headers={"WWW-Authenticate": "Bearer"} - ) - - try: - # Get user context if available - user_context = getattr(request.state, 'user', None) - - # Perform security analysis - start_time = time.time() - analysis = await threat_detection_service.analyze_request(request, user_context) - analysis_time = time.time() - start_time - - # Store analysis in request state for later use - request.state.security_analysis = analysis - - # Log security events (only for significant threats to reduce false positive noise) - # Only log if: being blocked OR risk score above warning threshold (0.6) - if analysis.is_threat and (analysis.should_block or analysis.risk_score >= settings.API_SECURITY_WARNING_THRESHOLD): - await self._log_security_event(request, analysis) - - # Check if request should be blocked (excluding rate limiting) - if analysis.should_block and not analysis.rate_limit_exceeded: - threat_detection_service.stats['threats_blocked'] += 1 - logger.warning(f"Blocked request from {request.client.host if request.client else 'unknown'}: " - f"risk_score={analysis.risk_score:.3f}, threats={len(analysis.threats)}") - - # Return security block response - return self._create_block_response(analysis) - - # Log warnings for medium-risk requests - if analysis.risk_score >= settings.API_SECURITY_WARNING_THRESHOLD: - logger.warning(f"High-risk request detected from {request.client.host if request.client else 'unknown'}: " - f"risk_score={analysis.risk_score:.3f}, auth_level={analysis.auth_level.value}") - - # Continue with request processing - response = await call_next(request) - - # Add security headers and metrics - response = self._add_security_headers(response) - response = self._add_security_metrics(response, analysis, analysis_time) - - return response - - except Exception as e: - logger.error(f"Security middleware error: {e}") - # Continue with request on security middleware errors to avoid breaking the app - response = await call_next(request) - return self._add_security_headers(response) + """Process request through security analysis - DISABLED""" + # Security disabled, always pass through + return await call_next(request) def _should_skip_security(self, request: Request) -> bool: """Determine if security analysis should be skipped for this request""" From 354b43494dd1e3dcdfea32defec17418ca53bfa9 Mon Sep 17 00:00:00 2001 From: Aljaz Ceru Date: Mon, 22 Sep 2025 11:49:13 +0200 Subject: [PATCH 08/13] Add verification script for security middleware removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Script verifies: - Environment settings are correct - Python syntax is valid - Docker configuration exists - No security import errors 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- verify_security_removal.py | 166 +++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 verify_security_removal.py diff --git a/verify_security_removal.py b/verify_security_removal.py new file mode 100644 index 0000000..81c3e9b --- /dev/null +++ b/verify_security_removal.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +""" +Verification script for security middleware removal +""" +import subprocess +import sys +import time + +def run_command(cmd, cwd=None): + """Run a command and return the result""" + try: + result = subprocess.run( + cmd, + shell=True, + capture_output=True, + text=True, + 
cwd=cwd, + timeout=30 + ) + return result.returncode, result.stdout, result.stderr + except subprocess.TimeoutExpired: + return -1, "", "Command timed out" + +def test_backend_syntax(): + """Test if backend Python files have valid syntax""" + print("🔍 Testing backend Python syntax...") + + # Check main.py + code, stdout, stderr = run_command("python3 -m py_compile app/main.py", cwd="backend") + if code == 0: + print("✅ main.py syntax OK") + else: + print(f"❌ main.py syntax error: {stderr}") + return False + + # Check security middleware + code, stdout, stderr = run_command("python3 -m py_compile app/middleware/security.py", cwd="backend") + if code == 0: + print("✅ security.py syntax OK") + else: + print(f"❌ security.py syntax error: {stderr}") + return False + + return True + +def test_docker_build(): + """Test if Docker can build the backend service""" + print("\n🐳 Testing Docker backend build...") + + # Just check if the Dockerfile exists and is readable + try: + with open("backend/Dockerfile", "r") as f: + content = f.read() + if "FROM" in content and "python" in content: + print("✅ Dockerfile exists and looks valid") + return True + else: + print("❌ Dockerfile appears invalid") + return False + except FileNotFoundError: + print("❌ Dockerfile not found") + return False + +def test_env_settings(): + """Test if environment settings are correct""" + print("\n⚙️ Testing environment settings...") + + try: + with open(".env", "r") as f: + env_content = f.read() + + if "API_SECURITY_ENABLED=false" in env_content: + print("✅ Security is disabled in .env") + else: + print("❌ Security is not disabled in .env") + return False + + if "API_RATE_LIMITING_ENABLED=false" in env_content: + print("✅ Rate limiting is disabled in .env") + else: + print("❌ Rate limiting is not disabled in .env") + return False + + return True + except FileNotFoundError: + print("❌ .env file not found") + return False + +def test_imports(): + """Test if the main application can be imported without security dependencies""" + print("\n📦 Testing import dependencies...") + + # Create a minimal test script + test_script = """ +import sys +sys.path.insert(0, 'backend') + +try: + # Test if we can create the app without security middleware + from app.main import app + print("✅ App can be imported successfully") +except ImportError as e: + print(f"❌ Import error: {e}") + sys.exit(1) +except Exception as e: + print(f"❌ Other error: {e}") + sys.exit(1) +""" + + # Save test script + with open("test_import.py", "w") as f: + f.write(test_script) + + # Run test (will likely fail due to missing dependencies, but should not fail due to security imports) + code, stdout, stderr = run_command("python3 test_import.py") + + # Clean up + import os + os.remove("test_import.py") + + # We expect this to fail due to missing FastAPI, but not due to security imports + if "security" in stderr.lower() and "No module named" not in stderr: + print("❌ Security import errors detected") + return False + else: + print("✅ No security import errors detected") + return True + +def main(): + """Run all verification tests""" + print("🚀 Starting verification of security middleware removal...\n") + + tests = [ + ("Environment Settings", test_env_settings), + ("Python Syntax", test_backend_syntax), + ("Docker Configuration", test_docker_build), + ("Import Dependencies", test_imports), + ] + + results = [] + for test_name, test_func in tests: + print(f"\n--- {test_name} ---") + result = test_func() + results.append((test_name, result)) + + # Print summary + print("\n" 
+ "="*50) + print("📊 VERIFICATION SUMMARY") + print("="*50) + + for test_name, result in results: + status = "✅ PASS" if result else "❌ FAIL" + print(f"{test_name}: {status}") + + all_passed = all(result for _, result in results) + + if all_passed: + print("\n🎉 All tests passed! Security middleware has been successfully removed.") + else: + print("\n⚠️ Some tests failed. Please review the issues above.") + + return all_passed + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1) \ No newline at end of file From f8d127ff4211f79f11ceeb77d63b436e65dbe205 Mon Sep 17 00:00:00 2001 From: Aljaz Ceru Date: Tue, 23 Sep 2025 15:26:54 +0200 Subject: [PATCH 09/13] rag improvements --- .env | 2 +- .env.example | 12 +- backend/app/api/internal_v1/__init__.py | 7 +- backend/app/api/v1/__init__.py | 3 - backend/app/api/v1/llm.py | 3 +- backend/app/api/v1/rag.py | 308 +++++++- backend/app/api/v1/security.py | 251 ------ backend/app/api/v1/settings.py | 4 - backend/app/core/config.py | 44 +- backend/app/core/threat_detection.py | 744 ------------------ backend/app/main.py | 12 +- backend/app/middleware/rate_limiting.py | 371 --------- backend/app/middleware/security.py | 210 ----- backend/app/services/document_processor.py | 148 +++- backend/app/services/embedding_service.py | 24 +- .../services/enhanced_embedding_service.py | 28 +- backend/app/services/llm/config.py | 14 +- backend/app/services/llm/metrics.py | 2 - backend/app/services/llm/models.py | 1 - .../app/services/llm/providers/privatemode.py | 2 + backend/app/services/llm/security.py | 325 -------- backend/app/services/llm/service.py | 286 ++----- .../app/services/llm/token_rate_limiter.py | 153 ---- backend/app/services/rag_service.py | 9 +- backend/modules/rag/main.py | 131 ++- frontend/src/app/api/auth/login/route.ts | 2 +- frontend/src/app/rag/page.tsx | 25 +- .../src/components/rag/document-browser.tsx | 55 +- frontend/src/components/ui/navigation.tsx | 5 +- nginx/nginx.conf | 64 +- 30 files changed, 817 insertions(+), 2428 deletions(-) delete mode 100644 backend/app/api/v1/security.py delete mode 100644 backend/app/core/threat_detection.py delete mode 100644 backend/app/middleware/rate_limiting.py delete mode 100644 backend/app/middleware/security.py delete mode 100644 backend/app/services/llm/security.py delete mode 100644 backend/app/services/llm/token_rate_limiter.py diff --git a/.env b/.env index 9e074ae..b8d34af 100644 --- a/.env +++ b/.env @@ -46,7 +46,7 @@ API_RATE_LIMITING_ENABLED=false # =================================== # APPLICATION BASE URL (Required - derives all URLs and CORS) # =================================== -BASE_URL=localhost +BASE_URL=localhost:80 # Frontend derives: APP_URL=http://localhost, API_URL=http://localhost, WS_URL=ws://localhost # Backend derives: CORS_ORIGINS=["http://localhost"] diff --git a/.env.example b/.env.example index cf6d8f1..b9dd120 100644 --- a/.env.example +++ b/.env.example @@ -65,6 +65,16 @@ QDRANT_HOST=enclava-qdrant QDRANT_PORT=6333 QDRANT_URL=http://enclava-qdrant:6333 +# =================================== +# RAG EMBEDDING CONFIGURATION (Optional overrides) +# =================================== +# These control embedding throughput to avoid provider 429s. +# Defaults are conservative; uncomment to override. 
+# RAG_EMBEDDING_MAX_REQUESTS_PER_MINUTE=12 +# RAG_EMBEDDING_BATCH_SIZE=3 +# RAG_EMBEDDING_DELAY_BETWEEN_BATCHES=1.0 # seconds +# RAG_EMBEDDING_DELAY_PER_REQUEST=0.5 # seconds + # =================================== # OPTIONAL PRIVATEMODE SETTINGS (Have defaults) # =================================== @@ -130,4 +140,4 @@ QDRANT_URL=http://enclava-qdrant:6333 # Required: DATABASE_URL, REDIS_URL, JWT_SECRET, ADMIN_EMAIL, ADMIN_PASSWORD, BASE_URL # Recommended: PRIVATEMODE_API_KEY, QDRANT_HOST, QDRANT_PORT # Optional: All other settings have secure defaults -# =================================== \ No newline at end of file +# =================================== diff --git a/backend/app/api/internal_v1/__init__.py b/backend/app/api/internal_v1/__init__.py index 97e8510..29af4ab 100644 --- a/backend/app/api/internal_v1/__init__.py +++ b/backend/app/api/internal_v1/__init__.py @@ -12,8 +12,8 @@ from ..v1.audit import router as audit_router from ..v1.settings import router as settings_router from ..v1.analytics import router as analytics_router from ..v1.rag import router as rag_router +from ..rag_debug import router as rag_debug_router from ..v1.prompt_templates import router as prompt_templates_router -from ..v1.security import router as security_router from ..v1.plugin_registry import router as plugin_registry_router from ..v1.platform import router as platform_router from ..v1.llm_internal import router as llm_internal_router @@ -52,11 +52,12 @@ internal_api_router.include_router(analytics_router, prefix="/analytics", tags=[ # Include RAG routes (frontend RAG document management) internal_api_router.include_router(rag_router, prefix="/rag", tags=["internal-rag"]) +# Include RAG debug routes (for demo and debugging) +internal_api_router.include_router(rag_debug_router, prefix="/rag/debug", tags=["internal-rag-debug"]) + # Include prompt template routes (frontend prompt template management) internal_api_router.include_router(prompt_templates_router, prefix="/prompt-templates", tags=["internal-prompt-templates"]) -# Include security routes (frontend security settings) -internal_api_router.include_router(security_router, prefix="/security", tags=["internal-security"]) # Include plugin registry routes (frontend plugin management) internal_api_router.include_router(plugin_registry_router, prefix="/plugins", tags=["internal-plugins"]) diff --git a/backend/app/api/v1/__init__.py b/backend/app/api/v1/__init__.py index 6f66641..f9412e4 100644 --- a/backend/app/api/v1/__init__.py +++ b/backend/app/api/v1/__init__.py @@ -16,7 +16,6 @@ from .analytics import router as analytics_router from .rag import router as rag_router from .chatbot import router as chatbot_router from .prompt_templates import router as prompt_templates_router -from .security import router as security_router from .plugin_registry import router as plugin_registry_router # Create main API router @@ -61,8 +60,6 @@ api_router.include_router(chatbot_router, prefix="/chatbot", tags=["chatbot"]) # Include prompt template routes api_router.include_router(prompt_templates_router, prefix="/prompt-templates", tags=["prompt-templates"]) -# Include security routes -api_router.include_router(security_router, prefix="/security", tags=["security"]) # Include plugin registry routes diff --git a/backend/app/api/v1/llm.py b/backend/app/api/v1/llm.py index c30d797..5fdc20c 100644 --- a/backend/app/api/v1/llm.py +++ b/backend/app/api/v1/llm.py @@ -745,8 +745,7 @@ async def get_llm_metrics( "total_requests": metrics.total_requests, "successful_requests": 
metrics.successful_requests, "failed_requests": metrics.failed_requests, - "security_blocked_requests": metrics.security_blocked_requests, - "average_latency_ms": metrics.average_latency_ms, + "average_latency_ms": metrics.average_latency_ms, "average_risk_score": metrics.average_risk_score, "provider_metrics": metrics.provider_metrics, "last_updated": metrics.last_updated.isoformat() diff --git a/backend/app/api/v1/rag.py b/backend/app/api/v1/rag.py index b5d00cf..0e65c2f 100644 --- a/backend/app/api/v1/rag.py +++ b/backend/app/api/v1/rag.py @@ -3,12 +3,14 @@ RAG API Endpoints Provides REST API for RAG (Retrieval Augmented Generation) operations """ -from typing import List, Optional +from typing import List, Optional, Dict, Any from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, status from fastapi.responses import StreamingResponse from sqlalchemy.ext.asyncio import AsyncSession from pydantic import BaseModel import io +import asyncio +from datetime import datetime from app.db.database import get_db from app.core.security import get_current_user @@ -16,6 +18,9 @@ from app.models.user import User from app.services.rag_service import RAGService from app.utils.exceptions import APIException +# Import RAG module from module manager +from app.services.module_manager import module_manager + router = APIRouter(tags=["RAG"]) @@ -78,14 +83,25 @@ async def get_collections( db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user) ): - """Get all RAG collections from Qdrant (source of truth) with PostgreSQL metadata""" + """Get all RAG collections - live data directly from Qdrant (source of truth)""" try: - rag_service = RAGService(db) - collections_data = await rag_service.get_all_collections(skip=skip, limit=limit) + from app.services.qdrant_stats_service import qdrant_stats_service + + # Get live stats from Qdrant + stats_data = await qdrant_stats_service.get_collections_stats() + collections = stats_data.get("collections", []) + + # Apply pagination + start_idx = skip + end_idx = skip + limit + paginated_collections = collections[start_idx:end_idx] + return { "success": True, - "collections": collections_data, - "total": len(collections_data) + "collections": paginated_collections, + "total": len(collections), + "total_documents": stats_data.get("total_documents", 0), + "total_size_bytes": stats_data.get("total_size_bytes", 0) } except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @@ -116,6 +132,62 @@ async def create_collection( raise HTTPException(status_code=500, detail=str(e)) +@router.get("/stats", response_model=dict) +async def get_rag_stats( + db: AsyncSession = Depends(get_db), + current_user: User = Depends(get_current_user) +): + """Get overall RAG statistics - live data directly from Qdrant""" + try: + from app.services.qdrant_stats_service import qdrant_stats_service + + # Get live stats from Qdrant + stats_data = await qdrant_stats_service.get_collections_stats() + + # Calculate active collections (collections with documents) + active_collections = sum(1 for col in stats_data.get("collections", []) if col.get("document_count", 0) > 0) + + # Calculate processing documents from database + processing_docs = 0 + try: + from sqlalchemy import select + from app.models.rag_document import RagDocument, ProcessingStatus + + result = await db.execute( + select(RagDocument).where(RagDocument.status == ProcessingStatus.PROCESSING) + ) + processing_docs = len(result.scalars().all()) + except Exception: + pass # If 
database query fails, default to 0 + + response_data = { + "success": True, + "stats": { + "collections": { + "total": stats_data.get("total_collections", 0), + "active": active_collections + }, + "documents": { + "total": stats_data.get("total_documents", 0), + "processing": processing_docs, + "processed": stats_data.get("total_documents", 0) # Indexed documents + }, + "storage": { + "total_size_bytes": stats_data.get("total_size_bytes", 0), + "total_size_mb": round(stats_data.get("total_size_bytes", 0) / (1024 * 1024), 2) + }, + "vectors": { + "total": stats_data.get("total_documents", 0) # Same as documents for RAG + }, + "last_updated": datetime.utcnow().isoformat() + } + } + + return response_data + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @router.get("/collections/{collection_id}", response_model=dict) async def get_collection( collection_id: int, @@ -225,21 +297,65 @@ async def upload_document( try: # Read file content file_content = await file.read() - + if len(file_content) == 0: raise HTTPException(status_code=400, detail="Empty file uploaded") - + if len(file_content) > 50 * 1024 * 1024: # 50MB limit raise HTTPException(status_code=400, detail="File too large (max 50MB)") - + + # Validate file can be read before processing + filename = file.filename or "unknown" + file_extension = filename.split('.')[-1].lower() if '.' in filename else '' + + try: + # Test file readability based on type + if file_extension == 'jsonl': + # Validate JSONL format - try to parse first few lines + try: + content_str = file_content.decode('utf-8') + lines = content_str.strip().split('\n')[:5] # Check first 5 lines + import json + for i, line in enumerate(lines): + if line.strip(): # Skip empty lines + json.loads(line) # Will raise JSONDecodeError if invalid + except UnicodeDecodeError: + raise HTTPException(status_code=400, detail="File is not valid UTF-8 text") + except json.JSONDecodeError as e: + raise HTTPException(status_code=400, detail=f"Invalid JSONL format: {str(e)}") + + elif file_extension in ['txt', 'md', 'py', 'js', 'html', 'css', 'json']: + # Validate text files can be decoded + try: + file_content.decode('utf-8') + except UnicodeDecodeError: + raise HTTPException(status_code=400, detail="File is not valid UTF-8 text") + + elif file_extension in ['pdf']: + # For PDF files, just check if it starts with PDF signature + if not file_content.startswith(b'%PDF'): + raise HTTPException(status_code=400, detail="Invalid PDF file format") + + elif file_extension in ['docx', 'xlsx', 'pptx']: + # For Office documents, check ZIP signature + if not file_content.startswith(b'PK'): + raise HTTPException(status_code=400, detail=f"Invalid {file_extension.upper()} file format") + + # For other file types, we'll rely on the document processor + + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=400, detail=f"File validation failed: {str(e)}") + rag_service = RAGService(db) document = await rag_service.upload_document( collection_id=collection_id, file_content=file_content, - filename=file.filename or "unknown", + filename=filename, content_type=file.content_type ) - + return { "success": True, "document": document.to_dict(), @@ -362,21 +478,167 @@ async def download_document( raise HTTPException(status_code=500, detail=str(e)) -# Stats Endpoint -@router.get("/stats", response_model=dict) -async def get_rag_stats( - db: AsyncSession = Depends(get_db), +# Debug Endpoints + +@router.post("/debug/search") +async def 
search_with_debug( + query: str, + max_results: int = 10, + score_threshold: float = 0.3, + collection_name: str = None, + config: Dict[str, Any] = None, current_user: User = Depends(get_current_user) -): - """Get RAG system statistics""" +) -> Dict[str, Any]: + """ + Enhanced search with comprehensive debug information + """ + # Get RAG module from module manager + rag_module = module_manager.modules.get('rag') + if not rag_module or not rag_module.enabled: + raise HTTPException(status_code=503, detail="RAG module not initialized") + + debug_info = {} + start_time = datetime.utcnow() + try: - rag_service = RAGService(db) - stats = await rag_service.get_stats() - + # Apply configuration if provided + if config: + # Update RAG config temporarily + original_config = rag_module.config.copy() + rag_module.config.update(config) + + # Generate query embedding (with or without prefix) + if config and config.get("use_query_prefix"): + optimized_query = f"query: {query}" + else: + optimized_query = query + + query_embedding = await rag_module._generate_embedding(optimized_query) + + # Store embedding info for debug + if config and config.get("debug", {}).get("show_embeddings"): + debug_info["query_embedding"] = query_embedding[:10] # First 10 dimensions + debug_info["embedding_dimension"] = len(query_embedding) + debug_info["optimized_query"] = optimized_query + + # Perform search + search_start = asyncio.get_event_loop().time() + results = await rag_module.search_documents( + query, + max_results=max_results, + score_threshold=score_threshold, + collection_name=collection_name + ) + search_time = (asyncio.get_event_loop().time() - search_start) * 1000 + + # Calculate score statistics + scores = [r.score for r in results if r.score is not None] + if scores: + import statistics + debug_info["score_stats"] = { + "min": min(scores), + "max": max(scores), + "avg": statistics.mean(scores), + "stddev": statistics.stdev(scores) if len(scores) > 1 else 0 + } + + # Get collection statistics + try: + from qdrant_client.http.models import Filter + collection_name = collection_name or rag_module.default_collection_name + + # Count total documents + count_result = rag_module.qdrant_client.count( + collection_name=collection_name, + count_filter=Filter(must=[]) + ) + total_points = count_result.count + + # Get unique documents and languages + scroll_result = rag_module.qdrant_client.scroll( + collection_name=collection_name, + limit=1000, # Sample for stats + with_payload=True, + with_vectors=False + ) + + unique_docs = set() + languages = set() + + for point in scroll_result[0]: + payload = point.payload or {} + doc_id = payload.get("document_id") + if doc_id: + unique_docs.add(doc_id) + + language = payload.get("language") + if language: + languages.add(language) + + debug_info["collection_stats"] = { + "total_documents": len(unique_docs), + "total_chunks": total_points, + "languages": sorted(list(languages)) + } + + except Exception as e: + debug_info["collection_stats_error"] = str(e) + + # Enhance results with debug info + enhanced_results = [] + for result in results: + enhanced_result = { + "document": { + "id": result.document.id, + "content": result.document.content, + "metadata": result.document.metadata + }, + "score": result.score, + "debug_info": {} + } + + # Add hybrid search debug info if available + metadata = result.document.metadata or {} + if "_vector_score" in metadata: + enhanced_result["debug_info"]["vector_score"] = metadata["_vector_score"] + if "_bm25_score" in metadata: + 
enhanced_result["debug_info"]["bm25_score"] = metadata["_bm25_score"] + + enhanced_results.append(enhanced_result) + + # Note: Analytics logging disabled (module not available) + return { - "success": True, - "stats": stats + "results": enhanced_results, + "debug_info": debug_info, + "search_time_ms": search_time, + "timestamp": start_time.isoformat() } + except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) \ No newline at end of file + # Note: Analytics logging disabled (module not available) + raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}") + + finally: + # Restore original config if modified + if config and 'original_config' in locals(): + rag_module.config = original_config + + +@router.get("/debug/config") +async def get_current_config( + current_user: User = Depends(get_current_user) +) -> Dict[str, Any]: + """Get current RAG configuration""" + # Get RAG module from module manager + rag_module = module_manager.modules.get('rag') + if not rag_module or not rag_module.enabled: + raise HTTPException(status_code=503, detail="RAG module not initialized") + + return { + "config": rag_module.config, + "embedding_model": rag_module.embedding_model, + "enabled": rag_module.enabled, + "collections": await rag_module._get_collections_safely() + } diff --git a/backend/app/api/v1/security.py b/backend/app/api/v1/security.py deleted file mode 100644 index 838dd6f..0000000 --- a/backend/app/api/v1/security.py +++ /dev/null @@ -1,251 +0,0 @@ -""" -Security API endpoints for monitoring and configuration -""" - -from typing import Dict, Any, List, Optional -from fastapi import APIRouter, Depends, HTTPException, Request, status -from pydantic import BaseModel, Field - -from app.core.security import get_current_active_user, RequiresRole -from app.middleware.security import get_security_stats, get_request_auth_level, get_request_risk_score -from app.core.config import settings -from app.core.logging import get_logger - -logger = get_logger(__name__) - -router = APIRouter(tags=["security"]) - - -# Pydantic models for API responses -class SecurityStatsResponse(BaseModel): - """Security statistics response model""" - total_requests_analyzed: int - threats_detected: int - threats_blocked: int - anomalies_detected: int - rate_limits_exceeded: int - avg_analysis_time: float - threat_types: Dict[str, int] - threat_levels: Dict[str, int] - top_attacking_ips: List[tuple] - security_enabled: bool - threat_detection_enabled: bool - rate_limiting_enabled: bool - - -class SecurityConfigResponse(BaseModel): - """Security configuration response model""" - security_enabled: bool = Field(description="Overall security system enabled") - threat_detection_enabled: bool = Field(description="Threat detection analysis enabled") - rate_limiting_enabled: bool = Field(description="Rate limiting enabled") - ip_reputation_enabled: bool = Field(description="IP reputation checking enabled") - anomaly_detection_enabled: bool = Field(description="Anomaly detection enabled") - security_headers_enabled: bool = Field(description="Security headers enabled") - - # Rate limiting settings - unauthenticated_per_minute: int = Field(description="Rate limit for unauthenticated requests per minute") - authenticated_per_minute: int = Field(description="Rate limit for authenticated users per minute") - api_key_per_minute: int = Field(description="Rate limit for API key users per minute") - premium_per_minute: int = Field(description="Rate limit for premium users per minute") - - # Security thresholds - 
risk_threshold: float = Field(description="Risk score threshold for blocking requests") - warning_threshold: float = Field(description="Risk score threshold for warnings") - anomaly_threshold: float = Field(description="Anomaly severity threshold") - - # IP settings - blocked_ips: List[str] = Field(description="List of blocked IP addresses") - allowed_ips: List[str] = Field(description="List of allowed IP addresses (empty = allow all)") - - -class RateLimitInfoResponse(BaseModel): - """Rate limit information for current request""" - auth_level: str = Field(description="Authentication level (unauthenticated, authenticated, api_key, premium)") - current_limits: Dict[str, int] = Field(description="Current rate limits for this auth level") - remaining_requests: Optional[Dict[str, int]] = Field(description="Estimated remaining requests (if available)") - - -@router.get("/stats", response_model=SecurityStatsResponse) -async def get_security_statistics( - current_user: Dict[str, Any] = Depends(RequiresRole("admin")) -): - """ - Get security system statistics - - Requires admin role. Returns comprehensive statistics about: - - Request analysis counts - - Threat detection results - - Rate limiting enforcement - - Top attacking IPs - - Performance metrics - """ - try: - stats = get_security_stats() - return SecurityStatsResponse(**stats) - except Exception as e: - logger.error(f"Error getting security stats: {e}") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to retrieve security statistics" - ) - - -@router.get("/config", response_model=SecurityConfigResponse) -async def get_security_config( - current_user: Dict[str, Any] = Depends(RequiresRole("admin")) -): - """ - Get current security configuration - - Requires admin role. 
Returns current security settings including: - - Feature enablement flags - - Rate limiting thresholds - - Security thresholds - - IP allowlists/blocklists - """ - return SecurityConfigResponse( - security_enabled=settings.API_SECURITY_ENABLED, - threat_detection_enabled=settings.API_THREAT_DETECTION_ENABLED, - rate_limiting_enabled=settings.API_RATE_LIMITING_ENABLED, - ip_reputation_enabled=settings.API_IP_REPUTATION_ENABLED, - anomaly_detection_enabled=settings.API_ANOMALY_DETECTION_ENABLED, - security_headers_enabled=settings.API_SECURITY_HEADERS_ENABLED, - - unauthenticated_per_minute=settings.API_RATE_LIMIT_UNAUTHENTICATED_PER_MINUTE, - authenticated_per_minute=settings.API_RATE_LIMIT_AUTHENTICATED_PER_MINUTE, - api_key_per_minute=settings.API_RATE_LIMIT_API_KEY_PER_MINUTE, - premium_per_minute=settings.API_RATE_LIMIT_PREMIUM_PER_MINUTE, - - risk_threshold=settings.API_SECURITY_RISK_THRESHOLD, - warning_threshold=settings.API_SECURITY_WARNING_THRESHOLD, - anomaly_threshold=settings.API_SECURITY_ANOMALY_THRESHOLD, - - blocked_ips=settings.API_BLOCKED_IPS, - allowed_ips=settings.API_ALLOWED_IPS - ) - - -@router.get("/status") -async def get_security_status( - request: Request, - current_user: Dict[str, Any] = Depends(get_current_active_user) -): - """ - Get security status for current request - - Returns information about the security analysis of the current request: - - Authentication level - - Risk score (if available) - - Rate limiting status - """ - auth_level = get_request_auth_level(request) - risk_score = get_request_risk_score(request) - - # Get rate limits for current auth level - from app.core.threat_detection import AuthLevel - try: - auth_enum = AuthLevel(auth_level) - from app.core.threat_detection import threat_detection_service - minute_limit, hour_limit = threat_detection_service.get_rate_limits(auth_enum) - - rate_limit_info = RateLimitInfoResponse( - auth_level=auth_level, - current_limits={ - "per_minute": minute_limit, - "per_hour": hour_limit - }, - remaining_requests=None # We don't track remaining requests in current implementation - ) - except ValueError: - rate_limit_info = RateLimitInfoResponse( - auth_level=auth_level, - current_limits={}, - remaining_requests=None - ) - - return { - "security_enabled": settings.API_SECURITY_ENABLED, - "auth_level": auth_level, - "risk_score": round(risk_score, 3) if risk_score > 0 else None, - "rate_limit_info": rate_limit_info.dict(), - "security_headers_enabled": settings.API_SECURITY_HEADERS_ENABLED - } - - -@router.post("/test") -async def test_security_analysis( - request: Request, - current_user: Dict[str, Any] = Depends(RequiresRole("admin")) -): - """ - Test security analysis on current request - - Requires admin role. Manually triggers security analysis on the current request - and returns detailed results. Useful for testing security rules and thresholds. 
- """ - try: - from app.middleware.security import analyze_request_security - - analysis = await analyze_request_security(request, current_user) - - return { - "analysis_complete": True, - "is_threat": analysis.is_threat, - "risk_score": round(analysis.risk_score, 3), - "auth_level": analysis.auth_level.value, - "should_block": analysis.should_block, - "rate_limit_exceeded": analysis.rate_limit_exceeded, - "threat_count": len(analysis.threats), - "threats": [ - { - "type": threat.threat_type, - "level": threat.level.value, - "confidence": round(threat.confidence, 3), - "description": threat.description, - "mitigation": threat.mitigation - } - for threat in analysis.threats - ], - "recommendations": analysis.recommendations - } - except Exception as e: - logger.error(f"Error in security analysis test: {e}") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to perform security analysis test" - ) - - -@router.get("/health") -async def security_health_check(): - """ - Security system health check - - Public endpoint that returns the health status of the security system. - Does not require authentication. - """ - try: - stats = get_security_stats() - - # Basic health checks - is_healthy = ( - settings.API_SECURITY_ENABLED and - stats.get("total_requests_analyzed", 0) >= 0 and - stats.get("avg_analysis_time", 0) < 1.0 # Analysis should be under 1 second - ) - - return { - "status": "healthy" if is_healthy else "degraded", - "security_enabled": settings.API_SECURITY_ENABLED, - "threat_detection_enabled": settings.API_THREAT_DETECTION_ENABLED, - "rate_limiting_enabled": settings.API_RATE_LIMITING_ENABLED, - "avg_analysis_time_ms": round(stats.get("avg_analysis_time", 0) * 1000, 2), - "total_requests_analyzed": stats.get("total_requests_analyzed", 0) - } - except Exception as e: - logger.error(f"Security health check failed: {e}") - return { - "status": "unhealthy", - "error": "Security system error", - "security_enabled": settings.API_SECURITY_ENABLED - } \ No newline at end of file diff --git a/backend/app/api/v1/settings.py b/backend/app/api/v1/settings.py index 8595ad6..4b97e25 100644 --- a/backend/app/api/v1/settings.py +++ b/backend/app/api/v1/settings.py @@ -97,7 +97,6 @@ SETTINGS_STORE: Dict[str, Dict[str, Any]] = { "api": { # Security Settings "security_enabled": {"value": True, "type": "boolean", "description": "Enable API security system"}, - "threat_detection_enabled": {"value": True, "type": "boolean", "description": "Enable threat detection analysis"}, "rate_limiting_enabled": {"value": True, "type": "boolean", "description": "Enable rate limiting"}, "ip_reputation_enabled": {"value": True, "type": "boolean", "description": "Enable IP reputation checking"}, "anomaly_detection_enabled": {"value": True, "type": "boolean", "description": "Enable anomaly detection"}, @@ -112,7 +111,6 @@ SETTINGS_STORE: Dict[str, Dict[str, Any]] = { "rate_limit_premium_per_hour": {"value": 100000, "type": "integer", "description": "Rate limit for premium users per hour"}, # Security Thresholds - "security_risk_threshold": {"value": 0.8, "type": "float", "description": "Risk score threshold for blocking requests (0.0-1.0)"}, "security_warning_threshold": {"value": 0.6, "type": "float", "description": "Risk score threshold for warnings (0.0-1.0)"}, "anomaly_threshold": {"value": 0.7, "type": "float", "description": "Anomaly severity threshold (0.0-1.0)"}, @@ -601,7 +599,6 @@ async def reset_to_defaults( "api": { # Security Settings "security_enabled": {"value": 
True, "type": "boolean"}, - "threat_detection_enabled": {"value": True, "type": "boolean"}, "rate_limiting_enabled": {"value": True, "type": "boolean"}, "ip_reputation_enabled": {"value": True, "type": "boolean"}, "anomaly_detection_enabled": {"value": True, "type": "boolean"}, @@ -616,7 +613,6 @@ async def reset_to_defaults( "rate_limit_premium_per_hour": {"value": 100000, "type": "integer"}, # Security Thresholds - "security_risk_threshold": {"value": 0.8, "type": "float"}, "security_warning_threshold": {"value": 0.6, "type": "float"}, "anomaly_threshold": {"value": 0.7, "type": "float"}, diff --git a/backend/app/core/config.py b/backend/app/core/config.py index f3ac614..7d53387 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -17,6 +17,8 @@ class Settings(BaseSettings): APP_LOG_LEVEL: str = os.getenv("APP_LOG_LEVEL", "INFO") APP_HOST: str = os.getenv("APP_HOST", "0.0.0.0") APP_PORT: int = int(os.getenv("APP_PORT", "8000")) + BACKEND_INTERNAL_PORT: int = int(os.getenv("BACKEND_INTERNAL_PORT", "8000")) + FRONTEND_INTERNAL_PORT: int = int(os.getenv("FRONTEND_INTERNAL_PORT", "3000")) # Detailed logging for LLM interactions LOG_LLM_PROMPTS: bool = os.getenv("LOG_LLM_PROMPTS", "False").lower() == "true" # Set to True to log prompts and context sent to LLM @@ -73,16 +75,11 @@ class Settings(BaseSettings): QDRANT_HOST: str = os.getenv("QDRANT_HOST", "localhost") QDRANT_PORT: int = int(os.getenv("QDRANT_PORT", "6333")) QDRANT_API_KEY: Optional[str] = os.getenv("QDRANT_API_KEY") + QDRANT_URL: str = os.getenv("QDRANT_URL", "http://localhost:6333") - # API & Security Settings - API_SECURITY_ENABLED: bool = os.getenv("API_SECURITY_ENABLED", "True").lower() == "true" - API_THREAT_DETECTION_ENABLED: bool = os.getenv("API_THREAT_DETECTION_ENABLED", "True").lower() == "true" - API_IP_REPUTATION_ENABLED: bool = os.getenv("API_IP_REPUTATION_ENABLED", "True").lower() == "true" - API_ANOMALY_DETECTION_ENABLED: bool = os.getenv("API_ANOMALY_DETECTION_ENABLED", "True").lower() == "true" - + # Rate Limiting Configuration - API_RATE_LIMITING_ENABLED: bool = os.getenv("API_RATE_LIMITING_ENABLED", "True").lower() == "true" - + # PrivateMode Standard tier limits (organization-level, not per user) # These are shared across all API keys and users in the organization PRIVATEMODE_REQUESTS_PER_MINUTE: int = int(os.getenv("PRIVATEMODE_REQUESTS_PER_MINUTE", "20")) @@ -101,23 +98,14 @@ class Settings(BaseSettings): # Premium/Enterprise API keys API_RATE_LIMIT_PREMIUM_PER_MINUTE: int = int(os.getenv("API_RATE_LIMIT_PREMIUM_PER_MINUTE", "20")) # Match PrivateMode API_RATE_LIMIT_PREMIUM_PER_HOUR: int = int(os.getenv("API_RATE_LIMIT_PREMIUM_PER_HOUR", "1200")) - - # Security Thresholds - API_SECURITY_RISK_THRESHOLD: float = float(os.getenv("API_SECURITY_RISK_THRESHOLD", "0.8")) # Block requests above this risk score - API_SECURITY_WARNING_THRESHOLD: float = float(os.getenv("API_SECURITY_WARNING_THRESHOLD", "0.6")) # Log warnings above this threshold - API_SECURITY_ANOMALY_THRESHOLD: float = float(os.getenv("API_SECURITY_ANOMALY_THRESHOLD", "0.7")) # Flag anomalies above this threshold - + # Request Size Limits API_MAX_REQUEST_BODY_SIZE: int = int(os.getenv("API_MAX_REQUEST_BODY_SIZE", "10485760")) # 10MB API_MAX_REQUEST_BODY_SIZE_PREMIUM: int = int(os.getenv("API_MAX_REQUEST_BODY_SIZE_PREMIUM", "52428800")) # 50MB for premium # IP Security - API_BLOCKED_IPS: List[str] = os.getenv("API_BLOCKED_IPS", "").split(",") if os.getenv("API_BLOCKED_IPS") else [] - API_ALLOWED_IPS: List[str] = 
os.getenv("API_ALLOWED_IPS", "").split(",") if os.getenv("API_ALLOWED_IPS") else [] - API_IP_REPUTATION_CACHE_TTL: int = int(os.getenv("API_IP_REPUTATION_CACHE_TTL", "3600")) # 1 hour # Security Headers - API_SECURITY_HEADERS_ENABLED: bool = os.getenv("API_SECURITY_HEADERS_ENABLED", "True").lower() == "true" API_CSP_HEADER: str = os.getenv("API_CSP_HEADER", "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'") # Monitoring @@ -129,6 +117,19 @@ class Settings(BaseSettings): # Module configuration MODULES_CONFIG_PATH: str = os.getenv("MODULES_CONFIG_PATH", "config/modules.yaml") + + # RAG Embedding Configuration + RAG_EMBEDDING_MAX_REQUESTS_PER_MINUTE: int = int(os.getenv("RAG_EMBEDDING_MAX_REQUESTS_PER_MINUTE", "12")) + RAG_EMBEDDING_BATCH_SIZE: int = int(os.getenv("RAG_EMBEDDING_BATCH_SIZE", "3")) + RAG_EMBEDDING_RETRY_COUNT: int = int(os.getenv("RAG_EMBEDDING_RETRY_COUNT", "3")) + RAG_EMBEDDING_RETRY_DELAYS: str = os.getenv("RAG_EMBEDDING_RETRY_DELAYS", "1,2,4,8,16") + RAG_EMBEDDING_DELAY_BETWEEN_BATCHES: float = float(os.getenv("RAG_EMBEDDING_DELAY_BETWEEN_BATCHES", "1.0")) + RAG_EMBEDDING_DELAY_PER_REQUEST: float = float(os.getenv("RAG_EMBEDDING_DELAY_PER_REQUEST", "0.5")) + RAG_ALLOW_FALLBACK_EMBEDDINGS: bool = os.getenv("RAG_ALLOW_FALLBACK_EMBEDDINGS", "True").lower() == "true" + RAG_WARN_ON_FALLBACK: bool = os.getenv("RAG_WARN_ON_FALLBACK", "True").lower() == "true" + RAG_DOCUMENT_PROCESSING_TIMEOUT: int = int(os.getenv("RAG_DOCUMENT_PROCESSING_TIMEOUT", "300")) + RAG_EMBEDDING_GENERATION_TIMEOUT: int = int(os.getenv("RAG_EMBEDDING_GENERATION_TIMEOUT", "120")) + RAG_INDEXING_TIMEOUT: int = int(os.getenv("RAG_INDEXING_TIMEOUT", "120")) # Plugin configuration PLUGINS_DIR: str = os.getenv("PLUGINS_DIR", "/plugins") @@ -142,9 +143,12 @@ class Settings(BaseSettings): model_config = { "env_file": ".env", - "case_sensitive": True + "case_sensitive": True, + # Ignore unknown environment variables to avoid validation errors + # when optional/deprecated flags are present in .env + "extra": "ignore", } # Global settings instance -settings = Settings() \ No newline at end of file +settings = Settings() diff --git a/backend/app/core/threat_detection.py b/backend/app/core/threat_detection.py deleted file mode 100644 index cac2c7b..0000000 --- a/backend/app/core/threat_detection.py +++ /dev/null @@ -1,744 +0,0 @@ -""" -Core threat detection and security analysis for the platform -""" - -import re -import time -from collections import defaultdict, deque -from dataclasses import dataclass, field -from datetime import datetime, timedelta -from enum import Enum -from typing import Dict, List, Optional, Set, Tuple, Any, Union -from urllib.parse import unquote - -from fastapi import Request -from app.core.config import settings -from app.core.logging import get_logger - -logger = get_logger(__name__) - - -class ThreatLevel(Enum): - """Threat severity levels""" - LOW = "low" - MEDIUM = "medium" - HIGH = "high" - CRITICAL = "critical" - - -class AuthLevel(Enum): - """Authentication levels for rate limiting""" - AUTHENTICATED = "authenticated" - API_KEY = "api_key" - PREMIUM = "premium" - - -@dataclass -class SecurityThreat: - """Security threat detection result""" - threat_type: str - level: ThreatLevel - confidence: float - description: str - source_ip: str - user_agent: Optional[str] = None - request_path: Optional[str] = None - payload: Optional[str] = None - timestamp: datetime = field(default_factory=datetime.utcnow) - mitigation: Optional[str] = None - - 
-@dataclass -class SecurityAnalysis: - """Comprehensive security analysis result""" - is_threat: bool - threats: List[SecurityThreat] - risk_score: float - recommendations: List[str] - auth_level: AuthLevel - rate_limit_exceeded: bool - should_block: bool - timestamp: datetime = field(default_factory=datetime.utcnow) - - -@dataclass -class RateLimitInfo: - """Rate limiting information""" - auth_level: AuthLevel - requests_per_minute: int - requests_per_hour: int - minute_limit: int - hour_limit: int - exceeded: bool - - -@dataclass -class AnomalyDetection: - """Anomaly detection result""" - is_anomaly: bool - anomaly_type: str - severity: float - details: Dict[str, Any] - baseline_value: Optional[float] = None - current_value: Optional[float] = None - - -class ThreatDetectionService: - """Core threat detection and security analysis service""" - - def __init__(self): - self.name = "threat_detection" - - # Statistics - self.stats = { - 'total_requests_analyzed': 0, - 'threats_detected': 0, - 'threats_blocked': 0, - 'anomalies_detected': 0, - 'rate_limits_exceeded': 0, - 'total_analysis_time': 0, - 'threat_types': defaultdict(int), - 'threat_levels': defaultdict(int), - 'attacking_ips': defaultdict(int) - } - - # Threat detection patterns - self.sql_injection_patterns = [ - r"(\bunion\b.*\bselect\b)", - r"(\bselect\b.*\bfrom\b)", - r"(\binsert\b.*\binto\b)", - r"(\bupdate\b.*\bset\b)", - r"(\bdelete\b.*\bfrom\b)", - r"(\bdrop\b.*\btable\b)", - r"(\bor\b.*\b1\s*=\s*1\b)", - r"(\band\b.*\b1\s*=\s*1\b)", - r"(\bexec\b.*\bxp_\w+)", - r"(\bsp_\w+)", - r"(\bsleep\b\s*\(\s*\d+\s*\))", - r"(\bwaitfor\b.*\bdelay\b)", - r"(\bbenchmark\b\s*\(\s*\d+)", - r"(\bload_file\b\s*\()", - r"(\binto\b.*\boutfile\b)" - ] - - self.xss_patterns = [ - r"]*>.*?", - r"]*>.*?", - r"]*>.*?", - r"]*>.*?", - r"]*>", - r"]*>", - r"javascript:", - r"vbscript:", - r"on\w+\s*=", - r"style\s*=.*expression", - r"style\s*=.*javascript" - ] - - self.path_traversal_patterns = [ - r"\.\.\/", - r"\.\.\\", - r"%2e%2e%2f", - r"%2e%2e%5c", - r"..%2f", - r"..%5c", - r"%252e%252e%252f", - r"%252e%252e%255c" - ] - - self.command_injection_patterns = [ - r";\s*cat\s+", - r";\s*ls\s+", - r";\s*pwd\s*", - r";\s*whoami\s*", - r";\s*id\s*", - r";\s*uname\s*", - r";\s*ps\s+", - r";\s*netstat\s+", - r";\s*wget\s+", - r";\s*curl\s+", - r"\|\s*cat\s+", - r"\|\s*ls\s+", - r"&&\s*cat\s+", - r"&&\s*ls\s+" - ] - - self.suspicious_ua_patterns = [ - r"sqlmap", - r"nikto", - r"nmap", - r"masscan", - r"zap", - r"burp", - r"w3af", - r"acunetix", - r"nessus", - r"openvas", - r"metasploit" - ] - - # Rate limiting tracking - separate by auth level (excluding unauthenticated since they're blocked) - self.rate_limits = { - AuthLevel.AUTHENTICATED: defaultdict(lambda: {'minute': deque(maxlen=60), 'hour': deque(maxlen=3600)}), - AuthLevel.API_KEY: defaultdict(lambda: {'minute': deque(maxlen=60), 'hour': deque(maxlen=3600)}), - AuthLevel.PREMIUM: defaultdict(lambda: {'minute': deque(maxlen=60), 'hour': deque(maxlen=3600)}) - } - - # Anomaly detection - self.request_history = deque(maxlen=1000) - self.ip_history = defaultdict(lambda: deque(maxlen=100)) - self.endpoint_history = defaultdict(lambda: deque(maxlen=100)) - - # Blocked and allowed IPs - self.blocked_ips = set(settings.API_BLOCKED_IPS) - self.allowed_ips = set(settings.API_ALLOWED_IPS) if settings.API_ALLOWED_IPS else None - - # IP reputation cache - self.ip_reputation_cache = {} - self.cache_expiry = {} - - # Compile patterns for performance - self._compile_patterns() - - 
logger.info(f"ThreatDetectionService initialized with {len(self.sql_injection_patterns)} SQL patterns, " - f"{len(self.xss_patterns)} XSS patterns, rate limiting enabled: {settings.API_RATE_LIMITING_ENABLED}") - - def _compile_patterns(self): - """Compile regex patterns for better performance""" - try: - self.compiled_sql_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in self.sql_injection_patterns] - self.compiled_xss_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in self.xss_patterns] - self.compiled_path_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in self.path_traversal_patterns] - self.compiled_cmd_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in self.command_injection_patterns] - self.compiled_ua_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in self.suspicious_ua_patterns] - except re.error as e: - logger.error(f"Failed to compile security patterns: {e}") - # Fallback to empty lists to prevent crashes - self.compiled_sql_patterns = [] - self.compiled_xss_patterns = [] - self.compiled_path_patterns = [] - self.compiled_cmd_patterns = [] - self.compiled_ua_patterns = [] - - def determine_auth_level(self, request: Request, user_context: Optional[Dict] = None) -> AuthLevel: - """Determine authentication level for rate limiting""" - # Check if request has API key authentication - if hasattr(request.state, 'api_key_context') and request.state.api_key_context: - api_key = request.state.api_key_context.get('api_key') - if api_key and hasattr(api_key, 'tier'): - # Check for premium tier - if api_key.tier in ['premium', 'enterprise']: - return AuthLevel.PREMIUM - return AuthLevel.API_KEY - - # Check for JWT authentication - if user_context or hasattr(request.state, 'user'): - return AuthLevel.AUTHENTICATED - - # Check Authorization header for API key - auth_header = request.headers.get("Authorization", "") - api_key_header = request.headers.get("X-API-Key", "") - if auth_header.startswith("Bearer ") or api_key_header: - return AuthLevel.API_KEY - - # Default to authenticated since unauthenticated requests are blocked at middleware - return AuthLevel.AUTHENTICATED - - def get_rate_limits(self, auth_level: AuthLevel) -> Tuple[int, int]: - """Get rate limits for authentication level""" - if not settings.API_RATE_LIMITING_ENABLED: - return float('inf'), float('inf') - - if auth_level == AuthLevel.AUTHENTICATED: - return (settings.API_RATE_LIMIT_AUTHENTICATED_PER_MINUTE, settings.API_RATE_LIMIT_AUTHENTICATED_PER_HOUR) - elif auth_level == AuthLevel.API_KEY: - return (settings.API_RATE_LIMIT_API_KEY_PER_MINUTE, settings.API_RATE_LIMIT_API_KEY_PER_HOUR) - elif auth_level == AuthLevel.PREMIUM: - return (settings.API_RATE_LIMIT_PREMIUM_PER_MINUTE, settings.API_RATE_LIMIT_PREMIUM_PER_HOUR) - else: - # Fallback to authenticated limits - return (settings.API_RATE_LIMIT_AUTHENTICATED_PER_MINUTE, settings.API_RATE_LIMIT_AUTHENTICATED_PER_HOUR) - - def check_rate_limit(self, client_ip: str, auth_level: AuthLevel) -> RateLimitInfo: - """Check if request exceeds rate limits""" - minute_limit, hour_limit = self.get_rate_limits(auth_level) - current_time = time.time() - - # Get or create tracking for this auth level - if auth_level not in self.rate_limits: - # This shouldn't happen, but handle gracefully - return RateLimitInfo( - auth_level=auth_level, - requests_per_minute=0, - requests_per_hour=0, - minute_limit=minute_limit, - hour_limit=hour_limit, - exceeded=False - ) - - ip_limits = self.rate_limits[auth_level][client_ip] - - # Clean old 
entries - minute_ago = current_time - 60 - hour_ago = current_time - 3600 - - while ip_limits['minute'] and ip_limits['minute'][0] < minute_ago: - ip_limits['minute'].popleft() - - while ip_limits['hour'] and ip_limits['hour'][0] < hour_ago: - ip_limits['hour'].popleft() - - # Check current counts - requests_per_minute = len(ip_limits['minute']) - requests_per_hour = len(ip_limits['hour']) - - # Check if limits exceeded - exceeded = (requests_per_minute >= minute_limit) or (requests_per_hour >= hour_limit) - - # Add current request to tracking - if not exceeded: - ip_limits['minute'].append(current_time) - ip_limits['hour'].append(current_time) - - return RateLimitInfo( - auth_level=auth_level, - requests_per_minute=requests_per_minute, - requests_per_hour=requests_per_hour, - minute_limit=minute_limit, - hour_limit=hour_limit, - exceeded=exceeded - ) - - async def analyze_request(self, request: Request, user_context: Optional[Dict] = None) -> SecurityAnalysis: - """Perform comprehensive security analysis on a request""" - start_time = time.time() - - try: - client_ip = request.client.host if request.client else "unknown" - user_agent = request.headers.get("user-agent", "") - path = str(request.url.path) - method = request.method - - # Determine authentication level - auth_level = self.determine_auth_level(request, user_context) - - # Check IP allowlist/blocklist first - if self.allowed_ips and client_ip not in self.allowed_ips: - threat = SecurityThreat( - threat_type="ip_not_allowed", - level=ThreatLevel.HIGH, - confidence=1.0, - description=f"IP {client_ip} not in allowlist", - source_ip=client_ip, - mitigation="Add IP to allowlist or remove IP restrictions" - ) - return SecurityAnalysis( - is_threat=True, - threats=[threat], - risk_score=1.0, - recommendations=["Block request immediately"], - auth_level=auth_level, - rate_limit_exceeded=False, - should_block=True - ) - - if client_ip in self.blocked_ips: - threat = SecurityThreat( - threat_type="ip_blocked", - level=ThreatLevel.CRITICAL, - confidence=1.0, - description=f"IP {client_ip} is blocked", - source_ip=client_ip, - mitigation="Remove IP from blocklist if legitimate" - ) - return SecurityAnalysis( - is_threat=True, - threats=[threat], - risk_score=1.0, - recommendations=["Block request immediately"], - auth_level=auth_level, - rate_limit_exceeded=False, - should_block=True - ) - - # Check rate limiting - rate_limit_info = self.check_rate_limit(client_ip, auth_level) - if rate_limit_info.exceeded: - self.stats['rate_limits_exceeded'] += 1 - threat = SecurityThreat( - threat_type="rate_limit_exceeded", - level=ThreatLevel.MEDIUM, - confidence=0.9, - description=f"Rate limit exceeded for {auth_level.value}: {rate_limit_info.requests_per_minute}/min, {rate_limit_info.requests_per_hour}/hr", - source_ip=client_ip, - mitigation=f"Implement rate limiting, current limits: {rate_limit_info.minute_limit}/min, {rate_limit_info.hour_limit}/hr" - ) - return SecurityAnalysis( - is_threat=True, - threats=[threat], - risk_score=0.7, - recommendations=[f"Rate limit exceeded for {auth_level.value} user"], - auth_level=auth_level, - rate_limit_exceeded=True, - should_block=True - ) - - # Skip threat detection if disabled - if not settings.API_THREAT_DETECTION_ENABLED: - return SecurityAnalysis( - is_threat=False, - threats=[], - risk_score=0.0, - recommendations=[], - auth_level=auth_level, - rate_limit_exceeded=False, - should_block=False - ) - - # Collect request data for threat analysis - query_params = str(request.query_params) - headers = 
dict(request.headers) - - # Try to get body content safely - body_content = "" - try: - if hasattr(request, '_body') and request._body: - body_content = request._body.decode() if isinstance(request._body, bytes) else str(request._body) - except: - pass - - threats = [] - - # Analyze for various threats - threats.extend(await self._detect_sql_injection(query_params, body_content, path, client_ip)) - threats.extend(await self._detect_xss(query_params, body_content, headers, client_ip)) - threats.extend(await self._detect_path_traversal(path, query_params, client_ip)) - threats.extend(await self._detect_command_injection(query_params, body_content, client_ip)) - threats.extend(await self._detect_suspicious_patterns(headers, user_agent, path, client_ip)) - - # Anomaly detection if enabled - if settings.API_ANOMALY_DETECTION_ENABLED: - anomaly = await self._detect_anomalies(client_ip, path, method, len(body_content)) - if anomaly.is_anomaly and anomaly.severity > settings.API_SECURITY_ANOMALY_THRESHOLD: - threat = SecurityThreat( - threat_type=f"anomaly_{anomaly.anomaly_type}", - level=ThreatLevel.MEDIUM if anomaly.severity > 0.7 else ThreatLevel.LOW, - confidence=anomaly.severity, - description=f"Anomalous behavior detected: {anomaly.details}", - source_ip=client_ip, - user_agent=user_agent, - request_path=path - ) - threats.append(threat) - - # Calculate risk score - risk_score = self._calculate_risk_score(threats) - - # Determine if request should be blocked - should_block = risk_score >= settings.API_SECURITY_RISK_THRESHOLD - - # Generate recommendations - recommendations = self._generate_recommendations(threats, risk_score, auth_level) - - # Update statistics - self._update_stats(threats, time.time() - start_time) - - return SecurityAnalysis( - is_threat=len(threats) > 0, - threats=threats, - risk_score=risk_score, - recommendations=recommendations, - auth_level=auth_level, - rate_limit_exceeded=False, - should_block=should_block - ) - - except Exception as e: - logger.error(f"Error in threat analysis: {e}") - return SecurityAnalysis( - is_threat=False, - threats=[], - risk_score=0.0, - recommendations=["Error occurred during security analysis"], - auth_level=AuthLevel.AUTHENTICATED, - rate_limit_exceeded=False, - should_block=False - ) - - async def _detect_sql_injection(self, query_params: str, body_content: str, path: str, client_ip: str) -> List[SecurityThreat]: - """Detect SQL injection attempts""" - threats = [] - content_to_check = f"{query_params} {body_content} {path}".lower() - - for pattern in self.compiled_sql_patterns: - if pattern.search(content_to_check): - threat = SecurityThreat( - threat_type="sql_injection", - level=ThreatLevel.HIGH, - confidence=0.85, - description="Potential SQL injection attempt detected", - source_ip=client_ip, - payload=pattern.pattern, - mitigation="Block request, sanitize input, use parameterized queries" - ) - threats.append(threat) - break # Don't duplicate for multiple patterns - - return threats - - async def _detect_xss(self, query_params: str, body_content: str, headers: dict, client_ip: str) -> List[SecurityThreat]: - """Detect XSS attempts""" - threats = [] - content_to_check = f"{query_params} {body_content}".lower() - - # Check headers for XSS - for header_name, header_value in headers.items(): - content_to_check += f" {header_value}".lower() - - for pattern in self.compiled_xss_patterns: - if pattern.search(content_to_check): - threat = SecurityThreat( - threat_type="xss", - level=ThreatLevel.HIGH, - confidence=0.80, - 
description="Potential XSS attack detected", - source_ip=client_ip, - payload=pattern.pattern, - mitigation="Block request, sanitize input, implement CSP headers" - ) - threats.append(threat) - break - - return threats - - async def _detect_path_traversal(self, path: str, query_params: str, client_ip: str) -> List[SecurityThreat]: - """Detect path traversal attempts""" - threats = [] - content_to_check = f"{path} {query_params}".lower() - decoded_content = unquote(content_to_check) - - for pattern in self.compiled_path_patterns: - if pattern.search(content_to_check) or pattern.search(decoded_content): - threat = SecurityThreat( - threat_type="path_traversal", - level=ThreatLevel.HIGH, - confidence=0.90, - description="Path traversal attempt detected", - source_ip=client_ip, - request_path=path, - mitigation="Block request, validate file paths, implement access controls" - ) - threats.append(threat) - break - - return threats - - async def _detect_command_injection(self, query_params: str, body_content: str, client_ip: str) -> List[SecurityThreat]: - """Detect command injection attempts""" - threats = [] - content_to_check = f"{query_params} {body_content}".lower() - - for pattern in self.compiled_cmd_patterns: - if pattern.search(content_to_check): - threat = SecurityThreat( - threat_type="command_injection", - level=ThreatLevel.CRITICAL, - confidence=0.95, - description="Command injection attempt detected", - source_ip=client_ip, - payload=pattern.pattern, - mitigation="Block request immediately, sanitize input, disable shell execution" - ) - threats.append(threat) - break - - return threats - - async def _detect_suspicious_patterns(self, headers: dict, user_agent: str, path: str, client_ip: str) -> List[SecurityThreat]: - """Detect suspicious patterns in headers and user agent""" - threats = [] - - # Check for suspicious user agents - ua_lower = user_agent.lower() - for pattern in self.compiled_ua_patterns: - if pattern.search(ua_lower): - threat = SecurityThreat( - threat_type="suspicious_user_agent", - level=ThreatLevel.HIGH, - confidence=0.85, - description=f"Suspicious user agent detected: {pattern.pattern}", - source_ip=client_ip, - user_agent=user_agent, - mitigation="Block request, monitor IP for further activity" - ) - threats.append(threat) - break - - # Check for suspicious headers - if "x-forwarded-for" in headers and "x-real-ip" in headers: - # Potential header manipulation - threat = SecurityThreat( - threat_type="header_manipulation", - level=ThreatLevel.LOW, - confidence=0.30, - description="Potential IP header manipulation detected", - source_ip=client_ip, - mitigation="Validate proxy headers, implement IP whitelisting" - ) - threats.append(threat) - - return threats - - async def _detect_anomalies(self, client_ip: str, path: str, method: str, body_size: int) -> AnomalyDetection: - """Detect anomalous behavior patterns""" - try: - # Request size anomaly - max_size = settings.API_MAX_REQUEST_BODY_SIZE - if body_size > max_size: - return AnomalyDetection( - is_anomaly=True, - anomaly_type="request_size", - severity=0.8, - details={"body_size": body_size, "threshold": max_size}, - current_value=body_size, - baseline_value=max_size // 10 - ) - - # Unusual endpoint access - if path.startswith("/admin") or path.startswith("/api/admin"): - return AnomalyDetection( - is_anomaly=True, - anomaly_type="sensitive_endpoint", - severity=0.6, - details={"path": path, "reason": "admin endpoint access"}, - current_value=1.0, - baseline_value=0.0 - ) - - # IP request frequency anomaly - 
current_time = time.time() - ip_requests = self.ip_history[client_ip] - - # Clean old entries (last 5 minutes) - five_minutes_ago = current_time - 300 - while ip_requests and ip_requests[0] < five_minutes_ago: - ip_requests.popleft() - - ip_requests.append(current_time) - - if len(ip_requests) > 100: # More than 100 requests in 5 minutes - return AnomalyDetection( - is_anomaly=True, - anomaly_type="request_frequency", - severity=0.7, - details={"requests_5min": len(ip_requests), "threshold": 100}, - current_value=len(ip_requests), - baseline_value=10 # 10 requests baseline - ) - - return AnomalyDetection( - is_anomaly=False, - anomaly_type="none", - severity=0.0, - details={} - ) - - except Exception as e: - logger.error(f"Error in anomaly detection: {e}") - return AnomalyDetection( - is_anomaly=False, - anomaly_type="error", - severity=0.0, - details={"error": str(e)} - ) - - def _calculate_risk_score(self, threats: List[SecurityThreat]) -> float: - """Calculate overall risk score based on threats""" - if not threats: - return 0.0 - - score = 0.0 - for threat in threats: - level_multiplier = { - ThreatLevel.LOW: 0.25, - ThreatLevel.MEDIUM: 0.5, - ThreatLevel.HIGH: 0.75, - ThreatLevel.CRITICAL: 1.0 - } - score += threat.confidence * level_multiplier.get(threat.level, 0.5) - - # Normalize to 0-1 range - return min(score / len(threats), 1.0) - - def _generate_recommendations(self, threats: List[SecurityThreat], risk_score: float, auth_level: AuthLevel) -> List[str]: - """Generate security recommendations based on analysis""" - recommendations = [] - - if risk_score >= settings.API_SECURITY_RISK_THRESHOLD: - recommendations.append("CRITICAL: Block this request immediately") - elif risk_score >= settings.API_SECURITY_WARNING_THRESHOLD: - recommendations.append("HIGH: Consider blocking or rate limiting this IP") - elif risk_score > 0.4: - recommendations.append("MEDIUM: Monitor this IP closely") - - threat_types = {threat.threat_type for threat in threats} - - if "sql_injection" in threat_types: - recommendations.append("Implement parameterized queries and input validation") - - if "xss" in threat_types: - recommendations.append("Implement Content Security Policy (CSP) headers") - - if "command_injection" in threat_types: - recommendations.append("Disable shell execution and validate all inputs") - - if "path_traversal" in threat_types: - recommendations.append("Implement proper file path validation and access controls") - - if "rate_limit_exceeded" in threat_types: - recommendations.append(f"Rate limiting active for {auth_level.value} user") - - if not recommendations: - recommendations.append("No immediate action required, continue monitoring") - - return recommendations - - def _update_stats(self, threats: List[SecurityThreat], analysis_time: float): - """Update service statistics""" - self.stats['total_requests_analyzed'] += 1 - self.stats['total_analysis_time'] += analysis_time - - if threats: - self.stats['threats_detected'] += len(threats) - for threat in threats: - self.stats['threat_types'][threat.threat_type] += 1 - self.stats['threat_levels'][threat.level.value] += 1 - if threat.source_ip: - self.stats['attacking_ips'][threat.source_ip] += 1 - - def get_stats(self) -> Dict[str, Any]: - """Get service statistics""" - avg_time = (self.stats['total_analysis_time'] / self.stats['total_requests_analyzed'] - if self.stats['total_requests_analyzed'] > 0 else 0) - - # Get top attacking IPs - top_ips = sorted(self.stats['attacking_ips'].items(), key=lambda x: x[1], reverse=True)[:10] - - 
diff --git a/backend/app/main.py b/backend/app/main.py
index 8bea827..8c8b26d 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -52,10 +52,18 @@ async def lifespan(app: FastAPI):
 
     # Initialize config manager
     await init_config_manager()
-
+
+    # Initialize LLM service (needed by RAG module)
+    from app.services.llm.service import llm_service
+    try:
+        await llm_service.initialize()
+        logger.info("LLM service initialized successfully")
+    except Exception as e:
+        logger.warning(f"LLM service initialization failed: {e}")
+
     # Initialize analytics service
     init_analytics_service()
-
+
     # Initialize module manager with FastAPI app for router registration
     await module_manager.initialize(app)
     app.state.module_manager = module_manager
diff --git a/backend/app/middleware/rate_limiting.py b/backend/app/middleware/rate_limiting.py
deleted file mode 100644
index f6e1901..0000000
--- a/backend/app/middleware/rate_limiting.py
+++ /dev/null
@@ -1,371 +0,0 @@
-"""
-Rate limiting middleware
-"""
-
-import time
-import redis
-from typing import Dict, Optional
-from fastapi import Request, HTTPException, status
-from fastapi.responses import JSONResponse
-from starlette.middleware.base import BaseHTTPMiddleware
-import asyncio
-from datetime import datetime, timedelta
-
-from app.core.config import settings
-from app.core.logging import get_logger
-
-logger = get_logger(__name__)
-
-
-class RateLimiter:
-    """Rate limiting implementation using Redis"""
-
-    def __init__(self):
-        try:
-            self.redis_client = redis.from_url(settings.REDIS_URL, decode_responses=True)
-            self.redis_client.ping()  # Test connection
-            logger.info("Rate limiter initialized with Redis backend")
-        except Exception as e:
-            logger.warning(f"Redis not available for rate limiting: {e}")
-            self.redis_client = None
-            # Fall back to in-memory rate limiting
-            self.memory_store: Dict[str, Dict[str, float]] = {}
-
-    async def check_rate_limit(
-        self,
-        key: str,
-        limit: int,
-        window_seconds: int,
-        identifier: str = "default"
-    ) -> tuple[bool, Dict[str, int]]:
-        """
-        Check if request is within rate limit
-
-        Args:
-            key: Rate limiting key (e.g., IP address, API key)
-            limit: Maximum number of requests allowed
-            window_seconds: Time window in seconds
-            identifier: Additional identifier for the rate limit
-
-        Returns:
-            Tuple of (is_allowed, headers_dict)
-        """
-
-        full_key = f"rate_limit:{identifier}:{key}"
-        current_time = int(time.time())
-        window_start = current_time - window_seconds
-
-        if self.redis_client:
-            return await self._check_redis_rate_limit(
-                full_key, limit, window_seconds, current_time, window_start
-            )
-        else:
-            return self._check_memory_rate_limit(
-                full_key, limit, window_seconds, current_time, window_start
-            )
-
-    async def _check_redis_rate_limit(
-        self,
-        key: str,
-        limit: int,
-        window_seconds: int,
-        current_time: int,
-        window_start: int
-    ) -> tuple[bool, Dict[str, int]]:
-        """Check rate limit using Redis"""
-
-        pipe = self.redis_client.pipeline()
-
-        # Remove old entries
-        pipe.zremrangebyscore(key, 0, window_start)
-
-        # Count current requests in window
-        pipe.zcard(key)
-
-        # Add current request
-        pipe.zadd(key, {str(current_time): current_time})
-
-        # Set expiration
-        pipe.expire(key, window_seconds + 1)
-
-        results = pipe.execute()
-        current_requests = results[1]
-
-        # Calculate remaining requests and reset time
-        remaining = max(0, limit - current_requests - 1)
-        reset_time = current_time + window_seconds
-
-        headers = {
-            "X-RateLimit-Limit": limit,
-            "X-RateLimit-Remaining": remaining,
-            "X-RateLimit-Reset": reset_time,
-            "X-RateLimit-Window": window_seconds
-        }
-
-        is_allowed = current_requests < limit
-
-        if not is_allowed:
-            logger.warning(f"Rate limit exceeded for key: {key}")
-
-        return is_allowed, headers
-
-    def _check_memory_rate_limit(
-        self,
-        key: str,
-        limit: int,
-        window_seconds: int,
-        current_time: int,
-        window_start: int
-    ) -> tuple[bool, Dict[str, int]]:
-        """Check rate limit using in-memory storage"""
-
-        if key not in self.memory_store:
-            self.memory_store[key] = {}
-
-        # Clean old entries
-        store = self.memory_store[key]
-        keys_to_remove = [k for k, v in store.items() if v < window_start]
-        for k in keys_to_remove:
-            del store[k]
-
-        current_requests = len(store)
-
-        # Calculate remaining requests and reset time
-        remaining = max(0, limit - current_requests - 1)
-        reset_time = current_time + window_seconds
-
-        headers = {
-            "X-RateLimit-Limit": limit,
-            "X-RateLimit-Remaining": remaining,
-            "X-RateLimit-Reset": reset_time,
-            "X-RateLimit-Window": window_seconds
-        }
-
-        is_allowed = current_requests < limit
-
-        if is_allowed:
-            # Add current request
-            store[str(current_time)] = current_time
-        else:
-            logger.warning(f"Rate limit exceeded for key: {key}")
-
-        return is_allowed, headers
-
-
-# Global rate limiter instance
-rate_limiter = RateLimiter()
-
-
-class RateLimitMiddleware(BaseHTTPMiddleware):
-    """Rate limiting middleware for FastAPI"""
-
-    def __init__(self, app):
-        super().__init__(app)
-        self.rate_limiter = RateLimiter()
-        logger.info("RateLimitMiddleware initialized")
-
-    async def dispatch(self, request: Request, call_next):
-        """Process request through rate limiting"""
-
-        # Skip rate limiting if disabled in settings
-        if not settings.API_RATE_LIMITING_ENABLED:
-            response = await call_next(request)
-            return response
-
-        # Skip rate limiting for all internal API endpoints (platform operations)
-        if request.url.path.startswith("/api-internal/v1/"):
-            response = await call_next(request)
-            return response
-
-        # Only apply rate limiting to privatemode.ai proxy endpoints (OpenAI-compatible API and LLM service)
-        # Skip for all other endpoints
-        if not (request.url.path.startswith("/api/v1/chat/completions") or
-                request.url.path.startswith("/api/v1/embeddings") or
-                request.url.path.startswith("/api/v1/models") or
-                request.url.path.startswith("/api/v1/llm/")):
-            response = await call_next(request)
-            return response
-
-        # Skip rate limiting for health checks and static files
-        if request.url.path in ["/health", "/", "/api/v1/docs", "/api/v1/openapi.json"]:
-            response = await call_next(request)
-            return response
-
-        # Get client IP
-        client_ip = request.client.host
-        forwarded_for = request.headers.get("X-Forwarded-For")
-        if forwarded_for:
-            client_ip = forwarded_for.split(",")[0].strip()
-
-        # Check for API key in headers
-        api_key = None
-        auth_header = request.headers.get("Authorization")
-        if auth_header and auth_header.startswith("Bearer "):
-            api_key = auth_header[7:]
-        elif request.headers.get("X-API-Key"):
-            api_key = request.headers.get("X-API-Key")
-
-        # Determine rate limiting strategy
-        headers = {}
-        is_allowed = True
-
-        if api_key:
-            # API key-based rate limiting
-            api_key_key = f"api_key:{api_key}"
-
-            # First check organization-wide limits (PrivateMode limits are org-wide)
-            org_key = "organization:privatemode"
-
-            # Check organization per-minute limit
-            org_allowed_minute, org_headers_minute = await self.rate_limiter.check_rate_limit(
-                org_key, settings.PRIVATEMODE_REQUESTS_PER_MINUTE, 60, "minute"
-            )
-
-            # Check organization per-hour limit
-            org_allowed_hour, org_headers_hour = await self.rate_limiter.check_rate_limit(
-                org_key, settings.PRIVATEMODE_REQUESTS_PER_HOUR, 3600, "hour"
-            )
-
-            # If organization limits are exceeded, return 429
-            if not (org_allowed_minute and org_allowed_hour):
-                logger.warning(f"Organization rate limit exceeded for {org_key}")
-                return JSONResponse(
-                    status_code=status.HTTP_429_TOO_MANY_REQUESTS,
-                    content={"detail": "Organization rate limit exceeded"},
-                    headers=org_headers_minute
-                )
-
-            # Then check per-API key limits
-            limit_per_minute = settings.API_RATE_LIMIT_API_KEY_PER_MINUTE
-            limit_per_hour = settings.API_RATE_LIMIT_API_KEY_PER_HOUR
-
-            # Check per-minute limit
-            is_allowed_minute, headers_minute = await self.rate_limiter.check_rate_limit(
-                api_key_key, limit_per_minute, 60, "minute"
-            )
-
-            # Check per-hour limit
-            is_allowed_hour, headers_hour = await self.rate_limiter.check_rate_limit(
-                api_key_key, limit_per_hour, 3600, "hour"
-            )
-
-            is_allowed = is_allowed_minute and is_allowed_hour
-            headers = headers_minute  # Use minute headers for response
-
-        else:
-            # IP-based rate limiting for unauthenticated requests
-            rate_limit_key = f"ip:{client_ip}"
-
-            # More restrictive limits for unauthenticated requests
-            limit_per_minute = 20  # Hardcoded for unauthenticated users
-            limit_per_hour = 100
-
-            # Check per-minute limit
-            is_allowed_minute, headers_minute = await self.rate_limiter.check_rate_limit(
-                rate_limit_key, limit_per_minute, 60, "minute"
-            )
-
-            # Check per-hour limit
-            is_allowed_hour, headers_hour = await self.rate_limiter.check_rate_limit(
-                rate_limit_key, limit_per_hour, 3600, "hour"
-            )
-
-            is_allowed = is_allowed_minute and is_allowed_hour
-            headers = headers_minute  # Use minute headers for response
-
-        # If rate limit exceeded, return 429
-        if not is_allowed:
-            return JSONResponse(
-                status_code=status.HTTP_429_TOO_MANY_REQUESTS,
-                content={
-                    "error": "RATE_LIMIT_EXCEEDED",
-                    "message": "Rate limit exceeded. Please try again later.",
-                    "details": {
-                        "limit": headers["X-RateLimit-Limit"],
-                        "reset_time": headers["X-RateLimit-Reset"]
-                    }
-                },
-                headers={k: str(v) for k, v in headers.items()}
-            )
-
-        # Continue with request
-        response = await call_next(request)
-
-        # Add rate limit headers to response
-        for key, value in headers.items():
-            response.headers[key] = str(value)
-
-        return response
-
-
-# Keep the old function for backward compatibility
-async def rate_limit_middleware(request: Request, call_next):
-    """Legacy function - use RateLimitMiddleware class instead"""
-    middleware = RateLimitMiddleware(None)
-    return await middleware.dispatch(request, call_next)
-
-
-class RateLimitExceeded(HTTPException):
-    """Exception raised when rate limit is exceeded"""
-
-    def __init__(self, limit: int, reset_time: int):
-        super().__init__(
-            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
-            detail=f"Rate limit exceeded. Limit: {limit}, Reset: {reset_time}"
-        )
-
-
-# Decorator for applying rate limits to specific endpoints
-def rate_limit(requests_per_minute: int = 60, requests_per_hour: int = 1000):
-    """
-    Decorator to apply rate limiting to specific endpoints
-
-    Args:
-        requests_per_minute: Maximum requests per minute
-        requests_per_hour: Maximum requests per hour
-    """
-    def decorator(func):
-        async def wrapper(*args, **kwargs):
-            # This would be implemented to work with FastAPI dependencies
-            # For now, this is a placeholder for endpoint-specific rate limiting
-            return await func(*args, **kwargs)
-        return wrapper
-    return decorator
-
-
-# Helper functions for different rate limiting strategies
-async def check_api_key_rate_limit(api_key: str, endpoint: str) -> bool:
-    """Check rate limit for specific API key and endpoint"""
-
-    # This would lookup API key specific limits from database
-    # For now, using default limits
-    key = f"api_key:{api_key}:endpoint:{endpoint}"
-
-    is_allowed, _ = await rate_limiter.check_rate_limit(
-        key, limit=100, window_seconds=60, identifier="endpoint"
-    )
-
-    return is_allowed
-
-
-async def check_user_rate_limit(user_id: str, action: str) -> bool:
-    """Check rate limit for specific user and action"""
-
-    key = f"user:{user_id}:action:{action}"
-
-    is_allowed, _ = await rate_limiter.check_rate_limit(
-        key, limit=50, window_seconds=60, identifier="user_action"
-    )
-
-    return is_allowed
-
-
-async def apply_burst_protection(key: str) -> bool:
-    """Apply burst protection for high-frequency actions"""
-
-    # Allow burst of 10 requests in 10 seconds
-    is_allowed, _ = await rate_limiter.check_rate_limit(
-        key, limit=10, window_seconds=10, identifier="burst"
-    )
-
-    return is_allowed
\ No newline at end of file
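The Redis path of the deleted RateLimiter is a standard sorted-set sliding window. Reduced to its core, and assuming a plain synchronous redis-py client rather than the class wrapper above, the idea is:

    import time
    import redis

    r = redis.Redis(decode_responses=True)

    def allow(key: str, limit: int, window_s: int) -> bool:
        now = int(time.time())
        pipe = r.pipeline()
        pipe.zremrangebyscore(key, 0, now - window_s)  # drop entries older than the window
        pipe.zcard(key)                                # count requests still in the window
        pipe.zadd(key, {str(now): now})                # record this request
        pipe.expire(key, window_s + 1)                 # let idle keys expire on their own
        current = pipe.execute()[1]
        return current < limit

Note that keying members by whole-second timestamps collapses same-second requests into one sorted-set entry, so bursts are undercounted; the deleted implementation shares this behavior.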
diff --git a/backend/app/middleware/security.py b/backend/app/middleware/security.py
deleted file mode 100644
index c7b7952..0000000
--- a/backend/app/middleware/security.py
+++ /dev/null
@@ -1,210 +0,0 @@
-"""
-Security middleware for request/response processing
-"""
-
-import json
-import time
-from typing import Callable, Optional, Dict, Any
-
-from fastapi import Request, Response
-from fastapi.responses import JSONResponse
-from starlette.middleware.base import BaseHTTPMiddleware
-
-from app.core.config import settings
-from app.core.logging import get_logger
-from app.core.threat_detection import threat_detection_service, SecurityAnalysis
-
-logger = get_logger(__name__)
-
-
-class SecurityMiddleware(BaseHTTPMiddleware):
-    """Security middleware for threat detection and request filtering - DISABLED"""
-
-    def __init__(self, app, enabled: bool = True):
-        super().__init__(app)
-        self.enabled = False  # Force disable regardless of settings
-        logger.info("SecurityMiddleware initialized, enabled: False (DISABLED)")
-
-    async def dispatch(self, request: Request, call_next: Callable) -> Response:
-        """Process request through security analysis - DISABLED"""
-        # Security disabled, always pass through
-        return await call_next(request)
-
-    def _should_skip_security(self, request: Request) -> bool:
-        """Determine if security analysis should be skipped for this request"""
-        path = request.url.path
-
-        # Skip for health checks, authentication endpoints, and static assets
-        skip_paths = [
-            "/health",
-            "/metrics",
-            "/api/v1/docs",
-            "/api/v1/openapi.json",
-            "/api/v1/redoc",
-            "/favicon.ico",
-            "/api/v1/auth/register",
-            "/api/v1/auth/login",
-            "/api/v1/auth/refresh",  # Allow refresh endpoint
-            "/api-internal/v1/auth/register",
-            "/api-internal/v1/auth/login",
-            "/api-internal/v1/auth/refresh",  # Allow refresh endpoint for internal API
-            "/",  # Root endpoint
-        ]
-
-        # Skip for static file extensions
-        static_extensions = [".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".ico", ".svg", ".woff", ".woff2"]
-
-        return (
-            path in skip_paths or
-            any(path.endswith(ext) for ext in static_extensions) or
-            path.startswith("/static/")
-        )
-
-    def _has_valid_auth(self, request: Request) -> bool:
-        """Check if request has valid authentication"""
-        # Check Authorization header
-        auth_header = request.headers.get("Authorization", "")
-        api_key_header = request.headers.get("X-API-Key", "")
-
-        # Has some form of auth token/key
-        return (
-            auth_header.startswith("Bearer ") and len(auth_header) > 7 or
-            len(api_key_header.strip()) > 0
-        )
-
-    def _create_block_response(self, analysis: SecurityAnalysis) -> JSONResponse:
-        """Create response for blocked requests"""
-        # Determine status code based on threat type
-        status_code = 403  # Forbidden by default
-
-        # Critical threats get 403
-        for threat in analysis.threats:
-            if threat.threat_type in ["command_injection", "sql_injection"]:
-                status_code = 403
-                break
-
-        response_data = {
-            "error": "Security Policy Violation",
-            "message": "Request blocked due to security policy violation",
-            "risk_score": round(analysis.risk_score, 3),
-            "auth_level": analysis.auth_level.value,
-            "threat_count": len(analysis.threats),
-            "recommendations": analysis.recommendations[:3]  # Limit to first 3 recommendations
-        }
-
-        response = JSONResponse(
-            content=response_data,
-            status_code=status_code
-        )
-
-        return response
-
-    def _add_security_headers(self, response: Response) -> Response:
-        """Add security headers to response"""
-        if not settings.API_SECURITY_HEADERS_ENABLED:
-            return response
-
-        # Standard security headers
-        response.headers["X-Content-Type-Options"] = "nosniff"
-        response.headers["X-Frame-Options"] = "DENY"
-        response.headers["X-XSS-Protection"] = "1; mode=block"
-        response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
-
-        # Only add HSTS for HTTPS
-        if hasattr(response, 'headers') and response.headers.get("X-Forwarded-Proto") == "https":
-            response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
-
-        # Content Security Policy
-        if settings.API_CSP_HEADER:
-            response.headers["Content-Security-Policy"] = settings.API_CSP_HEADER
-
-        return response
-
-    def _add_security_metrics(self, response: Response, analysis: SecurityAnalysis, analysis_time: float) -> Response:
-        """Add security metrics to response headers (for debugging/monitoring)"""
-        # Only add in debug mode or for admin users
-        if settings.APP_DEBUG:
-            response.headers["X-Security-Risk-Score"] = str(round(analysis.risk_score, 3))
-            response.headers["X-Security-Threats"] = str(len(analysis.threats))
-            response.headers["X-Security-Auth-Level"] = analysis.auth_level.value
-            response.headers["X-Security-Analysis-Time"] = f"{analysis_time*1000:.1f}ms"
-
-        return response
-
-    async def _log_security_event(self, request: Request, analysis: SecurityAnalysis):
-        """Log security events for audit and monitoring"""
-        client_ip = request.client.host if request.client else "unknown"
-        user_agent = request.headers.get("user-agent", "")
-
-        # Create security event log
-        event_data = {
-            "timestamp": analysis.timestamp.isoformat(),
-            "client_ip": client_ip,
-            "user_agent": user_agent,
-            "path": str(request.url.path),
-            "method": request.method,
-            "risk_score": round(analysis.risk_score, 3),
-            "auth_level": analysis.auth_level.value,
-            "threat_count": len(analysis.threats),
-            "rate_limit_exceeded": analysis.rate_limit_exceeded,
-            "should_block": analysis.should_block,
-            "threats": [
-                {
-                    "type": threat.threat_type,
-                    "level": threat.level.value,
-                    "confidence": round(threat.confidence, 3),
-                    "description": threat.description
-                }
-                for threat in analysis.threats[:5]  # Limit to first 5 threats
-            ],
-            "recommendations": analysis.recommendations
-        }
-
-        # Log at appropriate level based on risk
-        if analysis.should_block:
-            logger.warning(f"SECURITY_BLOCK: {json.dumps(event_data)}")
-        elif analysis.risk_score >= settings.API_SECURITY_WARNING_THRESHOLD:
-            logger.warning(f"SECURITY_WARNING: {json.dumps(event_data)}")
-        else:
-            logger.info(f"SECURITY_THREAT: {json.dumps(event_data)}")
-
-
-def setup_security_middleware(app, enabled: bool = True) -> None:
-    """Setup security middleware on FastAPI app"""
-    if enabled and settings.API_SECURITY_ENABLED:
-        app.add_middleware(SecurityMiddleware, enabled=enabled)
-        logger.info("Security middleware enabled")
-    else:
-        logger.info("Security middleware disabled")
-
-
-# Helper functions for manual security checks
-async def analyze_request_security(request: Request, user_context: Optional[Dict] = None) -> SecurityAnalysis:
-    """Manually analyze request security (for use in route handlers)"""
-    return await threat_detection_service.analyze_request(request, user_context)
-
-
-def get_security_stats() -> Dict[str, Any]:
-    """Get security statistics"""
-    return threat_detection_service.get_stats()
-
-
-def is_request_blocked(request: Request) -> bool:
-    """Check if request was blocked by security analysis"""
-    if hasattr(request.state, 'security_analysis'):
-        return request.state.security_analysis.should_block
-    return False
-
-
-def get_request_risk_score(request: Request) -> float:
-    """Get risk score for request"""
-    if hasattr(request.state, 'security_analysis'):
-        return request.state.security_analysis.risk_score
-    return 0.0
-
-
-def get_request_auth_level(request: Request) -> str:
-    """Get authentication level for request"""
-    if hasattr(request.state, 'security_analysis'):
-        return request.state.security_analysis.auth_level.value
-    return "unknown"
\ No newline at end of file
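The deleted SecurityMiddleware also carried the response-header hardening in _add_security_headers. If only that part were ever needed again, an equivalent minimal middleware would look like this (a sketch; the CSP and HSTS handling driven by settings is omitted):

    from fastapi import FastAPI

    app = FastAPI()

    @app.middleware("http")
    async def security_headers(request, call_next):
        response = await call_next(request)
        # Same static header set the deleted middleware applied
        response.headers["X-Content-Type-Options"] = "nosniff"
        response.headers["X-Frame-Options"] = "DENY"
        response.headers["X-XSS-Protection"] = "1; mode=block"
        response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
        return response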
diff --git a/backend/app/services/document_processor.py b/backend/app/services/document_processor.py
index 8447333..8875ae8 100644
--- a/backend/app/services/document_processor.py
+++ b/backend/app/services/document_processor.py
@@ -162,6 +162,7 @@ class DocumentProcessor:
 
     async def _process_document(self, task: ProcessingTask) -> bool:
         """Process a single document"""
+        from datetime import datetime
         from app.db.database import async_session_factory
 
         async with async_session_factory() as session:
             try:
@@ -182,16 +183,24 @@ class DocumentProcessor:
                 document.status = ProcessingStatus.PROCESSING
                 await session.commit()
 
-                # Get RAG module for processing (now includes content processing)
+                # Get RAG module for processing
                 try:
-                    from app.services.module_manager import module_manager
-                    rag_module = module_manager.get_module('rag')
+                    # Import RAG module and initialize it properly
+                    from modules.rag.main import RAGModule
+                    from app.core.config import settings
+
+                    # Create and initialize RAG module instance
+                    rag_module = RAGModule(settings)
+                    init_result = await rag_module.initialize()
+                    if not rag_module.enabled:
+                        raise Exception("Failed to enable RAG module")
+
                 except Exception as e:
                     logger.error(f"Failed to get RAG module: {e}")
                     raise Exception(f"RAG module not available: {e}")
-
-                if not rag_module:
-                    raise Exception("RAG module not available")
+
+                if not rag_module or not rag_module.enabled:
+                    raise Exception("RAG module not available or not enabled")
 
                 logger.info(f"RAG module loaded successfully for document {task.document_id}")
@@ -204,31 +213,45 @@ class DocumentProcessor:
 
                 # Process with RAG module
                 logger.info(f"Starting document processing for document {task.document_id} with RAG module")
-                try:
-                    # Add timeout to prevent hanging
-                    processed_doc = await asyncio.wait_for(
-                        rag_module.process_document(
-                            file_content,
-                            document.original_filename,
-                            {}
-                        ),
-                        timeout=300.0  # 5 minute timeout
-                    )
-                    logger.info(f"Document processing completed for document {task.document_id}")
-                except asyncio.TimeoutError:
-                    logger.error(f"Document processing timed out for document {task.document_id}")
-                    raise Exception("Document processing timed out after 5 minutes")
-                except Exception as e:
-                    logger.error(f"Document processing failed for document {task.document_id}: {e}")
-                    raise
-
-                # Update document with processed content
-                document.converted_content = processed_doc.content
-                document.word_count = processed_doc.word_count
-                document.character_count = len(processed_doc.content)
-                document.document_metadata = processed_doc.metadata
-                document.status = ProcessingStatus.PROCESSED
-                document.processed_at = datetime.utcnow()
+
+                # Special handling for JSONL files - skip processing phase
+                if document.file_type == 'jsonl':
+                    # For JSONL files, we don't need to process content here
+                    # The optimized JSONL processor will handle everything during indexing
+                    document.converted_content = f"JSONL file with {len(file_content)} bytes"
+                    document.word_count = 0  # Will be updated during indexing
+                    document.character_count = len(file_content)
+                    document.document_metadata = {"file_path": document.file_path, "processed": "jsonl"}
+                    document.status = ProcessingStatus.PROCESSED
+                    document.processed_at = datetime.utcnow()
+                    logger.info(f"JSONL document {task.document_id} marked for optimized processing")
+                else:
+                    # Standard processing for other file types
+                    try:
+                        # Add timeout to prevent hanging
+                        processed_doc = await asyncio.wait_for(
+                            rag_module.process_document(
+                                file_content,
+                                document.original_filename,
+                                {"file_path": document.file_path}
+                            ),
+                            timeout=300.0  # 5 minute timeout
+                        )
+                        logger.info(f"Document processing completed for document {task.document_id}")
+
+                        # Update document with processed content
+                        document.converted_content = processed_doc.content
+                        document.word_count = processed_doc.word_count
+                        document.character_count = len(processed_doc.content)
+                        document.document_metadata = processed_doc.metadata
+                        document.status = ProcessingStatus.PROCESSED
+                        document.processed_at = datetime.utcnow()
+                    except asyncio.TimeoutError:
+                        logger.error(f"Document processing timed out for document {task.document_id}")
+                        raise Exception("Document processing timed out after 5 minutes")
+                    except Exception as e:
+                        logger.error(f"Document processing failed for document {task.document_id}: {e}")
+                        raise
 
                 # Index in RAG system using same RAG module
                 if rag_module and document.converted_content:
@@ -245,14 +268,57 @@ class DocumentProcessor:
                     }
 
                     # Use the correct Qdrant collection name for this document
-                    await asyncio.wait_for(
-                        rag_module.index_document(
-                            content=document.converted_content,
-                            metadata=doc_metadata,
-                            collection_name=document.collection.qdrant_collection_name
-                        ),
-                        timeout=120.0  # 2 minute timeout for indexing
-                    )
+                    # For JSONL files, we need to use the processed document flow
+                    if document.file_type == 'jsonl':
+                        # Create a ProcessedDocument for the JSONL processor
+                        from app.modules.rag.main import ProcessedDocument
+                        from datetime import datetime
+                        import hashlib
+
+                        # Calculate file hash
+                        processed_at = datetime.utcnow()
+                        file_hash = hashlib.md5(str(document.id).encode()).hexdigest()
+
+                        processed_doc = ProcessedDocument(
+                            id=str(document.id),
+                            content="",  # Will be filled by JSONL processor
+                            extracted_text="",  # Will be filled by JSONL processor
+                            metadata={
+                                **doc_metadata,
+                                "file_path": document.file_path
+                            },
+                            original_filename=document.original_filename,
+                            file_type=document.file_type,
+                            mime_type=document.mime_type,
+                            language=document.document_metadata.get('language', 'EN'),
+                            word_count=0,  # Will be updated during processing
+                            sentence_count=0,  # Will be updated during processing
+                            entities=[],
+                            keywords=[],
+                            processing_time=0.0,
+                            processed_at=processed_at,
+                            file_hash=file_hash,
+                            file_size=document.file_size
+                        )
+
+                        # The JSONL processor will read the original file
+                        await asyncio.wait_for(
+                            rag_module.index_processed_document(
+                                processed_doc=processed_doc,
+                                collection_name=document.collection.qdrant_collection_name
+                            ),
+                            timeout=300.0  # 5 minute timeout for JSONL processing
+                        )
+                    else:
+                        # Use standard indexing for other file types
+                        await asyncio.wait_for(
+                            rag_module.index_document(
+                                content=document.converted_content,
+                                metadata=doc_metadata,
+                                collection_name=document.collection.qdrant_collection_name
+                            ),
+                            timeout=120.0  # 2 minute timeout for indexing
+                        )
 
                     logger.info(f"Document {task.document_id} indexed successfully in collection {document.collection.qdrant_collection_name}")
@@ -271,7 +337,9 @@ class DocumentProcessor:
 
                 except Exception as e:
                     logger.error(f"Failed to index document {task.document_id} in RAG: {e}")
-                    # Keep as processed even if indexing fails
+                    # Mark as error since indexing failed
+                    document.status = ProcessingStatus.ERROR
+                    document.processing_error = f"Indexing failed: {str(e)}"
                     # Don't raise the exception to avoid retries on indexing failures
 
                 await session.commit()
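The JSONL branch added above amounts to a type switch in front of the expensive conversion step. A condensed sketch of that control flow (hypothetical process() helper returning a plain dict; the real code mutates the SQLAlchemy document row and commits):

    import asyncio

    async def process(document, file_content: bytes, rag_module):
        if document.file_type == "jsonl":
            # Defer real parsing to the optimized JSONL indexer; store placeholders only.
            return {
                "converted_content": f"JSONL file with {len(file_content)} bytes",
                "word_count": 0,
                "character_count": len(file_content),
            }
        # Everything else goes through the normal converter, bounded by a hard timeout.
        processed = await asyncio.wait_for(
            rag_module.process_document(
                file_content, document.original_filename, {"file_path": document.file_path}
            ),
            timeout=300.0,
        )
        return {
            "converted_content": processed.content,
            "word_count": processed.word_count,
            "character_count": len(processed.content),
        }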
diff --git a/backend/app/services/embedding_service.py b/backend/app/services/embedding_service.py
index 4032086..ab7e04f 100644
--- a/backend/app/services/embedding_service.py
+++ b/backend/app/services/embedding_service.py
@@ -28,9 +28,19 @@ class EmbeddingService:
             await llm_service.initialize()
 
             # Test LLM service health
-            health_summary = llm_service.get_health_summary()
-            if health_summary.get("service_status") != "healthy":
-                logger.error(f"LLM service unhealthy: {health_summary}")
+            if not llm_service._initialized:
+                logger.error("LLM service not initialized")
+                return False
+
+            # Check if PrivateMode provider is available
+            try:
+                provider_status = await llm_service.get_provider_status()
+                privatemode_status = provider_status.get("privatemode")
+                if not privatemode_status or privatemode_status.status != "healthy":
+                    logger.error(f"PrivateMode provider not available: {privatemode_status}")
+                    return False
+            except Exception as e:
+                logger.error(f"Failed to check provider status: {e}")
                 return False
 
             self.initialized = True
@@ -75,6 +85,12 @@ class EmbeddingService:
                     else:
                         truncated_text = text
 
+                    # Guard: skip empty inputs (validator rejects empty strings)
+                    if not truncated_text.strip():
+                        logger.debug("Empty input for embedding; using fallback vector")
+                        batch_embeddings.append(self._generate_fallback_embedding(text))
+                        continue
+
                     # Call LLM service embedding endpoint
                     from app.services.llm.service import llm_service
                     from app.services.llm.models import EmbeddingRequest
@@ -163,4 +179,4 @@ class EmbeddingService:
 
 
 # Global embedding service instance
-embedding_service = EmbeddingService()
\ No newline at end of file
+embedding_service = EmbeddingService()
diff --git a/backend/app/services/enhanced_embedding_service.py b/backend/app/services/enhanced_embedding_service.py
index 284773f..cc66e42 100644
--- a/backend/app/services/enhanced_embedding_service.py
+++ b/backend/app/services/enhanced_embedding_service.py
@@ -25,9 +25,10 @@ class EnhancedEmbeddingService(EmbeddingService):
             'requests_count': 0,
             'window_start': time.time(),
             'window_size': 60,  # 1 minute window
-            'max_requests_per_minute': int(getattr(settings, 'RAG_EMBEDDING_MAX_REQUESTS_PER_MINUTE', 60)),  # Configurable
+            'max_requests_per_minute': int(getattr(settings, 'RAG_EMBEDDING_MAX_REQUESTS_PER_MINUTE', 12)),  # Configurable
             'retry_delays': [int(x) for x in getattr(settings, 'RAG_EMBEDDING_RETRY_DELAYS', '1,2,4,8,16').split(',')],  # Exponential backoff
-            'delay_between_batches': float(getattr(settings, 'RAG_EMBEDDING_DELAY_BETWEEN_BATCHES', 0.5)),
+            'delay_between_batches': float(getattr(settings, 'RAG_EMBEDDING_DELAY_BETWEEN_BATCHES', 1.0)),
+            'delay_per_request': float(getattr(settings, 'RAG_EMBEDDING_DELAY_PER_REQUEST', 0.5)),
             'last_rate_limit_error': None
         }
@@ -38,7 +39,7 @@ class EnhancedEmbeddingService(EmbeddingService):
         if max_retries is None:
             max_retries = int(getattr(settings, 'RAG_EMBEDDING_RETRY_COUNT', 3))
 
-        batch_size = int(getattr(settings, 'RAG_EMBEDDING_BATCH_SIZE', 5))
+        batch_size = int(getattr(settings, 'RAG_EMBEDDING_BATCH_SIZE', 3))
 
         if not self.initialized:
             logger.warning("Embedding service not initialized, using fallback")
@@ -76,9 +77,6 @@ class EnhancedEmbeddingService(EmbeddingService):
                 # Make the request
                 embeddings = await self._get_embeddings_batch_impl(texts)
 
-                # Update rate limit tracker on success
-                self._update_rate_limit_tracker(success=True)
-
                 return embeddings, True
 
             except Exception as e:
@@ -120,6 +118,12 @@ class EnhancedEmbeddingService(EmbeddingService):
 
         embeddings = []
         for text in texts:
+            # Respect rate limit before each request
+            while self._is_rate_limited():
+                delay = self._get_rate_limit_delay()
+                logger.warning(f"Rate limit window exceeded, waiting {delay:.2f}s before next request")
+                await asyncio.sleep(delay)
+
             # Truncate text if needed
             max_chars = 1600
             truncated_text = text[:max_chars] if len(text) > max_chars else text
@@ -142,8 +146,14 @@ class EnhancedEmbeddingService(EmbeddingService):
                         self._dimension_confirmed = True
                     else:
                         raise ValueError("Empty embedding in response")
-                else:
-                    raise ValueError("Invalid response structure")
+                else:
+                    raise ValueError("Invalid response structure")
+
+            # Count this successful request and optionally delay between requests
+            self._update_rate_limit_tracker(success=True)
+            per_req_delay = self.rate_limit_tracker.get('delay_per_request', 0)
+            if per_req_delay and per_req_delay > 0:
+                await asyncio.sleep(per_req_delay)
 
         return embeddings
 
@@ -198,4 +208,4 @@ class EnhancedEmbeddingService(EmbeddingService):
 
 
 # Global enhanced embedding service instance
-enhanced_embedding_service = EnhancedEmbeddingService()
\ No newline at end of file
+enhanced_embedding_service = EnhancedEmbeddingService()
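The retuned defaults above interact, so a quick sanity check of the throughput ceiling they imply is worth spelling out (pure arithmetic, ignoring network latency):

    max_requests_per_minute = 12   # hard window cap
    delay_per_request = 0.5        # seconds slept after each embedding call
    delay_between_batches = 1.0    # seconds slept after each batch
    batch_size = 3

    # Pacing alone allows roughly one request every 0.5s plus 1.0s per 3 requests:
    per_request_cost = delay_per_request + delay_between_batches / batch_size  # ~0.83s
    paced_rate = 60 / per_request_cost                                         # ~72/min
    effective_rate = min(paced_rate, max_requests_per_minute)                  # 12/min

So with these values the 12-requests-per-minute window cap, not the sleeps, is the binding limit.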
diff --git a/backend/app/services/llm/config.py b/backend/app/services/llm/config.py
index 61a8576..b7aeb13 100644
--- a/backend/app/services/llm/config.py
+++ b/backend/app/services/llm/config.py
@@ -16,6 +16,7 @@ from .models import ResilienceConfig
 class ProviderConfig(BaseModel):
     """Configuration for an LLM provider"""
     name: str = Field(..., description="Provider name")
+    provider_type: str = Field(..., description="Provider type (e.g., 'openai', 'privatemode')")
     enabled: bool = Field(True, description="Whether provider is enabled")
     base_url: str = Field(..., description="Provider base URL")
     api_key_env_var: str = Field(..., description="Environment variable for API key")
@@ -53,9 +54,6 @@ class LLMServiceConfig(BaseModel):
     enable_security_checks: bool = Field(True, description="Enable security validation")
     enable_metrics_collection: bool = Field(True, description="Enable metrics collection")
 
-    # Security settings
-    security_risk_threshold: float = Field(0.8, ge=0.0, le=1.0, description="Risk threshold for blocking")
-    security_warning_threshold: float = Field(0.6, ge=0.0, le=1.0, description="Risk threshold for warnings")
     max_prompt_length: int = Field(50000, ge=1000, description="Maximum prompt length")
     max_response_length: int = Field(32000, ge=1000, description="Maximum response length")
 
@@ -78,12 +76,6 @@ class LLMServiceConfig(BaseModel):
     # Model routing (model_name -> provider_name)
     model_routing: Dict[str, str] = Field(default_factory=dict, description="Model to provider routing")
 
-    @validator('security_risk_threshold')
-    def validate_risk_threshold(cls, v, values):
-        warning_threshold = values.get('security_warning_threshold', 0.6)
-        if v <= warning_threshold:
-            raise ValueError("Risk threshold must be greater than warning threshold")
-        return v
 
 def create_default_config() -> LLMServiceConfig:
@@ -93,6 +85,7 @@ def create_default_config() -> LLMServiceConfig:
 
     # Models will be fetched dynamically from proxy /models endpoint
     privatemode_config = ProviderConfig(
         name="privatemode",
+        provider_type="privatemode",
         enabled=True,
         base_url=settings.PRIVATEMODE_PROXY_URL,
         api_key_env_var="PRIVATEMODE_API_KEY",
@@ -119,9 +112,6 @@ def create_default_config() -> LLMServiceConfig:
     config = LLMServiceConfig(
         default_provider="privatemode",
         enable_detailed_logging=settings.LOG_LLM_PROMPTS,
-        enable_security_checks=settings.API_SECURITY_ENABLED,
-        security_risk_threshold=settings.API_SECURITY_RISK_THRESHOLD,
-        security_warning_threshold=settings.API_SECURITY_WARNING_THRESHOLD,
         providers={
             "privatemode": privatemode_config
         },
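With provider_type now a required field, any additional ProviderConfig entry has to declare it explicitly. A sketch of what a hypothetical second provider could look like (illustrative values only, not part of this patch, and assuming the remaining ProviderConfig fields keep their defaults):

    # Hypothetical example of registering an OpenAI-type provider
    openai_config = ProviderConfig(
        name="openai",
        provider_type="openai",
        enabled=False,
        base_url="https://api.openai.com/v1",
        api_key_env_var="OPENAI_API_KEY",
    )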
diff --git a/backend/app/services/llm/metrics.py b/backend/app/services/llm/metrics.py
index 542dd7d..9a35fc4 100644
--- a/backend/app/services/llm/metrics.py
+++ b/backend/app/services/llm/metrics.py
@@ -124,7 +124,6 @@ class MetricsCollector:
         total_requests = len(self._metrics)
         successful_requests = sum(1 for m in self._metrics if m.success)
         failed_requests = total_requests - successful_requests
-        security_blocked = sum(1 for m in self._metrics if not m.success and m.security_risk_score > 0.8)
 
         # Calculate averages
         latencies = [m.latency_ms for m in self._metrics if m.latency_ms > 0]
@@ -143,7 +142,6 @@ class MetricsCollector:
             total_requests=total_requests,
             successful_requests=successful_requests,
             failed_requests=failed_requests,
-            security_blocked_requests=security_blocked,
             average_latency_ms=avg_latency,
             average_risk_score=avg_risk_score,
             provider_metrics=provider_metrics,
diff --git a/backend/app/services/llm/models.py b/backend/app/services/llm/models.py
index 903451d..b699b2c 100644
--- a/backend/app/services/llm/models.py
+++ b/backend/app/services/llm/models.py
@@ -157,7 +157,6 @@ class LLMMetrics(BaseModel):
     total_requests: int = Field(0, description="Total requests processed")
     successful_requests: int = Field(0, description="Successful requests")
     failed_requests: int = Field(0, description="Failed requests")
-    security_blocked_requests: int = Field(0, description="Security blocked requests")
     average_latency_ms: float = Field(0.0, description="Average response latency")
     average_risk_score: float = Field(0.0, description="Average security risk score")
     provider_metrics: Dict[str, Dict[str, Any]] = Field(default_factory=dict, description="Per-provider metrics")
diff --git a/backend/app/services/llm/providers/privatemode.py b/backend/app/services/llm/providers/privatemode.py
index 63f18ad..b136ccb 100644
--- a/backend/app/services/llm/providers/privatemode.py
+++ b/backend/app/services/llm/providers/privatemode.py
@@ -452,6 +452,8 @@ class PrivateModeProvider(BaseLLMProvider):
 
                 else:
                     error_text = await response.text()
+                    # Log the detailed error response from the provider
+                    logger.error(f"PrivateMode embedding error - Status {response.status}: {error_text}")
                     self._handle_http_error(response.status, error_text, "embeddings")
 
         except aiohttp.ClientError as e:
-""" - -import os -import re -import json -import logging -import hashlib -from typing import Dict, Any, List, Optional, Tuple -from datetime import datetime - -from app.core.config import settings - -logger = logging.getLogger(__name__) - - -class SecurityManager: - """Manages security for LLM operations""" - - def __init__(self): - self._setup_prompt_injection_patterns() - - - def _setup_prompt_injection_patterns(self): - """Setup patterns for prompt injection detection""" - self.injection_patterns = [ - # Direct instruction injection - r"(?i)(ignore|forget|disregard|override).{0,20}(instructions|rules|prompts)", - r"(?i)(new|updated|different)\s+(instructions|rules|system)", - r"(?i)act\s+as\s+(if|though)\s+you\s+(are|were)", - r"(?i)pretend\s+(to\s+be|you\s+are)", - r"(?i)you\s+are\s+now\s+(a|an)\s+", - - # System role manipulation - r"(?i)system\s*:\s*", - r"(?i)\[system\]", - r"(?i)", - r"(?i)assistant\s*:\s*", - r"(?i)\[assistant\]", - - # Escape attempts - r"(?i)\\n\\n#+", - r"(?i)```\s*(system|assistant|user)", - r"(?i)---\s*(new|system|override)", - - # Role manipulation - r"(?i)(you|your)\s+(role|purpose|function)\s+(is|has\s+changed)", - r"(?i)switch\s+to\s+(admin|developer|debug)\s+mode", - r"(?i)(admin|root|sudo|developer)\s+(access|mode|privileges)", - - # Information extraction attempts - r"(?i)(show|display|reveal|expose)\s+(your|the)\s+(prompt|instructions|system)", - r"(?i)what\s+(are|were)\s+your\s+(original|initial)\s+(instructions|prompts)", - r"(?i)(debug|verbose|diagnostic)\s+mode", - - # Encoding/obfuscation attempts - r"(?i)base64\s*:", - r"(?i)hex\s*:", - r"(?i)unicode\s*:", - r"(?i)\b[A-Za-z0-9+/]{40,}={0,2}\b", # More specific base64 pattern (longer sequences) - - # SQL injection patterns (more specific to reduce false positives) - r"(?i)(union\s+select|select\s+\*|insert\s+into|update\s+\w+\s+set|delete\s+from|drop\s+table|create\s+table)\s", - r"(?i)(or|and)\s+\d+\s*=\s*\d+", - r"(?i)';?\s*(drop\s+table|delete\s+from|insert\s+into)", - - # Command injection patterns - r"(?i)(exec|eval|system|shell|cmd)\s*\(", - r"(?i)(\$\(|\`)[^)]+(\)|\`)", - r"(?i)&&\s*(rm|del|format)", - - # Jailbreak attempts - r"(?i)jailbreak", - r"(?i)break\s+out\s+of", - r"(?i)escape\s+(the|your)\s+(rules|constraints)", - r"(?i)(DAN|Do\s+Anything\s+Now)", - r"(?i)unrestricted\s+mode", - ] - - self.compiled_patterns = [re.compile(pattern) for pattern in self.injection_patterns] - logger.info(f"Initialized {len(self.injection_patterns)} prompt injection patterns") - - - def validate_prompt_security(self, messages: List[Dict[str, str]]) -> Tuple[bool, float, List[str]]: - """ - Validate messages for prompt injection attempts - - Returns: - Tuple[bool, float, List[str]]: (is_safe, risk_score, detected_patterns) - """ - detected_patterns = [] - total_risk = 0.0 - - # Check if this is a system/RAG request - is_system_request = self._is_system_request(messages) - - for message in messages: - content = message.get("content", "") - if not content: - continue - - # Check against injection patterns with context awareness - for i, pattern in enumerate(self.compiled_patterns): - matches = pattern.findall(content) - if matches: - # Apply context-aware risk calculation - pattern_risk = self._calculate_pattern_risk(i, matches, message.get("role", "user"), is_system_request) - total_risk += pattern_risk - detected_patterns.append({ - "pattern_index": i, - "pattern": self.injection_patterns[i], - "matches": matches, - "risk": pattern_risk - }) - - # Additional security checks with context awareness - 
total_risk += self._check_message_characteristics(content, message.get("role", "user"), is_system_request) - - # Normalize risk score (0.0 to 1.0) - risk_score = min(total_risk / len(messages) if messages else 0.0, 1.0) - # Never block - always return True for is_safe - is_safe = True - - if detected_patterns: - logger.info(f"Detected {len(detected_patterns)} potential injection patterns, risk score: {risk_score} (system_request: {is_system_request})") - - return is_safe, risk_score, detected_patterns - - def _calculate_pattern_risk(self, pattern_index: int, matches: List, role: str, is_system_request: bool) -> float: - """Calculate risk score for a detected pattern with context awareness""" - # Different patterns have different risk levels - high_risk_patterns = [0, 1, 2, 3, 4, 5, 6, 7, 22, 23, 24] # System manipulation, jailbreak - medium_risk_patterns = [8, 9, 10, 11, 12, 13, 17, 18, 19, 20, 21] # Escape attempts, info extraction - - # Base risk score - base_risk = 0.8 if pattern_index in high_risk_patterns else 0.5 if pattern_index in medium_risk_patterns else 0.3 - - # Apply context-specific risk reduction - if is_system_request or role == "system": - # Reduce risk for system messages and RAG content - if pattern_index in [14, 15, 16]: # Encoding patterns (base64, hex, unicode) - base_risk *= 0.2 # Reduce encoding risk by 80% for system content - elif pattern_index in [17, 18, 19]: # SQL patterns - base_risk *= 0.3 # Reduce SQL risk by 70% for system content - else: - base_risk *= 0.6 # Reduce other risks by 40% for system content - - # Increase risk based on number of matches, but cap it - match_multiplier = min(1.0 + (len(matches) - 1) * 0.1, 1.5) # Reduced multiplier - - return base_risk * match_multiplier - - def _check_message_characteristics(self, content: str, role: str, is_system_request: bool) -> float: - """Check message characteristics for additional risk factors with context awareness""" - risk = 0.0 - - # Excessive length (potential stuffing attack) - less restrictive for system content - length_threshold = 50000 if is_system_request else 10000 # Much higher threshold for system content - if len(content) > length_threshold: - risk += 0.1 if is_system_request else 0.3 - - # High ratio of special characters - more lenient for system content - special_chars = sum(1 for c in content if not c.isalnum() and not c.isspace()) - if len(content) > 0: - char_ratio = special_chars / len(content) - threshold = 0.8 if is_system_request else 0.5 - if char_ratio > threshold: - risk += 0.2 if is_system_request else 0.4 - - # Multiple encoding indicators - reduced risk for system content - encoding_indicators = ["base64", "hex", "unicode", "url", "ascii"] - found_encodings = sum(1 for indicator in encoding_indicators if indicator.lower() in content.lower()) - if found_encodings > 1: - risk += 0.1 if is_system_request else 0.3 - - # Excessive newlines or formatting - more lenient for system content - newline_threshold = 200 if is_system_request else 50 - if content.count('\n') > newline_threshold or content.count('\\n') > newline_threshold: - risk += 0.1 if is_system_request else 0.2 - - return risk - - def _is_system_request(self, messages: List[Dict[str, str]]) -> bool: - """Determine if this is a system/RAG request""" - if not messages: - return False - - # Check for system messages - for message in messages: - if message.get("role") == "system": - return True - - # Check message content for RAG indicators - for message in messages: - content = message.get("content", "") - if 
("document:" in content.lower() or - "context:" in content.lower() or - "source:" in content.lower() or - "retrieved:" in content.lower() or - "citation:" in content.lower() or - "reference:" in content.lower()): - return True - - return False - - def create_audit_log( - self, - user_id: str, - api_key_id: int, - provider: str, - model: str, - request_type: str, - risk_score: float, - detected_patterns: List[str], - metadata: Optional[Dict[str, Any]] = None - ) -> Dict[str, Any]: - """Create comprehensive audit log for LLM request""" - audit_entry = { - "timestamp": datetime.utcnow().isoformat(), - "user_id": user_id, - "api_key_id": api_key_id, - "provider": provider, - "model": model, - "request_type": request_type, - "security": { - "risk_score": risk_score, - "detected_patterns": detected_patterns, - "security_check_passed": risk_score < settings.API_SECURITY_RISK_THRESHOLD - }, - "metadata": metadata or {}, - "audit_hash": None # Will be set below - } - - # Create hash for audit integrity - audit_hash = self._create_audit_hash(audit_entry) - audit_entry["audit_hash"] = audit_hash - - # Log based on risk level (never block, only log) - if risk_score >= settings.API_SECURITY_RISK_THRESHOLD: - logger.warning(f"HIGH RISK LLM REQUEST DETECTED (NOT BLOCKED): {json.dumps(audit_entry)}") - elif risk_score >= settings.API_SECURITY_WARNING_THRESHOLD: - logger.info(f"MEDIUM RISK LLM REQUEST: {json.dumps(audit_entry)}") - else: - logger.info(f"LLM REQUEST AUDIT: user={user_id}, model={model}, risk={risk_score:.3f}") - - return audit_entry - - def _create_audit_hash(self, audit_entry: Dict[str, Any]) -> str: - """Create hash for audit trail integrity""" - # Create hash from key fields (excluding the hash itself) - hash_data = { - "timestamp": audit_entry["timestamp"], - "user_id": audit_entry["user_id"], - "api_key_id": audit_entry["api_key_id"], - "provider": audit_entry["provider"], - "model": audit_entry["model"], - "request_type": audit_entry["request_type"], - "risk_score": audit_entry["security"]["risk_score"] - } - - hash_string = json.dumps(hash_data, sort_keys=True) - return hashlib.sha256(hash_string.encode()).hexdigest() - - def log_detailed_request( - self, - messages: List[Dict[str, str]], - model: str, - user_id: str, - provider: str, - context_info: Optional[Dict[str, Any]] = None - ): - """Log detailed LLM request if LOG_LLM_PROMPTS is enabled""" - if not settings.LOG_LLM_PROMPTS: - return - - logger.info("=== DETAILED LLM REQUEST ===") - logger.info(f"Model: {model}") - logger.info(f"Provider: {provider}") - logger.info(f"User ID: {user_id}") - - if context_info: - for key, value in context_info.items(): - logger.info(f"{key}: {value}") - - logger.info("Messages to LLM:") - for i, message in enumerate(messages): - role = message.get("role", "unknown") - content = message.get("content", "")[:500] # Truncate for logging - logger.info(f" Message {i+1} [{role}]: {content}{'...' if len(message.get('content', '')) > 500 else ''}") - - logger.info("=== END DETAILED LLM REQUEST ===") - - def log_detailed_response( - self, - response_content: str, - token_usage: Optional[Dict[str, int]] = None, - provider: str = "unknown" - ): - """Log detailed LLM response if LOG_LLM_PROMPTS is enabled""" - if not settings.LOG_LLM_PROMPTS: - return - - logger.info("=== DETAILED LLM RESPONSE ===") - logger.info(f"Provider: {provider}") - logger.info(f"Response content: {response_content[:500]}{'...' 
if len(response_content) > 500 else ''}") - - if token_usage: - logger.info(f"Token usage - Prompt: {token_usage.get('prompt_tokens', 0)}, " - f"Completion: {token_usage.get('completion_tokens', 0)}, " - f"Total: {token_usage.get('total_tokens', 0)}") - - logger.info("=== END DETAILED LLM RESPONSE ===") - - -class SecurityError(Exception): - """Security-related errors in LLM operations""" - pass - - -# Global security manager instance -security_manager = SecurityManager() \ No newline at end of file diff --git a/backend/app/services/llm/service.py b/backend/app/services/llm/service.py index bb8e683..d3f2503 100644 --- a/backend/app/services/llm/service.py +++ b/backend/app/services/llm/service.py @@ -17,9 +17,8 @@ from .models import ( ) from .config import config_manager, ProviderConfig from ...core.config import settings -from .security import security_manager from .resilience import ResilienceManagerFactory -from .metrics import metrics_collector +# from .metrics import metrics_collector from .providers import BaseLLMProvider, PrivateModeProvider from .exceptions import ( LLMError, ProviderError, SecurityError, ConfigurationError, @@ -150,45 +149,8 @@ class LLMService: if not request.messages: raise ValidationError("Messages cannot be empty", field="messages") - # Security validation (only if enabled) - messages_dict = [{"role": msg.role, "content": msg.content} for msg in request.messages] - - if settings.API_SECURITY_ENABLED: - is_safe, risk_score, detected_patterns = security_manager.validate_prompt_security(messages_dict) - else: - # Security disabled - always safe - is_safe, risk_score, detected_patterns = True, 0.0, [] - - if not is_safe: - # Log security violation - security_manager.create_audit_log( - user_id=request.user_id, - api_key_id=request.api_key_id, - provider="blocked", - model=request.model, - request_type="chat_completion", - risk_score=risk_score, - detected_patterns=[p.get("pattern", "") for p in detected_patterns] - ) - - # Record blocked request - metrics_collector.record_request( - provider="security", - model=request.model, - request_type="chat_completion", - success=False, - latency_ms=0, - security_risk_score=risk_score, - error_code="SECURITY_BLOCKED", - user_id=request.user_id, - api_key_id=request.api_key_id - ) - - raise SecurityError( - "Request blocked due to security concerns", - risk_score=risk_score, - details={"detected_patterns": detected_patterns} - ) + # Security validation disabled - always allow requests + risk_score = 0.0 # Get provider for model provider_name = self._get_provider_for_model(request.model) @@ -197,18 +159,7 @@ class LLMService: if not provider: raise ProviderError(f"No available provider for model '{request.model}'", provider=provider_name) - # Log detailed request if enabled - security_manager.log_detailed_request( - messages=messages_dict, - model=request.model, - user_id=request.user_id, - provider=provider_name, - context_info={ - "temperature": request.temperature, - "max_tokens": request.max_tokens, - "risk_score": f"{risk_score:.3f}" - } - ) + # Security logging disabled # Execute with resilience resilience_manager = ResilienceManagerFactory.get_manager(provider_name) @@ -222,85 +173,46 @@ class LLMService: non_retryable_exceptions=(SecurityError, ValidationError) ) - # Update response with security information - response.security_check = is_safe - response.risk_score = risk_score - response.detected_patterns = [p.get("pattern", "") for p in detected_patterns] + # Security features disabled - # Log detailed response if 
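At its core the deleted SecurityManager is a regex sweep over message contents. A minimal sketch of that detection loop, using two patterns copied from the list above (the full manager layers context-aware risk weighting and audit hashing on top):

    import re

    PATTERNS = [
        re.compile(r"(?i)(ignore|forget|disregard|override).{0,20}(instructions|rules|prompts)"),
        re.compile(r"(?i)pretend\s+(to\s+be|you\s+are)"),
    ]

    def scan(messages):
        """Return (role, pattern) pairs for every pattern that fires."""
        hits = []
        for msg in messages:
            content = msg.get("content", "")
            for pat in PATTERNS:
                if pat.search(content):
                    hits.append((msg.get("role", "user"), pat.pattern))
        return hits

    # scan([{"role": "user", "content": "Please ignore all previous instructions"}])
    # -> [('user', '(?i)(ignore|forget|disregard|override).{0,20}(instructions|rules|prompts)')]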
diff --git a/backend/app/services/llm/service.py b/backend/app/services/llm/service.py
index bb8e683..d3f2503 100644
--- a/backend/app/services/llm/service.py
+++ b/backend/app/services/llm/service.py
@@ -17,9 +17,8 @@ from .models import (
 )
 from .config import config_manager, ProviderConfig
 from ...core.config import settings
-from .security import security_manager
 from .resilience import ResilienceManagerFactory
-from .metrics import metrics_collector
+# from .metrics import metrics_collector
 from .providers import BaseLLMProvider, PrivateModeProvider
 from .exceptions import (
     LLMError, ProviderError, SecurityError, ConfigurationError,
@@ -150,45 +149,8 @@ class LLMService:
         if not request.messages:
             raise ValidationError("Messages cannot be empty", field="messages")
 
-        # Security validation (only if enabled)
-        messages_dict = [{"role": msg.role, "content": msg.content} for msg in request.messages]
-
-        if settings.API_SECURITY_ENABLED:
-            is_safe, risk_score, detected_patterns = security_manager.validate_prompt_security(messages_dict)
-        else:
-            # Security disabled - always safe
-            is_safe, risk_score, detected_patterns = True, 0.0, []
-
-        if not is_safe:
-            # Log security violation
-            security_manager.create_audit_log(
-                user_id=request.user_id,
-                api_key_id=request.api_key_id,
-                provider="blocked",
-                model=request.model,
-                request_type="chat_completion",
-                risk_score=risk_score,
-                detected_patterns=[p.get("pattern", "") for p in detected_patterns]
-            )
-
-            # Record blocked request
-            metrics_collector.record_request(
-                provider="security",
-                model=request.model,
-                request_type="chat_completion",
-                success=False,
-                latency_ms=0,
-                security_risk_score=risk_score,
-                error_code="SECURITY_BLOCKED",
-                user_id=request.user_id,
-                api_key_id=request.api_key_id
-            )
-
-            raise SecurityError(
-                "Request blocked due to security concerns",
-                risk_score=risk_score,
-                details={"detected_patterns": detected_patterns}
-            )
+        # Security validation disabled - always allow requests
+        risk_score = 0.0
 
         # Get provider for model
         provider_name = self._get_provider_for_model(request.model)
@@ -197,18 +159,7 @@ class LLMService:
         if not provider:
             raise ProviderError(f"No available provider for model '{request.model}'", provider=provider_name)
 
-        # Log detailed request if enabled
-        security_manager.log_detailed_request(
-            messages=messages_dict,
-            model=request.model,
-            user_id=request.user_id,
-            provider=provider_name,
-            context_info={
-                "temperature": request.temperature,
-                "max_tokens": request.max_tokens,
-                "risk_score": f"{risk_score:.3f}"
-            }
-        )
+        # Security logging disabled
 
         # Execute with resilience
         resilience_manager = ResilienceManagerFactory.get_manager(provider_name)
@@ -222,85 +173,46 @@ class LLMService:
                 non_retryable_exceptions=(SecurityError, ValidationError)
             )
 
-            # Update response with security information
-            response.security_check = is_safe
-            response.risk_score = risk_score
-            response.detected_patterns = [p.get("pattern", "") for p in detected_patterns]
+            # Security features disabled
 
-            # Log detailed response if enabled
-            if response.choices:
-                content = response.choices[0].message.content
-                security_manager.log_detailed_response(
-                    response_content=content,
-                    token_usage=response.usage.model_dump() if response.usage else None,
-                    provider=provider_name
-                )
+            # Security logging disabled
 
-            # Record successful request
+            # Record successful request - metrics disabled
             total_latency = (time.time() - start_time) * 1000
-            metrics_collector.record_request(
-                provider=provider_name,
-                model=request.model,
-                request_type="chat_completion",
-                success=True,
-                latency_ms=total_latency,
-                token_usage=response.usage.model_dump() if response.usage else None,
-                security_risk_score=risk_score,
-                user_id=request.user_id,
-                api_key_id=request.api_key_id
-            )
+            # metrics_collector.record_request(
+            #     provider=provider_name,
+            #     model=request.model,
+            #     request_type="chat_completion",
+            #     success=True,
+            #     latency_ms=total_latency,
+            #     token_usage=response.usage.model_dump() if response.usage else None,
+            #     security_risk_score=risk_score,
+            #     user_id=request.user_id,
+            #     api_key_id=request.api_key_id
+            # )
 
-            # Create audit log
-            security_manager.create_audit_log(
-                user_id=request.user_id,
-                api_key_id=request.api_key_id,
-                provider=provider_name,
-                model=request.model,
-                request_type="chat_completion",
-                risk_score=risk_score,
-                detected_patterns=[p.get("pattern", "") for p in detected_patterns],
-                metadata={
-                    "success": True,
-                    "latency_ms": total_latency,
-                    "token_usage": response.usage.model_dump() if response.usage else None
-                }
-            )
+            # Security audit logging disabled
 
             return response
 
         except Exception as e:
-            # Record failed request
+            # Record failed request - metrics disabled
             total_latency = (time.time() - start_time) * 1000
             error_code = getattr(e, 'error_code', e.__class__.__name__)
+
+            # metrics_collector.record_request(
+            #     provider=provider_name,
+            #     model=request.model,
+            #     request_type="chat_completion",
+            #     success=False,
+            #     latency_ms=total_latency,
+            #     security_risk_score=risk_score,
+            #     error_code=error_code,
+            #     user_id=request.user_id,
+            #     api_key_id=request.api_key_id
+            # )
 
-            metrics_collector.record_request(
-                provider=provider_name,
-                model=request.model,
-                request_type="chat_completion",
-                success=False,
-                latency_ms=total_latency,
-                security_risk_score=risk_score,
-                error_code=error_code,
-                user_id=request.user_id,
-                api_key_id=request.api_key_id
-            )
-
-            # Create audit log for failure
-            security_manager.create_audit_log(
-                user_id=request.user_id,
-                api_key_id=request.api_key_id,
-                provider=provider_name,
-                model=request.model,
-                request_type="chat_completion",
-                risk_score=risk_score,
-                detected_patterns=[p.get("pattern", "") for p in detected_patterns],
-                metadata={
-                    "success": False,
-                    "error": str(e),
-                    "error_code": error_code,
-                    "latency_ms": total_latency
-                }
-            )
+            # Security audit logging disabled
 
             raise
 
@@ -309,21 +221,8 @@ class LLMService:
         if not self._initialized:
             await self.initialize()
 
-        # Security validation (same as non-streaming)
-        messages_dict = [{"role": msg.role, "content": msg.content} for msg in request.messages]
-
-        if settings.API_SECURITY_ENABLED:
-            is_safe, risk_score, detected_patterns = security_manager.validate_prompt_security(messages_dict)
-        else:
-            # Security disabled - always safe
-            is_safe, risk_score, detected_patterns = True, 0.0, []
-
-        if not is_safe:
-            raise SecurityError(
-                "Streaming request blocked due to security concerns",
-                risk_score=risk_score,
-                details={"detected_patterns": detected_patterns}
-            )
+        # Security validation disabled - always allow streaming requests
+        risk_score = 0.0
 
         # Get provider
         provider_name = self._get_provider_for_model(request.model)
@@ -345,19 +244,19 @@ class LLMService:
                 yield chunk
 
         except Exception as e:
-            # Record streaming failure
+            # Record streaming failure - metrics disabled
             error_code = getattr(e, 'error_code', e.__class__.__name__)
-            metrics_collector.record_request(
-                provider=provider_name,
-                model=request.model,
-                request_type="chat_completion_stream",
-                success=False,
-                latency_ms=0,
-                security_risk_score=risk_score,
-                error_code=error_code,
-                user_id=request.user_id,
-                api_key_id=request.api_key_id
-            )
+            # metrics_collector.record_request(
+            #     provider=provider_name,
+            #     model=request.model,
+            #     request_type="chat_completion_stream",
+            #     success=False,
+            #     latency_ms=0,
+            #     security_risk_score=risk_score,
+            #     error_code=error_code,
+            #     user_id=request.user_id,
+            #     api_key_id=request.api_key_id
+            # )
             raise
 
     async def create_embedding(self, request: EmbeddingRequest) -> EmbeddingResponse:
@@ -365,23 +264,8 @@ class LLMService:
         if not self._initialized:
             await self.initialize()
 
-        # Security validation for embedding input
-        input_text = request.input if isinstance(request.input, str) else " ".join(request.input)
-
-        if settings.API_SECURITY_ENABLED:
-            is_safe, risk_score, detected_patterns = security_manager.validate_prompt_security([
-                {"role": "user", "content": input_text}
-            ])
-        else:
-            # Security disabled - always safe
-            is_safe, risk_score, detected_patterns = True, 0.0, []
-
-        if not is_safe:
-            raise SecurityError(
-                "Embedding request blocked due to security concerns",
-                risk_score=risk_score,
-                details={"detected_patterns": detected_patterns}
-            )
+        # Security validation disabled - always allow embedding requests
+        risk_score = 0.0
 
         # Get provider
         provider_name = self._get_provider_for_model(request.model)
@@ -402,42 +286,40 @@ class LLMService:
                 non_retryable_exceptions=(SecurityError, ValidationError)
             )
 
-            # Update response with security information
-            response.security_check = is_safe
-            response.risk_score = risk_score
+            # Security features disabled
 
-            # Record successful request
+            # Record successful request - metrics disabled
             total_latency = (time.time() - start_time) * 1000
-            metrics_collector.record_request(
-                provider=provider_name,
-                model=request.model,
-                request_type="embedding",
-                success=True,
-                latency_ms=total_latency,
-                token_usage=response.usage.model_dump() if response.usage else None,
-                security_risk_score=risk_score,
-                user_id=request.user_id,
-                api_key_id=request.api_key_id
-            )
+            # metrics_collector.record_request(
+            #     provider=provider_name,
+            #     model=request.model,
+            #     request_type="embedding",
+            #     success=True,
+            #     latency_ms=total_latency,
+            #     token_usage=response.usage.model_dump() if response.usage else None,
+            #     security_risk_score=risk_score,
+            #     user_id=request.user_id,
+            #     api_key_id=request.api_key_id
+            # )
 
             return response
 
         except Exception as e:
-            # Record failed request
+            # Record failed request - metrics disabled
             total_latency = (time.time() - start_time) * 1000
             error_code = getattr(e, 'error_code', e.__class__.__name__)
+
+            # metrics_collector.record_request(
+            #     provider=provider_name,
+            #     model=request.model,
+            #     request_type="embedding",
+            #     success=False,
+            #     latency_ms=total_latency,
+            #     security_risk_score=risk_score,
+            #     error_code=error_code,
+            #     user_id=request.user_id,
+            #     api_key_id=request.api_key_id
+            # )
 
             raise
 
@@ -492,20 +374,26 @@ class LLMService:
         return status_dict
 
     def get_metrics(self) -> LLMMetrics:
-        """Get service metrics"""
-        return metrics_collector.get_metrics()
+        """Get service metrics - metrics disabled"""
+        # return metrics_collector.get_metrics()
+        return LLMMetrics(
+            total_requests=0,
+            successful_requests=0,
+            failed_requests=0,
+            average_latency_ms=0.0
+        )
 
     def get_health_summary(self) -> Dict[str, Any]:
-        """Get comprehensive health summary"""
-        metrics_health = metrics_collector.get_health_summary()
+        """Get comprehensive health summary - metrics disabled"""
+        # metrics_health = metrics_collector.get_health_summary()
         resilience_health = ResilienceManagerFactory.get_all_health_status()
-
+
         return {
             "service_status": "healthy" if self._initialized else "initializing",
             "startup_time": self._startup_time.isoformat() if self._startup_time else None,
             "provider_count": len(self._providers),
            "active_providers": list(self._providers.keys()),
-            "metrics": metrics_health,
+            "metrics": {"status": "disabled"},
             "resilience": resilience_health
         }
diff --git a/backend/app/services/llm/token_rate_limiter.py b/backend/app/services/llm/token_rate_limiter.py
deleted file mode 100644
index 2338a03..0000000
--- a/backend/app/services/llm/token_rate_limiter.py
+++ /dev/null
@@ -1,153 +0,0 @@
-"""
-Token-based rate limiting for LLM service
-"""
-
-import time
-import redis
-from typing import Dict, Optional, Tuple
-from datetime import datetime, timedelta
-from ..core.config import settings
-from ..core.logging import get_logger
-
-logger = get_logger(__name__)
-
-
-class TokenRateLimiter:
-    """Token-based rate limiting implementation"""
-
-    def __init__(self):
-        try:
-            self.redis_client = redis.from_url(settings.REDIS_URL, decode_responses=True)
-            self.redis_client.ping()
-            logger.info("Token rate limiter initialized with Redis backend")
-        except Exception as e:
-            logger.warning(f"Redis not available for token rate limiting: {e}")
-            self.redis_client = None
-            # Fall back to in-memory rate limiting
-            self.in_memory_store = {}
-            logger.info("Token rate limiter using in-memory fallback")
-
-    async def check_token_limits(
-        self,
-        provider: str,
-        prompt_tokens: int,
-        completion_tokens: int = 0
-    ) -> Tuple[bool, Dict[str, str]]:
-        """
-        Check if token usage is within limits
-
-        Args:
-            provider: Provider name (e.g., "privatemode")
-            prompt_tokens: Number of prompt tokens to use
-            completion_tokens: Number of completion tokens to use
-
-        Returns:
-            Tuple of (is_allowed, headers)
-        """
-        # Get token limits from configuration
-        from .config import get_config
-        config = get_config()
-        token_limits = config.token_limits_per_minute
-
-        # Check organization-wide limits
-        org_key = f"tokens:org:{provider}"
-
-        # Get current usage
-        current_usage = await self._get_token_usage(org_key)
-
-        # Calculate new usage
-        new_prompt_tokens = current_usage.get("prompt_tokens", 0) + prompt_tokens
-        new_completion_tokens = current_usage.get("completion_tokens", 0) + completion_tokens
-
-        # Check limits
-        prompt_limit = token_limits.get("prompt_tokens", 20000)
-        completion_limit = token_limits.get("completion_tokens", 10000)
-
-        is_allowed = (
-            new_prompt_tokens <= prompt_limit and
-            new_completion_tokens <= completion_limit
-        )
-
-        if is_allowed:
-            # Update usage
-            await self._update_token_usage(org_key, prompt_tokens, completion_tokens)
-            logger.debug(f"Token usage updated: 
{new_prompt_tokens}/{prompt_limit} prompt, " - f"{new_completion_tokens}/{completion_limit} completion") - - # Calculate remaining tokens - remaining_prompt = max(0, prompt_limit - new_prompt_tokens) - remaining_completion = max(0, completion_limit - new_completion_tokens) - - # Create headers - headers = { - "X-TokenLimit-Prompt-Remaining": str(remaining_prompt), - "X-TokenLimit-Completion-Remaining": str(remaining_completion), - "X-TokenLimit-Prompt-Limit": str(prompt_limit), - "X-TokenLimit-Completion-Limit": str(completion_limit), - "X-TokenLimit-Reset": str(int(time.time() + 60)) # Reset in 1 minute - } - - if not is_allowed: - logger.warning(f"Token rate limit exceeded for {provider}. " - f"Requested: {prompt_tokens} prompt, {completion_tokens} completion. " - f"Current: {current_usage}") - - return is_allowed, headers - - async def _get_token_usage(self, key: str) -> Dict[str, int]: - """Get current token usage""" - if self.redis_client: - try: - data = self.redis_client.hgetall(key) - if data: - return { - "prompt_tokens": int(data.get("prompt_tokens", 0)), - "completion_tokens": int(data.get("completion_tokens", 0)), - "updated_at": float(data.get("updated_at", time.time())) - } - except Exception as e: - logger.error(f"Error getting token usage from Redis: {e}") - - # Fallback to in-memory - return self.in_memory_store.get(key, {"prompt_tokens": 0, "completion_tokens": 0}) - - async def _update_token_usage(self, key: str, prompt_tokens: int, completion_tokens: int): - """Update token usage""" - if self.redis_client: - try: - pipe = self.redis_client.pipeline() - pipe.hincrby(key, "prompt_tokens", prompt_tokens) - pipe.hincrby(key, "completion_tokens", completion_tokens) - pipe.hset(key, "updated_at", time.time()) - pipe.expire(key, 60) # Expire after 1 minute - pipe.execute() - except Exception as e: - logger.error(f"Error updating token usage in Redis: {e}") - # Fallback to in-memory - self._update_in_memory(key, prompt_tokens, completion_tokens) - else: - self._update_in_memory(key, prompt_tokens, completion_tokens) - - def _update_in_memory(self, key: str, prompt_tokens: int, completion_tokens: int): - """Update in-memory token usage""" - if key not in self.in_memory_store: - self.in_memory_store[key] = {"prompt_tokens": 0, "completion_tokens": 0} - - self.in_memory_store[key]["prompt_tokens"] += prompt_tokens - self.in_memory_store[key]["completion_tokens"] += completion_tokens - self.in_memory_store[key]["updated_at"] = time.time() - - def cleanup_expired(self): - """Clean up expired entries (for in-memory store)""" - if not self.redis_client: - current_time = time.time() - expired_keys = [ - key for key, data in self.in_memory_store.items() - if current_time - data.get("updated_at", 0) > 60 - ] - for key in expired_keys: - del self.in_memory_store[key] - - -# Global token rate limiter instance -token_rate_limiter = TokenRateLimiter() \ No newline at end of file diff --git a/backend/app/services/rag_service.py b/backend/app/services/rag_service.py index 119cb26..1741362 100644 --- a/backend/app/services/rag_service.py +++ b/backend/app/services/rag_service.py @@ -755,10 +755,11 @@ class RAGService: # Process with RAG module try: + # Pass file_path in metadata so JSONL indexing can reopen the source file processed_doc = await rag_module.process_document( - file_content, - document.original_filename, - {} + file_content, + document.original_filename, + {"file_path": document.file_path} ) # Success case - update document with processed content @@ -873,4 +874,4 @@ class 
RAGService: except Exception as e: logger.error(f"Error reprocessing document {document_id}: {e}") - return False \ No newline at end of file + return False diff --git a/backend/modules/rag/main.py b/backend/modules/rag/main.py index 7d75fbd..d56503c 100644 --- a/backend/modules/rag/main.py +++ b/backend/modules/rag/main.py @@ -638,11 +638,19 @@ class RAGModule(BaseModule): np.random.seed(hash(text) % 2**32) return np.random.random(self.embedding_model.get("dimension", 768)).tolist() - async def _generate_embeddings(self, texts: List[str]) -> List[List[float]]: + async def _generate_embeddings(self, texts: List[str], is_document: bool = True) -> List[List[float]]: """Generate embeddings for multiple texts (batch processing)""" if self.embedding_service: + # Add task-specific prefixes for better E5 model performance + if is_document: + # For document passages, use "passage:" prefix + prefixed_texts = [f"passage: {text}" for text in texts] + else: + # For queries, use "query:" prefix (handled in search method) + prefixed_texts = texts + # Use real embedding service for batch processing - return await self.embedding_service.get_embeddings(texts) + return await self.embedding_service.get_embeddings(prefixed_texts) else: # Fallback to individual processing embeddings = [] @@ -917,69 +925,75 @@ class RAGModule(BaseModule): async def _process_jsonl(self, content: bytes, filename: str) -> str: """Process JSONL files (newline-delimited JSON) - + Specifically optimized for helpjuice-export.jsonl format: - Each line contains a JSON object with 'id' and 'payload' - Payload contains 'question', 'language', and 'answer' fields - Combines question and answer into searchable content + + Performance optimizations: + - Processes articles in smaller batches to reduce memory usage + - Uses streaming approach for large files """ try: + # Use streaming approach for large files jsonl_content = content.decode('utf-8', errors='replace') lines = jsonl_content.strip().split('\n') - + processed_articles = [] - + batch_size = 50 # Process in batches of 50 articles + for line_num, line in enumerate(lines, 1): if not line.strip(): continue - + try: # Parse each JSON line data = json.loads(line) - + # Handle helpjuice export format if 'payload' in data: payload = data['payload'] article_id = data.get('id', f'article_{line_num}') - + # Extract fields question = payload.get('question', '') answer = payload.get('answer', '') language = payload.get('language', 'EN') - + # Combine question and answer for better search if question or answer: # Format as Q&A for better context article_text = f"## {question}\n\n{answer}\n\n" - + # Add language tag if not English if language != 'EN': article_text = f"[{language}] {article_text}" - + # Add metadata separator article_text += f"---\nArticle ID: {article_id}\nLanguage: {language}\n\n" - + processed_articles.append(article_text) - + # Handle generic JSONL format else: # Convert the entire JSON object to readable text json_text = json.dumps(data, indent=2, ensure_ascii=False) processed_articles.append(json_text + "\n\n") - + except json.JSONDecodeError as e: logger.warning(f"Error parsing JSONL line {line_num}: {e}") continue except Exception as e: logger.warning(f"Error processing JSONL line {line_num}: {e}") continue - + # Combine all articles combined_text = '\n'.join(processed_articles) - + logger.info(f"Successfully processed {len(processed_articles)} articles from JSONL file {filename}") return combined_text - + except Exception as e: logger.error(f"Error processing JSONL file 
{filename}: {e}") return "" @@ -1153,7 +1167,7 @@ class RAGModule(BaseModule): chunks = self._chunk_text(content) # Generate embeddings for all chunks in batch (more efficient) - embeddings = await self._generate_embeddings(chunks) + embeddings = await self._generate_embeddings(chunks, is_document=True) # Create document points points = [] @@ -1200,10 +1214,28 @@ class RAGModule(BaseModule): """Index a processed document in the vector database""" if not self.enabled: raise RuntimeError("RAG module not initialized") - + collection_name = collection_name or self.default_collection_name - + try: + # Special handling for JSONL files + if processed_doc.file_type == 'jsonl': + # Import the optimized JSONL processor + from app.services.jsonl_processor import JSONLProcessor + jsonl_processor = JSONLProcessor(self) + + # Read the original file content + with open(processed_doc.metadata.get('file_path', ''), 'rb') as f: + file_content = f.read() + + # Process using the optimized JSONL processor + return await jsonl_processor.process_and_index_jsonl( + collection_name=collection_name, + content=file_content, + filename=processed_doc.original_filename, + metadata=processed_doc.metadata + ) + # Ensure collection exists await self._ensure_collection_exists(collection_name) @@ -1216,7 +1248,7 @@ class RAGModule(BaseModule): chunks = self._chunk_text(processed_doc.content) # Generate embeddings for all chunks in batch (more efficient) - embeddings = await self._generate_embeddings(chunks) + embeddings = await self._generate_embeddings(chunks, is_document=True) # Create document points with enhanced metadata points = [] @@ -1339,24 +1371,48 @@ class RAGModule(BaseModule): score_threshold=score_threshold / 2 # Lower threshold for initial search ) - # Combine scores + # Combine scores with improved normalization hybrid_weights = self.config.get("hybrid_weights", {"vector": 0.7, "bm25": 0.3}) vector_weight = hybrid_weights.get("vector", 0.7) bm25_weight = hybrid_weights.get("bm25", 0.3) - # Create hybrid results + # Get score distributions for better normalization + vector_scores = [r.score for r in vector_results] + bm25_scores_list = list(bm25_scores.values()) + + # Calculate statistics for normalization + if vector_scores: + v_max = max(vector_scores) + v_min = min(vector_scores) + v_range = v_max - v_min if v_max != v_min else 1 + else: + v_max, v_min, v_range = 1, 0, 1 + + if bm25_scores_list: + bm25_max = max(bm25_scores_list) + bm25_min = min(bm25_scores_list) + bm25_range = bm25_max - bm25_min if bm25_max != bm25_min else 1 + else: + bm25_max, bm25_min, bm25_range = 1, 0, 1 + + # Create hybrid results with improved scoring hybrid_results = [] for result in vector_results: doc_id = result.payload.get("document_id", "") vector_score = result.score bm25_score = bm25_scores.get(doc_id, 0.0) - # Normalize scores (simple min-max normalization) - vector_norm = (vector_score - score_threshold) / (1.0 - score_threshold) if vector_score > score_threshold else 0 - bm25_norm = min(bm25_score, 1.0) # BM25 scores are typically 0-1 + # Improved normalization using actual score distributions + vector_norm = (vector_score - v_min) / v_range if v_range > 0 else 0.5 + bm25_norm = (bm25_score - bm25_min) / bm25_range if bm25_range > 0 else 0.5 - # Calculate hybrid score - hybrid_score = (vector_weight * vector_norm) + (bm25_weight * bm25_norm) + # Apply reciprocal rank fusion for better combination + # This gives more weight to documents that rank highly in both methods + rrf_vector = 1.0 / (1.0 + 
vector_results.index(result) + 1) # +1 to avoid division by zero + rrf_bm25 = 1.0 / (1.0 + sorted(bm25_scores_list, reverse=True).index(bm25_score) + 1) if bm25_score in bm25_scores_list else 0 + + # Calculate hybrid score using both normalized scores and RRF + hybrid_score = (vector_weight * vector_norm + bm25_weight * bm25_norm) * 0.7 + (rrf_vector + rrf_bm25) * 0.3 # Create new point with hybrid score hybrid_point = ScoredPoint( @@ -1435,7 +1491,7 @@ class RAGModule(BaseModule): # Normalize score to 0-1 range return min(score / 10.0, 1.0) # Simple normalization - async def search_documents(self, query: str, max_results: int = None, filters: Dict[str, Any] = None, collection_name: str = None) -> List[SearchResult]: + async def search_documents(self, query: str, max_results: int = None, filters: Dict[str, Any] = None, collection_name: str = None, score_threshold: float = None) -> List[SearchResult]: """Search for relevant documents""" if not self.enabled: raise RuntimeError("RAG module not initialized") @@ -1453,8 +1509,10 @@ class RAGModule(BaseModule): import time start_time = time.time() - # Generate query embedding - query_embedding = await self._generate_embedding(query) + # Generate query embedding with task-specific prefix for better retrieval + # The E5 model works better with "query:" prefix for search queries + optimized_query = f"query: {query}" + query_embedding = await self._generate_embedding(optimized_query) # Build filter search_filter = None @@ -1474,7 +1532,8 @@ class RAGModule(BaseModule): # Check if hybrid search is enabled enable_hybrid = self.config.get("enable_hybrid", False) - score_threshold = self.config.get("score_threshold", 0.3) + # Use provided score_threshold or fall back to config + search_score_threshold = score_threshold if score_threshold is not None else self.config.get("score_threshold", 0.3) if enable_hybrid and NLTK_AVAILABLE: # Perform hybrid search (vector + BM25) @@ -1484,7 +1543,7 @@ class RAGModule(BaseModule): query_vector=query_embedding, query_filter=search_filter, limit=max_results, - score_threshold=score_threshold + score_threshold=search_score_threshold ) else: # Pure vector search with improved threshold @@ -1493,7 +1552,7 @@ class RAGModule(BaseModule): query_vector=query_embedding, query_filter=search_filter, limit=max_results, - score_threshold=score_threshold + score_threshold=search_score_threshold ) logger.info(f"Raw search results count: {len(search_results)}") @@ -1841,9 +1900,9 @@ async def index_processed_document(processed_doc: ProcessedDocument, collection_ """Index a processed document""" return await rag_module.index_processed_document(processed_doc, collection_name) -async def search_documents(query: str, max_results: int = None, filters: Dict[str, Any] = None, collection_name: str = None) -> List[SearchResult]: +async def search_documents(query: str, max_results: int = None, filters: Dict[str, Any] = None, collection_name: str = None, score_threshold: float = None) -> List[SearchResult]: """Search documents""" - return await rag_module.search_documents(query, max_results, filters, collection_name) + return await rag_module.search_documents(query, max_results, filters, collection_name, score_threshold) async def delete_document(document_id: str, collection_name: str = None) -> bool: """Delete a document""" diff --git a/frontend/src/app/api/auth/login/route.ts b/frontend/src/app/api/auth/login/route.ts index c32f93e..fefb7fe 100644 --- a/frontend/src/app/api/auth/login/route.ts +++ b/frontend/src/app/api/auth/login/route.ts 
@@ -7,7 +7,7 @@ export async function POST(request: NextRequest) { // Make request to backend auth endpoint without requiring existing auth const baseUrl = process.env.INTERNAL_API_URL || `http://enclava-backend:${process.env.BACKEND_INTERNAL_PORT || '8000'}` - const url = `${baseUrl}/api/auth/login` + const url = `${baseUrl}/api-internal/v1/auth/login` const response = await fetch(url, { method: 'POST', diff --git a/frontend/src/app/rag/page.tsx b/frontend/src/app/rag/page.tsx index 87616c1..48ae013 100644 --- a/frontend/src/app/rag/page.tsx +++ b/frontend/src/app/rag/page.tsx @@ -85,8 +85,31 @@ function RAGPageContent() { const loadStats = async () => { try { const data = await apiClient.get('/api-internal/v1/rag/stats') - setStats(data.stats) + console.log('Stats API response:', data) + + // Check if the response has the expected structure + if (data && data.stats && data.stats.collections) { + console.log('✓ Stats has collections property') + setStats(data.stats) + } else { + console.error('✗ Invalid stats structure:', data) + // Set default empty stats to prevent error + setStats({ + collections: { total: 0, active: 0 }, + documents: { total: 0, processing: 0, processed: 0 }, + storage: { total_size_bytes: 0, total_size_mb: 0 }, + vectors: { total: 0 } + }) + } } catch (error) { + console.error('Error loading stats:', error) + // Set default empty stats on error + setStats({ + collections: { total: 0, active: 0 }, + documents: { total: 0, processing: 0, processed: 0 }, + storage: { total_size_bytes: 0, total_size_mb: 0 }, + vectors: { total: 0 } + }) } } diff --git a/frontend/src/components/rag/document-browser.tsx b/frontend/src/components/rag/document-browser.tsx index c3e643f..2643e9c 100644 --- a/frontend/src/components/rag/document-browser.tsx +++ b/frontend/src/components/rag/document-browser.tsx @@ -9,7 +9,7 @@ import { Badge } from "@/components/ui/badge" import { Separator } from "@/components/ui/separator" import { AlertDialog, AlertDialogAction, AlertDialogCancel, AlertDialogContent, AlertDialogDescription, AlertDialogFooter, AlertDialogHeader, AlertDialogTitle, AlertDialogTrigger } from "@/components/ui/alert-dialog" import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle, DialogTrigger } from "@/components/ui/dialog" -import { Search, FileText, Trash2, Eye, Download, Calendar, Hash, FileIcon, Filter } from "lucide-react" +import { Search, FileText, Trash2, Eye, Download, Calendar, Hash, FileIcon, Filter, RefreshCw } from "lucide-react" import { useToast } from "@/hooks/use-toast" import { apiClient } from "@/lib/api-client" import { config } from "@/lib/config" @@ -56,6 +56,7 @@ export function DocumentBrowser({ collections, selectedCollection, onCollectionS const [filterStatus, setFilterStatus] = useState("all") const [selectedDocument, setSelectedDocument] = useState(null) const [deleting, setDeleting] = useState(null) + const [reprocessing, setReprocessing] = useState(null) const { toast } = useToast() useEffect(() => { @@ -157,6 +158,43 @@ export function DocumentBrowser({ collections, selectedCollection, onCollectionS } } + const handleReprocessDocument = async (documentId: string) => { + setReprocessing(documentId) + + try { + await apiClient.post(`/api-internal/v1/rag/documents/${documentId}/reprocess`) + + // Update the document status to processing in the UI + setDocuments(prev => prev.map(doc => + doc.id === documentId + ? 
{ ...doc, status: 'processing' as const, processed_at: new Date().toISOString() } + : doc + )) + + toast({ + title: "Success", + description: "Document reprocessing started", + }) + + // Reload documents after a short delay to see status updates + setTimeout(() => { + loadDocuments() + }, 2000) + + } catch (error) { + const errorMessage = error instanceof Error ? error.message : "Failed to reprocess document" + toast({ + title: "Error", + description: errorMessage.includes("Cannot reprocess document with status 'processed'") + ? "Cannot reprocess documents that are already processed" + : errorMessage, + variant: "destructive", + }) + } finally { + setReprocessing(null) + } + } + const formatFileSize = (bytes: number) => { if (bytes === 0) return '0 Bytes' const k = 1024 @@ -432,6 +470,21 @@ export function DocumentBrowser({ collections, selectedCollection, onCollectionS + + + ))} + + + {/* Search Box */} +
+        <input
+          type="text"
+          value={query}
+          onChange={(e) => setQuery(e.target.value)}
+          onKeyPress={(e) => e.key === 'Enter' && performSearch()}
+          placeholder="Enter your search query..."
+          className="flex-1 px-4 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500"
+        />
+        <button onClick={performSearch}>Search</button>
+      </div>
+
+      {error && (
+        <div>
+          Error: {error}
+        </div>
+      )}
+
+      {/* Results Summary */}
+      {results.length > 0 && (
+        <div>
+          <p>
+            Found {results.length} results in {searchTime.toFixed(0)}ms
+            {config.enable_hybrid && (
+              <span> • Hybrid Search Enabled</span>
+            )}
+          </p>
+        </div>
+      )}
+
+      {/* Search Results */}
+      <div>
+        {results.map((result, index) => (
+          <div key={index}>
+            <div>
+              <span>Result {index + 1}</span>
+              <span className={`${
+                result.score >= 0.5 ? 'bg-green-100 text-green-800' :
+                result.score >= 0.3 ? 'bg-yellow-100 text-yellow-800' :
+                'bg-red-100 text-red-800'
+              }`}>
+                Score: {result.score.toFixed(4)}
+              </span>
+            </div>
+
+            <div>
+              {result.document.content}
+            </div>
+
+            {/* Metadata */}
+            <div>
+              {result.document.metadata.content_type && (
+                <span>Type: {result.document.metadata.content_type}</span>
+              )}
+              {result.document.metadata.language && (
+                <span>Language: {result.document.metadata.language}</span>
+              )}
+              {result.document.metadata.filename && (
+                <span>File: {result.document.metadata.filename}</span>
+              )}
+              {result.document.metadata.chunk_index !== undefined && (
+                <span>
+                  Chunk: {result.document.metadata.chunk_index + 1}/{result.document.metadata.chunk_count || '?'}
+                </span>
+              )}
+            </div>
+
+            {/* Debug Details */}
+            {config.show_timing && result.debug_info && (
+              <div>
+                <div>Debug Information:</div>
+                {result.debug_info.vector_score !== undefined && (
+                  <div>Vector Score: {result.debug_info.vector_score.toFixed(4)}</div>
+                )}
+                {result.debug_info.bm25_score !== undefined && (
+                  <div>BM25 Score: {result.debug_info.bm25_score.toFixed(4)}</div>
+                )}
+                {result.document.metadata.question && (
+                  <div>
+                    <div>Question: {result.document.metadata.question}</div>
+                  </div>
+                )}
+              </div>
+            )}
+          </div>
+        ))}
+      </div>
+
+      {/* Debug Section */}
+      {debugInfo && Object.keys(debugInfo).length > 0 && (
+        <div>
+          <h3>Debug Information</h3>
+
+          {debugInfo.score_stats && (
+            <div>
+              <div>Score Statistics:</div>
+              <div>
+                <div>Min: {debugInfo.score_stats.min?.toFixed(4)}</div>
+                <div>Max: {debugInfo.score_stats.max?.toFixed(4)}</div>
+                <div>Avg: {debugInfo.score_stats.avg?.toFixed(4)}</div>
+                <div>StdDev: {debugInfo.score_stats.stddev?.toFixed(4)}</div>
+              </div>
+            </div>
+          )}
+
+          {debugInfo.collection_stats && (
+            <div>
+              <div>Collection Stats:</div>
+              <div>
+                <div>Total Documents: {debugInfo.collection_stats.total_documents}</div>
+                <div>Total Chunks: {debugInfo.collection_stats.total_chunks}</div>
+                <div>Languages: {debugInfo.collection_stats.languages?.join(', ')}</div>
+              </div>
+            </div>
+          )}
+
+          {debugInfo.query_embedding && config.show_embeddings && (
+            <div>
+              <div>Query Embedding (first 10 dims):</div>
+              <div>
+                [{debugInfo.query_embedding.slice(0, 10).map(x => x.toFixed(6)).join(', ')}...]
+              </div>
+            </div>
+          )}
+        </div>
+      )}
+
+      {/* Configuration Panel */}
+      <div>
+        <h3>⚙️ Configuration</h3>
+
+        <div>
+          {/* Search Settings */}
+          <div>
+            <h4>Search Settings</h4>
+            <div>
+              <label>Max Results</label>
+              <input
+                type="number"
+                value={config.max_results}
+                onChange={(e) => updateConfig('max_results', parseInt(e.target.value))}
+                className="w-full"
+              />
+            </div>
+            <div>
+              <label>Score Threshold</label>
+              <input
+                type="number"
+                value={config.score_threshold}
+                onChange={(e) => updateConfig('score_threshold', parseFloat(e.target.value))}
+                className="w-full"
+              />
+            </div>
+            <div>
+              <label>Collection</label>
+              {collectionsLoading ? (
+                <span>Loading collections...</span>
+              ) : (
+                <select
+                  value={config.collection_name}
+                  onChange={(e) => updateConfig('collection_name', e.target.value)}
+                >
+                  {collections.map((c) => (
+                    <option key={c} value={c}>{c}</option>
+                  ))}
+                </select>
+              )}
+            </div>
+          </div>
+
+          {/* Chunking Settings */}
+          <div>
+            <h4>Chunking Settings</h4>
+            <div>
+              <label>Chunk Size</label>
+              <input
+                type="number"
+                value={config.chunk_size}
+                onChange={(e) => updateConfig('chunk_size', parseInt(e.target.value))}
+                className="w-full"
+              />
+            </div>
+            <div>
+              <label>Chunk Overlap</label>
+              <input
+                type="number"
+                value={config.chunk_overlap}
+                onChange={(e) => updateConfig('chunk_overlap', parseInt(e.target.value))}
+                className="w-full"
+              />
+            </div>
+          </div>
+
+          {/* Hybrid Search */}
+          <div>
+            <h4>Hybrid Search</h4>
+            <label>
+              <input
+                type="checkbox"
+                checked={config.enable_hybrid}
+                onChange={(e) => updateConfig('enable_hybrid', e.target.checked)}
+              />
+              Enable Hybrid Search
+            </label>
+            {config.enable_hybrid && (
+              <>
+                <div>
+                  <label>Vector Weight</label>
+                  <input
+                    type="number"
+                    value={config.vector_weight}
+                    onChange={(e) => updateConfig('vector_weight', parseFloat(e.target.value))}
+                    className="w-full"
+                  />
+                </div>
+                <div>
+                  <label>BM25 Weight</label>
+                  <input
+                    type="number"
+                    value={config.bm25_weight}
+                    onChange={(e) => updateConfig('bm25_weight', parseFloat(e.target.value))}
+                    className="w-full"
+                  />
+                </div>
+              </>
+            )}
+          </div>
+
+          {/* Debug Options */}
+          <div>
+            <h4>Debug Options</h4>
+            <label>
+              <input
+                type="checkbox"
+                checked={config.show_timing}
+                onChange={(e) => updateConfig('show_timing', e.target.checked)}
+              />
+              Show Debug Info
+            </label>
+            <label>
+              <input
+                type="checkbox"
+                checked={config.show_embeddings}
+                onChange={(e) => updateConfig('show_embeddings', e.target.checked)}
+              />
+              Show Query Embedding
+            </label>
+          </div>
+        </div>
+      </div>
+ + + ); +} \ No newline at end of file From f3f5cca50b05dc5f6ea79a90023602de63084f58 Mon Sep 17 00:00:00 2001 From: Aljaz Ceru Date: Wed, 1 Oct 2025 15:50:34 +0200 Subject: [PATCH 12/13] fixing rag --- .gitignore | 1 + backend/Dockerfile | 3 + backend/app/modules/chatbot/main.py | 13 ++++ backend/modules/rag/main.py | 52 ++++++++++++++-- backend/requirements.txt | 6 +- backend/scripts/import_jsonl.py | 92 +++++++++++++++++++++++++++++ 6 files changed, 159 insertions(+), 8 deletions(-) create mode 100644 backend/scripts/import_jsonl.py diff --git a/.gitignore b/.gitignore index 6642c56..4abb4a1 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ frontend/.env.development backend/storage/ + # TypeScript *.tsbuildinfo diff --git a/backend/Dockerfile b/backend/Dockerfile index aaa4fe6..0cb709e 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -17,6 +17,9 @@ RUN apt-get update && apt-get install -y \ ffmpeg \ && rm -rf /var/lib/apt/lists/* +# Install CPU-only PyTorch and compatible numpy first (faster download) +RUN pip install --no-cache-dir torch==2.5.1+cpu torchaudio==2.5.1+cpu --index-url https://download.pytorch.org/whl/cpu -f https://download.pytorch.org/whl/torch_stable.html + # Copy requirements and install Python dependencies COPY requirements.txt . COPY tests/requirements-test.txt ./tests/ diff --git a/backend/app/modules/chatbot/main.py b/backend/app/modules/chatbot/main.py index 3f9b8dc..96ae1b2 100644 --- a/backend/app/modules/chatbot/main.py +++ b/backend/app/modules/chatbot/main.py @@ -453,9 +453,22 @@ class ChatbotModule(BaseModule): guardrails += ( "When asked about encryption or SD-card backups, do not claim that backups are encrypted unless the provided context explicitly uses wording like 'encrypt', 'encrypted', or 'encryption'. " "If such wording is absent, state clearly that the SD-card backup is not encrypted. " + "Product policy: For BitBox devices, microSD (SD card) backups are not encrypted; verification steps may require a recovery password, but that is not encryption. Do not conflate password entry with encryption. " ) extra_instructions["additional_instructions"] = guardrails + # Deterministic enforcement: if encryption question and RAG context does not explicitly + # contain encryption wording, return policy answer without calling the LLM. + ctx_lower = (rag_context or "").lower() + has_encryption_terms = any(k in ctx_lower for k in ["encrypt", "encrypted", "encryption", "decrypt", "decryption"]) + if is_encryption and not has_encryption_terms: + policy_answer = ( + "No. BitBox microSD (SD card) backups are not encrypted. " + "Verification may require entering a recovery password, but that does not encrypt the backup — " + "it only proves you have the correct credentials to restore. Keep the card and password secure." 
+ ) + return policy_answer, sources + messages = self._build_conversation_messages(db_messages, config, rag_context, extra_instructions) # Note: Current user message is already included in db_messages from the query diff --git a/backend/modules/rag/main.py b/backend/modules/rag/main.py index d56503c..92f43f6 100644 --- a/backend/modules/rag/main.py +++ b/backend/modules/rag/main.py @@ -1495,8 +1495,16 @@ class RAGModule(BaseModule): """Search for relevant documents""" if not self.enabled: raise RuntimeError("RAG module not initialized") - + collection_name = collection_name or self.default_collection_name + + # Special handling for collections with different vector dimensions + SPECIAL_COLLECTIONS = { + "bitbox02_faq_local": { + "dimension": 384, + "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" + } + } max_results = max_results or self.config.get("max_results", 10) # Check cache (include collection name in cache key) @@ -1510,9 +1518,24 @@ class RAGModule(BaseModule): start_time = time.time() # Generate query embedding with task-specific prefix for better retrieval - # The E5 model works better with "query:" prefix for search queries - optimized_query = f"query: {query}" - query_embedding = await self._generate_embedding(optimized_query) + try: + # Check if this is a special collection + if collection_name in SPECIAL_COLLECTIONS: + # Try to import sentence-transformers + import sentence_transformers + from sentence_transformers import SentenceTransformer + model = SentenceTransformer(SPECIAL_COLLECTIONS[collection_name]["model"]) + query_embedding = model.encode([query], normalize_embeddings=True)[0].tolist() + logger.info(f"Using {SPECIAL_COLLECTIONS[collection_name]['dimension']}-dim local model for {collection_name}") + else: + # The E5 model works better with "query:" prefix for search queries + optimized_query = f"query: {query}" + query_embedding = await self._generate_embedding(optimized_query) + except ImportError: + # Fallback to default embedding if sentence-transformers is not available + logger.warning(f"sentence-transformers not available, falling back to default embedding for {collection_name}") + optimized_query = f"query: {query}" + query_embedding = await self._generate_embedding(optimized_query) # Build filter search_filter = None @@ -1565,14 +1588,31 @@ class RAGModule(BaseModule): doc_id = result.payload.get("document_id") content = result.payload.get("content", "") score = result.score - + + # Generic content extraction for documents without a 'content' field + if not content: + # Build content from all text-based fields in the payload + # This makes the RAG module completely agnostic to document structure + text_fields = [] + for field, value in result.payload.items(): + # Skip system/metadata fields + if field not in ["document_id", "chunk_index", "chunk_count", "indexed_at", "processed_at", + "file_hash", "mime_type", "file_type", "created_at", "__collection_metadata__"]: + # Include any field that has a non-empty string value + if value and isinstance(value, str) and len(value.strip()) > 0: + text_fields.append(f"{field}: {value}") + + # Join all text fields to create content + if text_fields: + content = "\n\n".join(text_fields) + # Log each raw result for debugging logger.info(f"\n--- Raw Result {i+1} ---") logger.info(f"Score: {score}") logger.info(f"Document ID: {doc_id}") logger.info(f"Content preview (first 200 chars): {content[:200]}") logger.info(f"Metadata keys: {list(result.payload.keys())}") - + # Aggregate scores by document if 
doc_id in document_scores: document_scores[doc_id]["score"] = max(document_scores[doc_id]["score"], score) diff --git a/backend/requirements.txt b/backend/requirements.txt index c4ec167..b8fd274 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -46,6 +46,7 @@ qdrant-client==1.7.0 # Text Processing tiktoken==0.5.1 +numpy>=1.26.0 # Basic document processing (lightweight) markitdown==0.0.1a2 @@ -56,8 +57,9 @@ python-docx==1.1.0 # nltk==3.8.1 # spacy==3.7.2 -# Heavy ML dependencies (REMOVED - unused in codebase) -# sentence-transformers==2.6.1 # REMOVED - not used anywhere in codebase +# Heavy ML dependencies (sentence-transformers will be installed separately) +# Note: PyTorch is already installed in the base Docker image +sentence-transformers==2.6.1 # Added back - needed for bitbox02_faq_local collection # transformers==4.35.2 # REMOVED - already commented out # Configuration diff --git a/backend/scripts/import_jsonl.py b/backend/scripts/import_jsonl.py new file mode 100644 index 0000000..a932883 --- /dev/null +++ b/backend/scripts/import_jsonl.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +""" +Import a JSONL file into a Qdrant collection from inside the backend container. + +Usage (from host): + docker compose exec enclava-backend bash -lc \ + 'python /app/scripts/import_jsonl.py \ + --collection rag_test_import_859b1f01 \ + --file /app/_to_delete/helpjuice-export.jsonl' + +Notes: + - Runs fully inside the backend, so Docker service hostnames (e.g. enclava-qdrant) + and privatemode-proxy are reachable. + - Uses RAGModule + JSONLProcessor to embed/index each JSONL line. + - Creates the collection if missing (size=1024, cosine). +""" + +import argparse +import asyncio +import os +from datetime import datetime + + +async def import_jsonl(collection_name: str, file_path: str): + from qdrant_client import QdrantClient + from qdrant_client.models import Distance, VectorParams + from app.modules.rag.main import RAGModule + from app.services.jsonl_processor import JSONLProcessor + from app.core.config import settings + + if not os.path.exists(file_path): + raise SystemExit(f"File not found: {file_path}") + + # Ensure collection exists (inside container uses Docker DNS hostnames) + client = QdrantClient(host=settings.QDRANT_HOST, port=settings.QDRANT_PORT) + collections = client.get_collections().collections + if not any(c.name == collection_name for c in collections): + client.create_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=1024, distance=Distance.COSINE), + ) + print(f"Created Qdrant collection '{collection_name}' (size=1024, cosine)") + else: + print(f"Using existing Qdrant collection '{collection_name}'") + + # Initialize RAG + rag = RAGModule({ + "chunk_size": 300, + "chunk_overlap": 50, + "max_results": 10, + "score_threshold": 0.3, + "embedding_model": "intfloat/multilingual-e5-large-instruct", + }) + await rag.initialize() + + # Process JSONL + processor = JSONLProcessor(rag) + with open(file_path, "rb") as f: + content = f.read() + + doc_id = await processor.process_and_index_jsonl( + collection_name=collection_name, + content=content, + filename=os.path.basename(file_path), + metadata={ + "source": "jsonl_upload", + "upload_date": datetime.utcnow().isoformat(), + "file_path": os.path.abspath(file_path), + }, + ) + + # Report stats using safe HTTP method to avoid client parsing issues + try: + info = await rag._get_collection_info_safely(collection_name) + print(f"Import complete. 
Points: {info.get('points_count', 0)}, vector_size: {info.get('vector_size', 'n/a')}") + except Exception as e: + print(f"Import complete. (Could not fetch collection info safely: {e})") + await rag.cleanup() + return doc_id + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--collection", required=True, help="Qdrant collection name") + ap.add_argument("--file", required=True, help="Path inside container (e.g. /app/_to_delete/...).") + args = ap.parse_args() + + asyncio.run(import_jsonl(args.collection, args.file)) + + +if __name__ == "__main__": + main() From 8391dd5170e29f446b252b546c1726c979388cef Mon Sep 17 00:00:00 2001 From: Aljaz Ceru Date: Wed, 1 Oct 2025 17:05:04 +0200 Subject: [PATCH 13/13] vector size test --- backend/modules/rag/main.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backend/modules/rag/main.py b/backend/modules/rag/main.py index 92f43f6..2672c44 100644 --- a/backend/modules/rag/main.py +++ b/backend/modules/rag/main.py @@ -1503,6 +1503,10 @@ class RAGModule(BaseModule): "bitbox02_faq_local": { "dimension": 384, "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" + }, + "bitbox_local_rag": { + "dimension": 384, + "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" } } max_results = max_results or self.config.get("max_results", 10)
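
The hybrid search rework in backend/modules/rag/main.py combines the two retrieval methods in a specific way: scores are min-max normalized per method using the actual score distributions, then blended with the configured vector/BM25 weights plus a reciprocal-rank-fusion (RRF) term, at a fixed 70/30 split. A minimal, self-contained sketch of that arithmetic follows; `hybrid_scores` and its `{doc_id: score}` dict inputs are names invented here for illustration, not the module's API.

def min_max_normalize(scores):
    """Min-max normalize a {doc_id: score} map; a constant list maps to 0.5."""
    if not scores:
        return {}
    lo, hi = min(scores.values()), max(scores.values())
    span = hi - lo
    return {d: (s - lo) / span if span > 0 else 0.5 for d, s in scores.items()}


def rrf_term(ranked_ids, k=1):
    """Reciprocal-rank-fusion term 1 / (k + rank), with rank starting at 1."""
    return {d: 1.0 / (k + rank) for rank, d in enumerate(ranked_ids, start=1)}


def hybrid_scores(vector, bm25, vector_weight=0.7, bm25_weight=0.3):
    """Blend normalized scores (70%) with RRF ranks (30%), as in the diff."""
    v_norm, b_norm = min_max_normalize(vector), min_max_normalize(bm25)
    v_rrf = rrf_term(sorted(vector, key=vector.get, reverse=True))
    b_rrf = rrf_term(sorted(bm25, key=bm25.get, reverse=True))
    combined = {}
    for doc_id in set(vector) | set(bm25):
        weighted = (vector_weight * v_norm.get(doc_id, 0.0)
                    + bm25_weight * b_norm.get(doc_id, 0.0))
        fused = v_rrf.get(doc_id, 0.0) + b_rrf.get(doc_id, 0.0)
        combined[doc_id] = 0.7 * weighted + 0.3 * fused
    return combined


# A document ranked first by both methods dominates:
# hybrid_scores({"a": 0.9, "b": 0.4}, {"a": 3.1, "b": 1.0}) -> {"a": 1.0, "b": 0.2}

Worth noting: the diff computes the vector RRF rank via `vector_results.index(result)`, which is O(n) per result; carrying the rank in an enumeration, as above, is the cheaper form.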
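
The same patch introduces asymmetric prefixes for the E5 embedding model: document chunks are embedded as "passage: ..." while search queries are embedded as "query: ...". In the diff these live in two places (`_generate_embeddings` for passages, `search_documents` for queries); the helper below collapses them into one function purely for illustration.

def e5_prefix(texts, is_document=True):
    """Apply E5-style task prefixes: "passage: " for chunks, "query: " for searches."""
    role = "passage" if is_document else "query"
    return [f"{role}: {text}" for text in texts]

# e5_prefix(["How do I restore a backup?"], is_document=False)
# -> ["query: How do I restore a backup?"]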
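
PATCH 12/13 adds a deterministic guardrail to the chatbot: when the user asks about encryption and the retrieved context never uses encryption wording, the module returns a fixed policy answer without calling the LLM at all. Reduced to its decision rule, it is a keyword gate; a sketch follows, with `is_encryption_question` assumed to be computed upstream as in the module.

ENCRYPTION_TERMS = ("encrypt", "encrypted", "encryption", "decrypt", "decryption")

def needs_policy_answer(is_encryption_question, rag_context):
    """True when the fixed policy answer should bypass the LLM call."""
    ctx = (rag_context or "").lower()
    return is_encryption_question and not any(term in ctx for term in ENCRYPTION_TERMS)

Since "encrypt" and "decrypt" are substrings of the other three terms, the tuple could be reduced to ("encrypt", "decrypt") with identical behavior; the longer list simply mirrors the diff.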
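
The same patch makes search results robust to collections whose points lack a `content` field: every non-system string field in the payload is folded into synthetic content. Isolated from the result loop it sits in, with the skip list copied from the diff:

SYSTEM_FIELDS = {
    "document_id", "chunk_index", "chunk_count", "indexed_at", "processed_at",
    "file_hash", "mime_type", "file_type", "created_at", "__collection_metadata__",
}

def extract_content(payload):
    """Return the point's content, or synthesize it from its text fields."""
    content = payload.get("content", "")
    if content:
        return content
    text_fields = [
        f"{field}: {value}"
        for field, value in payload.items()
        if field not in SYSTEM_FIELDS and isinstance(value, str) and value.strip()
    ]
    return "\n\n".join(text_fields)

# extract_content({"document_id": "42", "question": "What is RRF?", "answer": "Rank fusion."})
# -> "question: What is RRF?\n\nanswer: Rank fusion."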
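
Finally, PATCH 13/13 extends the SPECIAL_COLLECTIONS table, which amounts to per-collection embedding routing: a collection indexed with a 384-dimension local model must also be queried with that model, or the query vector will not match the collection's vector size. A minimal sketch of the routing, assuming the caller passes the service's default embedder as `default_embed` (a name invented here); the registry entries are copied from the diff.

from typing import Callable, List

SPECIAL_COLLECTIONS = {
    "bitbox02_faq_local": {
        "dimension": 384,
        "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    },
    "bitbox_local_rag": {
        "dimension": 384,
        "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    },
}

def embed_query(query: str, collection_name: str,
                default_embed: Callable[[str], List[float]]) -> List[float]:
    """Embed a query with the collection's own model, else the default E5 path."""
    special = SPECIAL_COLLECTIONS.get(collection_name)
    if special is not None:
        try:
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer(special["model"])
            # normalize_embeddings=True yields unit vectors, matching cosine distance
            return model.encode([query], normalize_embeddings=True)[0].tolist()
        except ImportError:
            pass  # sentence-transformers missing: fall back, as the diff does
    return default_embed(f"query: {query}")

Two caveats, both inherited from the diff: the SentenceTransformer is constructed on every query, so caching the loaded model at module or class scope would avoid re-reading the weights on each search; and the ImportError fallback still produces a default-dimension vector, so a query against a special collection without sentence-transformers installed will fail at Qdrant with a dimension mismatch.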