chatbot test window fixing

2025-12-17 07:24:34 +01:00 · 2025-09-13 11:14:32 +02:00
parent 40454d6fab
commit e97d197d8b
8 changed files with 237 additions and 58 deletions
--- a/backend/modules/rag/main.py
+++ b/backend/modules/rag/main.py
@@ -148,6 +148,7 @@ class RAGModule(BaseModule):
            'application/vnd.ms-excel': self._process_with_markitdown,
            'text/html': self._process_html,
            'application/json': self._process_json,
+            'application/x-ndjson': self._process_jsonl,  # JSONL support
            'text/markdown': self._process_markdown,
            'text/csv': self._process_csv
        }
@@ -227,6 +228,10 @@ class RAGModule(BaseModule):
        if mime_type:
            return mime_type
        
+        # Check for JSONL file extension
+        if filename.lower().endswith('.jsonl'):
+            return 'application/x-ndjson'
+        
        # Try to detect from content
        if content.startswith(b'%PDF'):
            return 'application/pdf'
@@ -247,6 +252,13 @@ class RAGModule(BaseModule):
        elif content.startswith(b'<html') or content.startswith(b'<!DOCTYPE'):
            return 'text/html'
        elif content.startswith(b'{') or content.startswith(b'['):
+            # Check if it's JSONL by looking for newline-delimited JSON
+            try:
+                lines = content.decode('utf-8', errors='ignore').split('\n')
+                if len(lines) > 1 and all(line.strip().startswith('{') for line in lines[:3] if line.strip()):
+                    return 'application/x-ndjson'
+            except:
+                pass
            return 'application/json'
        else:
            return 'text/plain'
@@ -876,6 +888,75 @@ class RAGModule(BaseModule):
            logger.error(f"Error processing CSV file: {e}")
            return ""
    
+    async def _process_jsonl(self, content: bytes, filename: str) -> str:
+        """Process JSONL files (newline-delimited JSON)
+        
+        Specifically optimized for helpjuice-export.jsonl format:
+        - Each line contains a JSON object with 'id' and 'payload'
+        - Payload contains 'question', 'language', and 'answer' fields
+        - Combines question and answer into searchable content
+        """
+        try:
+            jsonl_content = content.decode('utf-8', errors='replace')
+            lines = jsonl_content.strip().split('\n')
+            
+            processed_articles = []
+            
+            for line_num, line in enumerate(lines, 1):
+                if not line.strip():
+                    continue
+                
+                try:
+                    # Parse each JSON line
+                    data = json.loads(line)
+                    
+                    # Handle helpjuice export format
+                    if 'payload' in data:
+                        payload = data['payload']
+                        article_id = data.get('id', f'article_{line_num}')
+                        
+                        # Extract fields
+                        question = payload.get('question', '')
+                        answer = payload.get('answer', '')
+                        language = payload.get('language', 'EN')
+                        
+                        # Combine question and answer for better search
+                        if question or answer:
+                            # Format as Q&A for better context
+                            article_text = f"## {question}\n\n{answer}\n\n"
+                            
+                            # Add language tag if not English
+                            if language != 'EN':
+                                article_text = f"[{language}] {article_text}"
+                            
+                            # Add metadata separator
+                            article_text += f"---\nArticle ID: {article_id}\nLanguage: {language}\n\n"
+                            
+                            processed_articles.append(article_text)
+                    
+                    # Handle generic JSONL format
+                    else:
+                        # Convert the entire JSON object to readable text
+                        json_text = json.dumps(data, indent=2, ensure_ascii=False)
+                        processed_articles.append(json_text + "\n\n")
+                        
+                except json.JSONDecodeError as e:
+                    logger.warning(f"Error parsing JSONL line {line_num}: {e}")
+                    continue
+                except Exception as e:
+                    logger.warning(f"Error processing JSONL line {line_num}: {e}")
+                    continue
+            
+            # Combine all articles
+            combined_text = '\n'.join(processed_articles)
+            
+            logger.info(f"Successfully processed {len(processed_articles)} articles from JSONL file {filename}")
+            return combined_text
+            
+        except Exception as e:
+            logger.error(f"Error processing JSONL file {filename}: {e}")
+            return ""
+    
    def _generate_document_id(self, content: str, metadata: Dict[str, Any]) -> str:
        """Generate unique document ID"""
        content_hash = hashlib.sha256(content.encode()).hexdigest()[:16]