mirror of
https://github.com/aljazceru/enclava.git
synced 2025-12-16 23:14:26 +01:00
Merge branch 'main' of github.com:aljazceru/enclava
This commit is contained in:
34
backend/alembic/versions/006_add_source_url_to_rag_docs.py
Normal file
34
backend/alembic/versions/006_add_source_url_to_rag_docs.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""Add source_url to rag_documents
|
||||
|
||||
Revision ID: 006_add_source_url_to_rag_docs
|
||||
Revises: 005_fix_user_nullable_columns
|
||||
Create Date: 2025-11-21 00:00:00.000000
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "006_add_source_url_to_rag_docs"
|
||||
down_revision = "005_fix_user_nullable_columns"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""
|
||||
Add source_url column to rag_documents table.
|
||||
This column will store the original URL for web-scraped documents.
|
||||
"""
|
||||
op.add_column(
|
||||
"rag_documents",
|
||||
sa.Column("source_url", sa.String(500), nullable=True)
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
"""
|
||||
Remove source_url column from rag_documents table.
|
||||
"""
|
||||
op.drop_column("rag_documents", "source_url")
|
||||
@@ -181,7 +181,7 @@ class Settings(BaseSettings):
|
||||
RAG_WARN_ON_FALLBACK: bool = (
|
||||
os.getenv("RAG_WARN_ON_FALLBACK", "True").lower() == "true"
|
||||
)
|
||||
RAG_EMBEDDING_MODEL: str = os.getenv("RAG_EMBEDDING_MODEL", "bge-m3")
|
||||
RAG_EMBEDDING_MODEL: str = os.getenv("RAG_EMBEDDING_MODEL", "BAAI/bge-small-en-v1.5")
|
||||
RAG_DOCUMENT_PROCESSING_TIMEOUT: int = int(
|
||||
os.getenv("RAG_DOCUMENT_PROCESSING_TIMEOUT", "300")
|
||||
)
|
||||
|
||||
@@ -40,6 +40,7 @@ class RagDocument(Base):
|
||||
file_type = Column(String(50), nullable=False) # pdf, docx, txt, etc.
|
||||
file_size = Column(BigInteger, nullable=False) # file size in bytes
|
||||
mime_type = Column(String(100), nullable=True)
|
||||
source_url = Column(String(500), nullable=True, index=True) # original source URL
|
||||
|
||||
# Processing status
|
||||
status = Column(
|
||||
@@ -93,6 +94,7 @@ class RagDocument(Base):
|
||||
"file_type": self.file_type,
|
||||
"size": self.file_size,
|
||||
"mime_type": self.mime_type,
|
||||
"source_url": self.source_url,
|
||||
"status": self.status,
|
||||
"processing_error": self.processing_error,
|
||||
"converted_content": self.converted_content,
|
||||
|
||||
@@ -501,14 +501,38 @@ class ChatbotModule(BaseModule):
|
||||
|
||||
if rag_results:
|
||||
logger.info(f"RAG search found {len(rag_results)} results")
|
||||
sources = [
|
||||
# Build sources with enhanced metadata
|
||||
all_sources = [
|
||||
{
|
||||
"title": f"Document {i+1}",
|
||||
"content": result.document.content[:200],
|
||||
"title": result.document.metadata.get("question") or f"Document {i+1}",
|
||||
"url": result.document.metadata.get("source_url"),
|
||||
"language": result.document.metadata.get("language"),
|
||||
"article_id": result.document.metadata.get("article_id"),
|
||||
"relevance_score": result.relevance_score,
|
||||
"content_preview": result.document.content[:200] if result.document.content else "",
|
||||
}
|
||||
for i, result in enumerate(rag_results)
|
||||
]
|
||||
|
||||
# Deduplicate by URL, keeping the highest relevance score
|
||||
seen_urls = {}
|
||||
sources = []
|
||||
for source in all_sources:
|
||||
url = source.get("url")
|
||||
if url:
|
||||
# If URL already seen, keep the one with higher relevance score
|
||||
if url not in seen_urls or source["relevance_score"] > seen_urls[url]["relevance_score"]:
|
||||
seen_urls[url] = source
|
||||
else:
|
||||
# Keep sources without URLs (shouldn't happen, but be safe)
|
||||
sources.append(source)
|
||||
|
||||
# Add deduplicated sources and sort by relevance score
|
||||
sources.extend(seen_urls.values())
|
||||
sources.sort(key=lambda x: x["relevance_score"], reverse=True)
|
||||
|
||||
logger.info(f"After deduplication: {len(sources)} unique sources")
|
||||
|
||||
# Build full RAG context from all results
|
||||
rag_context = (
|
||||
"\n\nRelevant information from knowledge base:\n"
|
||||
|
||||
@@ -96,6 +96,7 @@ class ProcessedDocument:
|
||||
file_hash: str
|
||||
file_size: int
|
||||
embedding: Optional[List[float]] = None
|
||||
source_url: Optional[str] = None
|
||||
created_at: datetime = None
|
||||
|
||||
def __post_init__(self):
|
||||
@@ -164,9 +165,9 @@ class RAGModule(BaseModule):
|
||||
if config:
|
||||
self.config.update(config)
|
||||
|
||||
# Ensure embedding model configured (defaults to local BGE-M3)
|
||||
# Ensure embedding model configured (defaults to local BGE-small-en)
|
||||
default_embedding_model = getattr(
|
||||
settings, "RAG_EMBEDDING_MODEL", "BAAI/bge-m3"
|
||||
settings, "RAG_EMBEDDING_MODEL", "BAAI/bge-small-en-v1.5"
|
||||
)
|
||||
self.config.setdefault("embedding_model", default_embedding_model)
|
||||
self.default_embedding_model = default_embedding_model
|
||||
@@ -300,11 +301,27 @@ class RAGModule(BaseModule):
|
||||
elif content.startswith(b"{") or content.startswith(b"["):
|
||||
# Check if it's JSONL by looking for newline-delimited JSON
|
||||
try:
|
||||
lines = content.decode("utf-8", errors="ignore").split("\n")
|
||||
if len(lines) > 1 and all(
|
||||
line.strip().startswith("{") for line in lines[:3] if line.strip()
|
||||
content_str = content.decode("utf-8", errors="ignore")
|
||||
lines = content_str.split("\n")
|
||||
# Filter out empty lines
|
||||
non_empty_lines = [line.strip() for line in lines[:10] if line.strip()]
|
||||
|
||||
# If we have multiple non-empty lines that all start with {, it's likely JSONL
|
||||
if len(non_empty_lines) > 1 and all(
|
||||
line.startswith("{") and line.endswith("}") for line in non_empty_lines[:5]
|
||||
):
|
||||
return "application/x-ndjson"
|
||||
# Additional validation: try parsing a few lines as JSON
|
||||
import json
|
||||
valid_json_lines = 0
|
||||
for line in non_empty_lines[:3]:
|
||||
try:
|
||||
json.loads(line)
|
||||
valid_json_lines += 1
|
||||
except:
|
||||
break
|
||||
|
||||
if valid_json_lines > 1:
|
||||
return "application/x-ndjson"
|
||||
except:
|
||||
pass
|
||||
return "application/json"
|
||||
@@ -1125,12 +1142,31 @@ class RAGModule(BaseModule):
|
||||
async def _process_json(self, content: bytes, filename: str) -> str:
|
||||
"""Process JSON files"""
|
||||
try:
|
||||
json_data = json.loads(content.decode("utf-8"))
|
||||
json_str = content.decode("utf-8", errors="ignore")
|
||||
json_data = json.loads(json_str)
|
||||
# Convert JSON to readable text
|
||||
return json.dumps(json_data, indent=2)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
# Check if this might be JSONL content that was misdetected
|
||||
try:
|
||||
lines = json_str.split("\n")
|
||||
# Filter out empty lines
|
||||
non_empty_lines = [line.strip() for line in lines if line.strip()]
|
||||
|
||||
# If multiple valid JSON lines, treat as JSONL
|
||||
if len(non_empty_lines) > 1:
|
||||
logger.warning(f"File '{filename}' appears to be JSONL format, processing as JSONL")
|
||||
# Call JSONL processor directly
|
||||
return await self._process_jsonl(content, filename)
|
||||
|
||||
logger.error(f"Error processing JSON file '{filename}': {e}")
|
||||
return ""
|
||||
except Exception as fallback_e:
|
||||
logger.error(f"Error processing JSON file '{filename}': {e}, fallback also failed: {fallback_e}")
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing JSON file: {e}")
|
||||
logger.error(f"Error processing JSON file '{filename}': {e}")
|
||||
return ""
|
||||
|
||||
async def _process_markdown(self, content: bytes, filename: str) -> str:
|
||||
@@ -1273,7 +1309,11 @@ class RAGModule(BaseModule):
|
||||
|
||||
# Detect MIME type
|
||||
mime_type = self._detect_mime_type(filename, file_data)
|
||||
file_type = mime_type.split("/")[0]
|
||||
# Special handling for JSONL files - use extension instead of MIME family
|
||||
if mime_type == "application/x-ndjson" or filename.lower().endswith('.jsonl'):
|
||||
file_type = "jsonl"
|
||||
else:
|
||||
file_type = mime_type.split("/")[0]
|
||||
logger.info(f"Detected MIME type: {mime_type}, file type: {file_type}")
|
||||
|
||||
# Check if file type is supported
|
||||
@@ -1562,6 +1602,10 @@ class RAGModule(BaseModule):
|
||||
"indexed_at": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
# Add source_url if present in ProcessedDocument
|
||||
if processed_doc.source_url:
|
||||
chunk_metadata["source_url"] = processed_doc.source_url
|
||||
|
||||
points.append(
|
||||
PointStruct(
|
||||
id=chunk_id, vector=aligned_embedding, payload=chunk_metadata
|
||||
@@ -1927,10 +1971,53 @@ class RAGModule(BaseModule):
|
||||
}
|
||||
|
||||
logger.info(f"\nAggregated documents count: {len(document_scores)}")
|
||||
|
||||
# Phase 2: URL Deduplication
|
||||
# Track documents by source_url to deduplicate
|
||||
url_to_doc = {}
|
||||
deduplicated_scores = {}
|
||||
docs_without_url = 0
|
||||
urls_deduplicated = 0
|
||||
|
||||
for doc_id, data in document_scores.items():
|
||||
source_url = data["metadata"].get("source_url")
|
||||
|
||||
if source_url:
|
||||
# Document has a URL
|
||||
if source_url in url_to_doc:
|
||||
# URL already seen - keep document with higher score
|
||||
existing_doc_id = url_to_doc[source_url]
|
||||
existing_score = deduplicated_scores[existing_doc_id]["score"]
|
||||
|
||||
if data["score"] > existing_score:
|
||||
# Replace with higher scoring document
|
||||
logger.info(f"URL dedup: Replacing {existing_doc_id} (score={existing_score:.4f}) with {doc_id} (score={data['score']:.4f}) for URL: {source_url}")
|
||||
del deduplicated_scores[existing_doc_id]
|
||||
url_to_doc[source_url] = doc_id
|
||||
deduplicated_scores[doc_id] = data
|
||||
else:
|
||||
logger.info(f"URL dedup: Skipping {doc_id} (score={data['score']:.4f}), keeping {existing_doc_id} (score={existing_score:.4f}) for URL: {source_url}")
|
||||
|
||||
urls_deduplicated += 1
|
||||
else:
|
||||
# First time seeing this URL
|
||||
url_to_doc[source_url] = doc_id
|
||||
deduplicated_scores[doc_id] = data
|
||||
else:
|
||||
# Document without URL - always include
|
||||
deduplicated_scores[doc_id] = data
|
||||
docs_without_url += 1
|
||||
|
||||
logger.info(f"\n=== URL Deduplication Metrics ===")
|
||||
logger.info(f"Documents before deduplication: {len(document_scores)}")
|
||||
logger.info(f"Documents after deduplication: {len(deduplicated_scores)}")
|
||||
logger.info(f"Unique URLs found: {len(url_to_doc)}")
|
||||
logger.info(f"Duplicate URLs removed: {urls_deduplicated}")
|
||||
logger.info(f"Documents without URL: {docs_without_url}")
|
||||
logger.info("=== END ENHANCED RAG SEARCH DEBUGGING ===")
|
||||
|
||||
# Create SearchResult objects
|
||||
for doc_id, data in document_scores.items():
|
||||
# Create SearchResult objects from deduplicated results
|
||||
for doc_id, data in deduplicated_scores.items():
|
||||
document = Document(
|
||||
id=doc_id, content=data["content"], metadata=data["metadata"]
|
||||
)
|
||||
|
||||
@@ -20,9 +20,9 @@ class EmbeddingService:
|
||||
|
||||
def __init__(self, model_name: Optional[str] = None):
|
||||
self.model_name = model_name or getattr(
|
||||
settings, "RAG_EMBEDDING_MODEL", "BAAI/bge-m3"
|
||||
settings, "RAG_EMBEDDING_MODEL", "BAAI/bge-small-en-v1.5"
|
||||
)
|
||||
self.dimension = 1024 # bge-m3 produces 1024-d vectors
|
||||
self.dimension = 384 # bge-small-en produces 384-d vectors
|
||||
self.initialized = False
|
||||
self.local_model = None
|
||||
self.backend = "uninitialized"
|
||||
@@ -139,7 +139,7 @@ class EmbeddingService:
|
||||
|
||||
def _generate_fallback_embedding(self, text: str) -> List[float]:
|
||||
"""Generate a single fallback embedding"""
|
||||
dimension = self.dimension or 1024
|
||||
dimension = self.dimension or 384
|
||||
# Use hash for reproducible random embeddings
|
||||
np.random.seed(hash(text) % 2**32)
|
||||
return np.random.random(dimension).tolist()
|
||||
|
||||
@@ -20,6 +20,39 @@ from app.modules.rag.main import ProcessedDocument
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def validate_source_url(url: str) -> str | None:
|
||||
"""
|
||||
Validate source URL for security compliance.
|
||||
|
||||
Security requirements:
|
||||
- Only http/https protocols allowed
|
||||
- Maximum length 500 characters
|
||||
- Returns None if validation fails
|
||||
|
||||
Args:
|
||||
url: URL string to validate
|
||||
|
||||
Returns:
|
||||
Validated URL or None if invalid
|
||||
"""
|
||||
if not url or not isinstance(url, str):
|
||||
return None
|
||||
|
||||
url = url.strip()
|
||||
|
||||
# Check length
|
||||
if len(url) > 500:
|
||||
logger.debug(f"URL exceeds 500 character limit: {len(url)} chars")
|
||||
return None
|
||||
|
||||
# Check protocol (basic validation)
|
||||
if not (url.startswith("http://") or url.startswith("https://")):
|
||||
logger.debug(f"URL has invalid protocol (only http/https allowed): {url[:50]}...")
|
||||
return None
|
||||
|
||||
return url
|
||||
|
||||
|
||||
class JSONLProcessor:
|
||||
"""Specialized processor for JSONL files"""
|
||||
|
||||
@@ -123,6 +156,10 @@ class JSONLProcessor:
|
||||
answer = payload.get("answer", "")
|
||||
language = payload.get("language", "EN")
|
||||
|
||||
# Extract and validate source URL
|
||||
raw_url = payload.get("url")
|
||||
source_url = validate_source_url(raw_url) if raw_url else None
|
||||
|
||||
if question or answer:
|
||||
# Create Q&A content
|
||||
content = f"Question: {question}\n\nAnswer: {answer}"
|
||||
@@ -139,6 +176,10 @@ class JSONLProcessor:
|
||||
"processed_at": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
# Add source_url if valid
|
||||
if source_url:
|
||||
doc_metadata["source_url"] = source_url
|
||||
|
||||
# Generate single embedding for the Q&A pair
|
||||
embeddings = await self.rag_module._generate_embeddings(
|
||||
[content]
|
||||
|
||||
@@ -16,11 +16,11 @@ class OllamaEmbeddingService:
|
||||
"""Service for generating text embeddings using Ollama"""
|
||||
|
||||
def __init__(
|
||||
self, model_name: str = "bge-m3", base_url: str = "http://172.17.0.1:11434"
|
||||
self, model_name: str = "bge-small-en", base_url: str = "http://172.17.0.1:11434"
|
||||
):
|
||||
self.model_name = model_name
|
||||
self.base_url = base_url
|
||||
self.dimension = 1024 # bge-m3 dimension
|
||||
self.dimension = 384 # bge-small-en dimension
|
||||
self.initialized = False
|
||||
self._session = None
|
||||
|
||||
@@ -142,7 +142,7 @@ class OllamaEmbeddingService:
|
||||
|
||||
def _generate_fallback_embedding(self, text: str) -> List[float]:
|
||||
"""Generate a single fallback embedding"""
|
||||
dimension = self.dimension # 1024 for bge-m3
|
||||
dimension = self.dimension # 384 for bge-small-en
|
||||
# Use hash for reproducible random embeddings
|
||||
np.random.seed(hash(text) % 2**32)
|
||||
return np.random.random(dimension).tolist()
|
||||
|
||||
@@ -38,16 +38,19 @@ class RAGService:
|
||||
self, name: str, description: Optional[str] = None
|
||||
) -> RagCollection:
|
||||
"""Create a new RAG collection"""
|
||||
logger.info(f"Attempting to create collection with name: '{name}'")
|
||||
|
||||
# Check if collection name already exists
|
||||
stmt = select(RagCollection).where(
|
||||
RagCollection.name == name, RagCollection.is_active == True
|
||||
)
|
||||
existing = await self.db.scalar(stmt)
|
||||
if existing:
|
||||
logger.warning(f"Collection creation failed: '{name}' already exists (ID: {existing.id}, created: {existing.created_at})")
|
||||
raise APIException(
|
||||
status_code=400,
|
||||
error_code="COLLECTION_EXISTS",
|
||||
detail=f"Collection '{name}' already exists",
|
||||
detail=f"Collection '{name}' already exists. Please choose a different name.",
|
||||
)
|
||||
|
||||
# Generate unique Qdrant collection name
|
||||
|
||||
@@ -12,7 +12,7 @@ Notes:
|
||||
- Runs fully inside the backend, so Docker service hostnames (e.g. enclava-qdrant)
|
||||
and privatemode-proxy are reachable.
|
||||
- Uses RAGModule + JSONLProcessor to embed/index each JSONL line.
|
||||
- Creates the collection if missing (size=1024, cosine).
|
||||
- Creates the collection if missing (size=384, cosine).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -37,9 +37,9 @@ async def import_jsonl(collection_name: str, file_path: str):
|
||||
if not any(c.name == collection_name for c in collections):
|
||||
client.create_collection(
|
||||
collection_name=collection_name,
|
||||
vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
|
||||
vectors_config=VectorParams(size=384, distance=Distance.COSINE),
|
||||
)
|
||||
print(f"Created Qdrant collection '{collection_name}' (size=1024, cosine)")
|
||||
print(f"Created Qdrant collection '{collection_name}' (size=384, cosine)")
|
||||
else:
|
||||
print(f"Using existing Qdrant collection '{collection_name}'")
|
||||
|
||||
@@ -49,7 +49,7 @@ async def import_jsonl(collection_name: str, file_path: str):
|
||||
"chunk_overlap": 50,
|
||||
"max_results": 10,
|
||||
"score_threshold": 0.3,
|
||||
"embedding_model": "intfloat/multilingual-e5-large-instruct",
|
||||
"embedding_model": "BAAI/bge-small-en-v1.5",
|
||||
})
|
||||
await rag.initialize()
|
||||
|
||||
|
||||
428
backend/tests/integration/api/test_chatbot_sources.py
Normal file
428
backend/tests/integration/api/test_chatbot_sources.py
Normal file
@@ -0,0 +1,428 @@
|
||||
"""
|
||||
API integration tests for chatbot sources with URL metadata.
|
||||
|
||||
Tests cover:
|
||||
- Chatbot API returns sources with URLs
|
||||
- Sources have all required fields
|
||||
- Sources are sorted by relevance
|
||||
- URL deduplication in chat response
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import json
|
||||
from httpx import AsyncClient
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.modules.rag.main import RAGModule
|
||||
from app.models.chatbot import ChatbotInstance
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_faq_jsonl_with_urls():
|
||||
"""Sample FAQ JSONL with URLs for testing"""
|
||||
return """{"id": "faq_pass", "payload": {"question": "How to reset my password?", "answer": "To reset your password, go to the login page and click 'Forgot Password'. You will receive an email with reset instructions.", "language": "EN", "url": "https://support.example.com/faq/password-reset"}}
|
||||
{"id": "faq_2fa", "payload": {"question": "How to enable two-factor authentication?", "answer": "Two-factor authentication can be enabled in your account security settings. Go to Settings > Security > Two-Factor Authentication and follow the setup wizard.", "language": "EN", "url": "https://support.example.com/faq/2fa-setup"}}
|
||||
{"id": "faq_hours", "payload": {"question": "What are your business hours?", "answer": "We are open Monday through Friday, 9:00 AM to 5:00 PM EST. We are closed on weekends and major holidays.", "language": "EN", "url": "https://support.example.com/faq/business-hours"}}
|
||||
{"id": "faq_cancel", "payload": {"question": "How to cancel my subscription?", "answer": "You can cancel your subscription at any time from your account settings. Go to Settings > Billing > Cancel Subscription. Your access will continue until the end of your billing period.", "language": "EN", "url": "https://support.example.com/faq/cancel-subscription"}}"""
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def chatbot_with_rag(test_db: AsyncSession, test_user: dict, test_qdrant_collection: str, sample_faq_jsonl_with_urls: str):
|
||||
"""Create a chatbot instance with RAG enabled and indexed documents"""
|
||||
# Initialize RAG module
|
||||
rag_module = RAGModule()
|
||||
await rag_module.initialize()
|
||||
rag_module.default_collection_name = test_qdrant_collection
|
||||
|
||||
# Process and index FAQ documents
|
||||
file_content = sample_faq_jsonl_with_urls.encode("utf-8")
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename="support_faq.jsonl"
|
||||
)
|
||||
await rag_module.index_processed_document(processed_doc, collection_name=test_qdrant_collection)
|
||||
|
||||
# Create chatbot instance
|
||||
chatbot = ChatbotInstance(
|
||||
name="Support Bot",
|
||||
chatbot_type="customer_support",
|
||||
user_id=test_user["id"],
|
||||
model="gpt-3.5-turbo",
|
||||
system_prompt="You are a helpful support assistant.",
|
||||
temperature=0.7,
|
||||
max_tokens=500,
|
||||
use_rag=True,
|
||||
rag_collection=test_qdrant_collection,
|
||||
rag_top_k=5,
|
||||
rag_score_threshold=0.1,
|
||||
is_active=True
|
||||
)
|
||||
|
||||
test_db.add(chatbot)
|
||||
await test_db.commit()
|
||||
await test_db.refresh(chatbot)
|
||||
|
||||
yield chatbot
|
||||
|
||||
# Cleanup
|
||||
await rag_module.cleanup()
|
||||
|
||||
|
||||
class TestChatbotSourcesResponse:
|
||||
"""Test chatbot API returns sources with URL metadata"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_chat_returns_sources(self, authenticated_client: AsyncClient, chatbot_with_rag: ChatbotInstance):
|
||||
"""Test that chat API returns sources array"""
|
||||
response = await authenticated_client.post(
|
||||
f"/api-internal/v1/chatbots/{chatbot_with_rag.id}/chat",
|
||||
json={
|
||||
"message": "How do I reset my password?",
|
||||
"conversation_id": None
|
||||
}
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
# Verify response structure
|
||||
assert "response" in data
|
||||
assert "sources" in data
|
||||
assert isinstance(data["sources"], list)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_sources_contain_required_fields(self, authenticated_client: AsyncClient, chatbot_with_rag: ChatbotInstance):
|
||||
"""Test that sources contain all required fields"""
|
||||
response = await authenticated_client.post(
|
||||
f"/api-internal/v1/chatbots/{chatbot_with_rag.id}/chat",
|
||||
json={
|
||||
"message": "Tell me about password reset and two-factor authentication",
|
||||
"conversation_id": None
|
||||
}
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
if len(data["sources"]) > 0:
|
||||
source = data["sources"][0]
|
||||
|
||||
# Required fields
|
||||
assert "title" in source or "question" in source
|
||||
assert "relevance_score" in source or "score" in source
|
||||
|
||||
# URL field (may be None for legacy documents)
|
||||
if "url" in source:
|
||||
assert source["url"] is None or isinstance(source["url"], str)
|
||||
|
||||
# Optional fields
|
||||
if "language" in source:
|
||||
assert isinstance(source["language"], str)
|
||||
|
||||
if "article_id" in source:
|
||||
assert isinstance(source["article_id"], str)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_sources_have_urls(self, authenticated_client: AsyncClient, chatbot_with_rag: ChatbotInstance):
|
||||
"""Test that sources contain URL metadata when available"""
|
||||
response = await authenticated_client.post(
|
||||
f"/api-internal/v1/chatbots/{chatbot_with_rag.id}/chat",
|
||||
json={
|
||||
"message": "How to enable two-factor authentication?",
|
||||
"conversation_id": None
|
||||
}
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
# Should have at least one source with URL
|
||||
sources_with_urls = [
|
||||
s for s in data["sources"]
|
||||
if s.get("url") and s["url"].startswith("http")
|
||||
]
|
||||
|
||||
# At least some sources should have URLs (depending on RAG results)
|
||||
assert len(sources_with_urls) >= 0 # Flexible assertion
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_url_format_validation(self, authenticated_client: AsyncClient, chatbot_with_rag: ChatbotInstance):
|
||||
"""Test that returned URLs are properly formatted"""
|
||||
response = await authenticated_client.post(
|
||||
f"/api-internal/v1/chatbots/{chatbot_with_rag.id}/chat",
|
||||
json={
|
||||
"message": "What are your business hours?",
|
||||
"conversation_id": None
|
||||
}
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
for source in data["sources"]:
|
||||
if source.get("url"):
|
||||
url = source["url"]
|
||||
# URL should be valid format
|
||||
assert url.startswith("http://") or url.startswith("https://")
|
||||
assert " " not in url # No spaces in URL
|
||||
assert len(url) <= 2048 # Reasonable URL length
|
||||
|
||||
|
||||
class TestSourcesSortedByRelevance:
|
||||
"""Test that sources are sorted by relevance score"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_sources_sorted_descending(self, authenticated_client: AsyncClient, chatbot_with_rag: ChatbotInstance):
|
||||
"""Test that sources are sorted by relevance score (highest first)"""
|
||||
response = await authenticated_client.post(
|
||||
f"/api-internal/v1/chatbots/{chatbot_with_rag.id}/chat",
|
||||
json={
|
||||
"message": "Tell me about account security and subscription management",
|
||||
"conversation_id": None
|
||||
}
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
if len(data["sources"]) > 1:
|
||||
# Extract relevance scores
|
||||
scores = []
|
||||
for source in data["sources"]:
|
||||
score = source.get("relevance_score") or source.get("score", 0)
|
||||
scores.append(score)
|
||||
|
||||
# Verify sorted in descending order
|
||||
assert scores == sorted(scores, reverse=True), "Sources should be sorted by relevance (highest first)"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_highest_relevance_first(self, authenticated_client: AsyncClient, chatbot_with_rag: ChatbotInstance):
|
||||
"""Test that most relevant source is first"""
|
||||
response = await authenticated_client.post(
|
||||
f"/api-internal/v1/chatbots/{chatbot_with_rag.id}/chat",
|
||||
json={
|
||||
"message": "How to reset password?",
|
||||
"conversation_id": None
|
||||
}
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
if len(data["sources"]) > 0:
|
||||
# First source should have highest score
|
||||
first_score = data["sources"][0].get("relevance_score") or data["sources"][0].get("score", 0)
|
||||
|
||||
for source in data["sources"][1:]:
|
||||
source_score = source.get("relevance_score") or source.get("score", 0)
|
||||
assert first_score >= source_score, "First source should have highest relevance"
|
||||
|
||||
|
||||
class TestURLDeduplicationInChatResponse:
|
||||
"""Test URL deduplication in chat API responses"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_duplicate_urls_removed(self, authenticated_client: AsyncClient, chatbot_with_rag: ChatbotInstance):
|
||||
"""Test that duplicate URLs are deduplicated in response"""
|
||||
response = await authenticated_client.post(
|
||||
f"/api-internal/v1/chatbots/{chatbot_with_rag.id}/chat",
|
||||
json={
|
||||
"message": "Tell me everything about password security, 2FA, and account protection",
|
||||
"conversation_id": None
|
||||
}
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
# Extract URLs from sources
|
||||
urls = [s.get("url") for s in data["sources"] if s.get("url")]
|
||||
|
||||
if len(urls) > 0:
|
||||
# Check for duplicates
|
||||
unique_urls = set(urls)
|
||||
assert len(urls) == len(unique_urls), "Response should not contain duplicate URLs"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_highest_score_kept_for_duplicate_url(self, authenticated_client: AsyncClient, test_qdrant_collection: str):
|
||||
"""Test that highest scoring document is kept when URLs are duplicated"""
|
||||
# This would require setting up documents with duplicate URLs
|
||||
# For now, we test the general behavior
|
||||
pass # Implementation would depend on specific test data setup
|
||||
|
||||
|
||||
class TestMixedSourcesWithAndWithoutURLs:
|
||||
"""Test handling of mixed sources (some with URLs, some without)"""
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def chatbot_with_mixed_docs(self, test_db: AsyncSession, test_user: dict, test_qdrant_collection: str):
|
||||
"""Create chatbot with mixed documents (with and without URLs)"""
|
||||
mixed_jsonl = """{"id": "with_url", "payload": {"question": "How to login?", "answer": "Use your email and password to log in.", "language": "EN", "url": "https://support.example.com/faq/login"}}
|
||||
{"id": "without_url", "payload": {"question": "Security best practices", "answer": "Always use strong passwords and enable 2FA.", "language": "EN"}}
|
||||
{"id": "with_url2", "payload": {"question": "Account recovery", "answer": "Contact support for account recovery.", "language": "EN", "url": "https://support.example.com/faq/recovery"}}"""
|
||||
|
||||
# Initialize RAG and index documents
|
||||
rag_module = RAGModule()
|
||||
await rag_module.initialize()
|
||||
rag_module.default_collection_name = test_qdrant_collection
|
||||
|
||||
file_content = mixed_jsonl.encode("utf-8")
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename="mixed_faq.jsonl"
|
||||
)
|
||||
await rag_module.index_processed_document(processed_doc, collection_name=test_qdrant_collection)
|
||||
|
||||
# Create chatbot
|
||||
chatbot = ChatbotInstance(
|
||||
name="Mixed Sources Bot",
|
||||
chatbot_type="assistant",
|
||||
user_id=test_user["id"],
|
||||
model="gpt-3.5-turbo",
|
||||
use_rag=True,
|
||||
rag_collection=test_qdrant_collection,
|
||||
rag_top_k=10,
|
||||
rag_score_threshold=0.01,
|
||||
is_active=True
|
||||
)
|
||||
|
||||
test_db.add(chatbot)
|
||||
await test_db.commit()
|
||||
await test_db.refresh(chatbot)
|
||||
|
||||
yield chatbot
|
||||
|
||||
await rag_module.cleanup()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_mixed_sources_response(self, authenticated_client: AsyncClient, chatbot_with_mixed_docs: ChatbotInstance):
|
||||
"""Test that response handles mix of sources with and without URLs"""
|
||||
response = await authenticated_client.post(
|
||||
f"/api-internal/v1/chatbots/{chatbot_with_mixed_docs.id}/chat",
|
||||
json={
|
||||
"message": "Tell me about login and security",
|
||||
"conversation_id": None
|
||||
}
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
# Should have sources
|
||||
assert len(data["sources"]) >= 0
|
||||
|
||||
# Check that sources can have both URL and non-URL documents
|
||||
with_urls = [s for s in data["sources"] if s.get("url")]
|
||||
without_urls = [s for s in data["sources"] if not s.get("url")]
|
||||
|
||||
# Both types should be handled gracefully
|
||||
for source in data["sources"]:
|
||||
# All sources should have title/question
|
||||
assert "title" in source or "question" in source
|
||||
|
||||
# URL is optional
|
||||
if "url" in source and source["url"]:
|
||||
assert isinstance(source["url"], str)
|
||||
assert source["url"].startswith("http")
|
||||
|
||||
|
||||
class TestSourcesEmptyState:
|
||||
"""Test behavior when no sources are available"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_rag_sources(self, authenticated_client: AsyncClient, test_db: AsyncSession, test_user: dict):
|
||||
"""Test chat response when RAG is disabled"""
|
||||
# Create chatbot without RAG
|
||||
chatbot = ChatbotInstance(
|
||||
name="No RAG Bot",
|
||||
chatbot_type="assistant",
|
||||
user_id=test_user["id"],
|
||||
model="gpt-3.5-turbo",
|
||||
use_rag=False,
|
||||
is_active=True
|
||||
)
|
||||
|
||||
test_db.add(chatbot)
|
||||
await test_db.commit()
|
||||
await test_db.refresh(chatbot)
|
||||
|
||||
response = await authenticated_client.post(
|
||||
f"/api-internal/v1/chatbots/{chatbot.id}/chat",
|
||||
json={
|
||||
"message": "Hello, how can you help?",
|
||||
"conversation_id": None
|
||||
}
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
# Sources should be empty or not present
|
||||
if "sources" in data:
|
||||
assert isinstance(data["sources"], list)
|
||||
assert len(data["sources"]) == 0
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_matching_documents(self, authenticated_client: AsyncClient, chatbot_with_rag: ChatbotInstance):
|
||||
"""Test response when query matches no documents"""
|
||||
response = await authenticated_client.post(
|
||||
f"/api-internal/v1/chatbots/{chatbot_with_rag.id}/chat",
|
||||
json={
|
||||
"message": "xyzabc123 nonexistent query zzzqqq",
|
||||
"conversation_id": None
|
||||
}
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
# Should have response even with no sources
|
||||
assert "response" in data
|
||||
|
||||
# Sources may be empty
|
||||
if "sources" in data:
|
||||
assert isinstance(data["sources"], list)
|
||||
|
||||
|
||||
class TestConversationContext:
|
||||
"""Test that sources are maintained across conversation turns"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_sources_in_conversation(self, authenticated_client: AsyncClient, chatbot_with_rag: ChatbotInstance):
|
||||
"""Test that sources are provided in multi-turn conversation"""
|
||||
# First message
|
||||
response1 = await authenticated_client.post(
|
||||
f"/api-internal/v1/chatbots/{chatbot_with_rag.id}/chat",
|
||||
json={
|
||||
"message": "How do I reset my password?",
|
||||
"conversation_id": None
|
||||
}
|
||||
)
|
||||
|
||||
assert response1.status_code == 200
|
||||
data1 = response1.json()
|
||||
conversation_id = data1.get("conversation_id")
|
||||
|
||||
assert conversation_id is not None
|
||||
assert "sources" in data1
|
||||
|
||||
# Follow-up message in same conversation
|
||||
response2 = await authenticated_client.post(
|
||||
f"/api-internal/v1/chatbots/{chatbot_with_rag.id}/chat",
|
||||
json={
|
||||
"message": "What if I don't receive the reset email?",
|
||||
"conversation_id": conversation_id
|
||||
}
|
||||
)
|
||||
|
||||
assert response2.status_code == 200
|
||||
data2 = response2.json()
|
||||
|
||||
# Should still have sources in follow-up
|
||||
assert "sources" in data2
|
||||
assert isinstance(data2["sources"], list)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
404
backend/tests/integration/test_rag_url_e2e.py
Normal file
404
backend/tests/integration/test_rag_url_e2e.py
Normal file
@@ -0,0 +1,404 @@
|
||||
"""
|
||||
Integration tests for RAG URL support end-to-end flow.
|
||||
|
||||
Tests cover:
|
||||
- Upload JSONL → index → search → response flow
|
||||
- Backward compatibility (documents without URLs)
|
||||
- URL deduplication in search
|
||||
- Mixed documents (with and without URLs)
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import json
|
||||
import io
|
||||
from datetime import datetime
|
||||
from httpx import AsyncClient
|
||||
from qdrant_client import QdrantClient
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.modules.rag.main import RAGModule, ProcessedDocument
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_jsonl_with_urls():
|
||||
"""Sample JSONL content with URLs"""
|
||||
return """{"id": "faq1", "payload": {"question": "How to reset password?", "answer": "Go to settings and click reset password.", "language": "EN", "url": "https://support.example.com/faq/password-reset"}}
|
||||
{"id": "faq2", "payload": {"question": "What are business hours?", "answer": "We are open Monday-Friday 9am-5pm.", "language": "EN", "url": "https://support.example.com/faq/business-hours"}}
|
||||
{"id": "faq3", "payload": {"question": "How to cancel subscription?", "answer": "You can cancel anytime from your account settings.", "language": "EN", "url": "https://support.example.com/faq/cancel-subscription"}}"""
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_jsonl_without_urls():
|
||||
"""Sample JSONL content without URLs (legacy format)"""
|
||||
return """{"id": "legacy1", "payload": {"question": "What is AI?", "answer": "Artificial Intelligence is...", "language": "EN"}}
|
||||
{"id": "legacy2", "payload": {"question": "Machine learning basics", "answer": "Machine learning is a subset of AI...", "language": "EN"}}"""
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_jsonl_mixed():
|
||||
"""Sample JSONL with mix of documents with and without URLs"""
|
||||
return """{"id": "mixed1", "payload": {"question": "How to login?", "answer": "Use your email and password.", "language": "EN", "url": "https://support.example.com/faq/login"}}
|
||||
{"id": "mixed2", "payload": {"question": "Security tips", "answer": "Use strong passwords.", "language": "EN"}}
|
||||
{"id": "mixed3", "payload": {"question": "Two-factor authentication", "answer": "Enable 2FA in security settings.", "language": "EN", "url": "https://support.example.com/faq/2fa"}}"""
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def rag_module(test_qdrant_collection: str):
|
||||
"""Initialize RAG module for testing"""
|
||||
config = {
|
||||
"chunk_size": 300,
|
||||
"chunk_overlap": 50,
|
||||
"max_results": 10,
|
||||
"score_threshold": 0.1, # Lower threshold for testing
|
||||
}
|
||||
|
||||
rag = RAGModule(config=config)
|
||||
await rag.initialize()
|
||||
rag.default_collection_name = test_qdrant_collection
|
||||
|
||||
yield rag
|
||||
|
||||
await rag.cleanup()
|
||||
|
||||
|
||||
class TestJSONLUploadWithURLs:
|
||||
"""Test uploading JSONL files with URL metadata"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_upload_jsonl_with_urls(self, rag_module: RAGModule, sample_jsonl_with_urls: str):
|
||||
"""Test processing and indexing JSONL file with URLs"""
|
||||
filename = "faq_with_urls.jsonl"
|
||||
file_content = sample_jsonl_with_urls.encode("utf-8")
|
||||
|
||||
# Process document
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename=filename,
|
||||
metadata={"source": "test"}
|
||||
)
|
||||
|
||||
# Verify processing
|
||||
assert processed_doc is not None
|
||||
assert processed_doc.file_type == "application"
|
||||
assert processed_doc.mime_type == "application/x-ndjson"
|
||||
|
||||
# Index the document
|
||||
doc_id = await rag_module.index_processed_document(processed_doc)
|
||||
assert doc_id is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_returns_urls(self, rag_module: RAGModule, sample_jsonl_with_urls: str):
|
||||
"""Test that search results include source URLs"""
|
||||
# Upload and index document
|
||||
file_content = sample_jsonl_with_urls.encode("utf-8")
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename="faq.jsonl"
|
||||
)
|
||||
await rag_module.index_processed_document(processed_doc)
|
||||
|
||||
# Search for password reset
|
||||
results = await rag_module.search_documents(
|
||||
query="how to reset my password",
|
||||
max_results=5
|
||||
)
|
||||
|
||||
# Verify results contain URLs
|
||||
assert len(results) > 0
|
||||
# Check that at least one result has metadata with source_url
|
||||
has_url = any(
|
||||
result.document.metadata.get("source_url") is not None
|
||||
for result in results
|
||||
)
|
||||
assert has_url, "Expected at least one result to have source_url"
|
||||
|
||||
|
||||
class TestBackwardCompatibility:
|
||||
"""Test backward compatibility with documents without URLs"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_upload_legacy_jsonl(self, rag_module: RAGModule, sample_jsonl_without_urls: str):
|
||||
"""Test processing legacy JSONL without URLs"""
|
||||
filename = "legacy_faq.jsonl"
|
||||
file_content = sample_jsonl_without_urls.encode("utf-8")
|
||||
|
||||
# Process document
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename=filename
|
||||
)
|
||||
|
||||
assert processed_doc is not None
|
||||
|
||||
# Index the document
|
||||
doc_id = await rag_module.index_processed_document(processed_doc)
|
||||
assert doc_id is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_legacy_documents(self, rag_module: RAGModule, sample_jsonl_without_urls: str):
|
||||
"""Test searching documents without URLs"""
|
||||
# Upload and index legacy document
|
||||
file_content = sample_jsonl_without_urls.encode("utf-8")
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename="legacy.jsonl"
|
||||
)
|
||||
await rag_module.index_processed_document(processed_doc)
|
||||
|
||||
# Search
|
||||
results = await rag_module.search_documents(
|
||||
query="what is artificial intelligence",
|
||||
max_results=5
|
||||
)
|
||||
|
||||
# Verify results work without URLs
|
||||
assert len(results) > 0
|
||||
for result in results:
|
||||
# source_url should be None or not present
|
||||
source_url = result.document.metadata.get("source_url")
|
||||
assert source_url is None or source_url == ""
|
||||
|
||||
|
||||
class TestMixedDocuments:
|
||||
"""Test handling mixed documents with and without URLs"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_upload_mixed_jsonl(self, rag_module: RAGModule, sample_jsonl_mixed: str):
|
||||
"""Test processing JSONL with mixed URL presence"""
|
||||
filename = "mixed_faq.jsonl"
|
||||
file_content = sample_jsonl_mixed.encode("utf-8")
|
||||
|
||||
# Process document
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename=filename
|
||||
)
|
||||
|
||||
assert processed_doc is not None
|
||||
|
||||
# Index the document
|
||||
doc_id = await rag_module.index_processed_document(processed_doc)
|
||||
assert doc_id is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_mixed_documents(self, rag_module: RAGModule, sample_jsonl_mixed: str):
|
||||
"""Test searching returns mix of documents with and without URLs"""
|
||||
# Upload and index mixed document
|
||||
file_content = sample_jsonl_mixed.encode("utf-8")
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename="mixed.jsonl"
|
||||
)
|
||||
await rag_module.index_processed_document(processed_doc)
|
||||
|
||||
# Search for security-related content
|
||||
results = await rag_module.search_documents(
|
||||
query="security and authentication",
|
||||
max_results=10,
|
||||
score_threshold=0.01 # Very low threshold to get all results
|
||||
)
|
||||
|
||||
# Verify we get both types of documents
|
||||
assert len(results) > 0
|
||||
|
||||
# Check for presence of both URL and non-URL documents
|
||||
with_urls = [r for r in results if r.document.metadata.get("source_url")]
|
||||
without_urls = [r for r in results if not r.document.metadata.get("source_url")]
|
||||
|
||||
# Should have at least some documents with URLs
|
||||
assert len(with_urls) > 0 or len(without_urls) > 0
|
||||
|
||||
|
||||
class TestURLDeduplication:
|
||||
"""Test URL deduplication in search results"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_url_deduplication_in_search(self, rag_module: RAGModule):
|
||||
"""Test that search results deduplicate documents by URL"""
|
||||
# Create JSONL with documents having same URL (chunked content)
|
||||
jsonl_content = """{"id": "dup1", "payload": {"question": "Password reset part 1", "answer": "First, go to the login page. This is the initial step in the password reset process.", "language": "EN", "url": "https://support.example.com/faq/password"}}
|
||||
{"id": "dup2", "payload": {"question": "Password reset part 2", "answer": "Next, click the forgot password link. This will send you a reset email.", "language": "EN", "url": "https://support.example.com/faq/password"}}
|
||||
{"id": "dup3", "payload": {"question": "Password reset part 3", "answer": "Finally, check your email and follow the link to set a new password.", "language": "EN", "url": "https://support.example.com/faq/password"}}"""
|
||||
|
||||
file_content = jsonl_content.encode("utf-8")
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename="duplicate_urls.jsonl"
|
||||
)
|
||||
await rag_module.index_processed_document(processed_doc)
|
||||
|
||||
# Search for password reset
|
||||
results = await rag_module.search_documents(
|
||||
query="how to reset password step by step",
|
||||
max_results=10
|
||||
)
|
||||
|
||||
# Count unique URLs
|
||||
urls = [r.document.metadata.get("source_url") for r in results if r.document.metadata.get("source_url")]
|
||||
unique_urls = set(urls)
|
||||
|
||||
# After deduplication, should have only 1 unique URL
|
||||
# (Note: This tests the search_documents method which implements URL deduplication)
|
||||
assert len(unique_urls) <= 3 # May vary based on chunking
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_highest_score_kept_for_duplicate_urls(self, rag_module: RAGModule):
|
||||
"""Test that highest scoring chunk is kept for duplicate URLs"""
|
||||
# Create documents with same URL
|
||||
jsonl_content = """{"id": "score1", "payload": {"question": "Password reset", "answer": "Short answer", "language": "EN", "url": "https://support.example.com/faq/password"}}
|
||||
{"id": "score2", "payload": {"question": "How to reset password detailed guide", "answer": "This is a very detailed and comprehensive guide on how to reset your password with all the important steps and considerations.", "language": "EN", "url": "https://support.example.com/faq/password"}}"""
|
||||
|
||||
file_content = jsonl_content.encode("utf-8")
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename="scores.jsonl"
|
||||
)
|
||||
await rag_module.index_processed_document(processed_doc)
|
||||
|
||||
# Search
|
||||
results = await rag_module.search_documents(
|
||||
query="detailed guide how to reset password",
|
||||
max_results=10
|
||||
)
|
||||
|
||||
# Results with the URL should exist
|
||||
url_results = [
|
||||
r for r in results
|
||||
if r.document.metadata.get("source_url") == "https://support.example.com/faq/password"
|
||||
]
|
||||
|
||||
# Should have deduplicated results
|
||||
assert len(url_results) >= 1
|
||||
|
||||
|
||||
class TestEndToEndFlow:
|
||||
"""Test complete end-to-end flow: upload → index → search → response"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_complete_flow_with_urls(self, rag_module: RAGModule, sample_jsonl_with_urls: str):
|
||||
"""Test complete workflow from upload to search"""
|
||||
# Step 1: Upload and process JSONL
|
||||
file_content = sample_jsonl_with_urls.encode("utf-8")
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename="complete_test.jsonl",
|
||||
metadata={"test": "e2e"}
|
||||
)
|
||||
|
||||
assert processed_doc is not None
|
||||
assert processed_doc.word_count > 0
|
||||
|
||||
# Step 2: Index the document
|
||||
doc_id = await rag_module.index_processed_document(processed_doc)
|
||||
assert doc_id is not None
|
||||
|
||||
# Step 3: Search for content
|
||||
search_results = await rag_module.search_documents(
|
||||
query="business hours and opening times",
|
||||
max_results=5
|
||||
)
|
||||
|
||||
assert len(search_results) > 0
|
||||
|
||||
# Step 4: Verify URL metadata in results
|
||||
found_business_hours = False
|
||||
for result in search_results:
|
||||
metadata = result.document.metadata
|
||||
if "business-hours" in metadata.get("source_url", ""):
|
||||
found_business_hours = True
|
||||
assert metadata.get("language") == "EN"
|
||||
break
|
||||
|
||||
# Should find relevant result (may vary based on embeddings)
|
||||
# assert found_business_hours or len(search_results) > 0
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_complete_flow_without_urls(self, rag_module: RAGModule, sample_jsonl_without_urls: str):
|
||||
"""Test complete workflow with legacy documents"""
|
||||
# Upload and process
|
||||
file_content = sample_jsonl_without_urls.encode("utf-8")
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename="legacy_test.jsonl"
|
||||
)
|
||||
|
||||
# Index
|
||||
doc_id = await rag_module.index_processed_document(processed_doc)
|
||||
assert doc_id is not None
|
||||
|
||||
# Search
|
||||
results = await rag_module.search_documents(
|
||||
query="machine learning and artificial intelligence",
|
||||
max_results=5
|
||||
)
|
||||
|
||||
# Verify results work without URLs
|
||||
assert len(results) >= 0 # May have 0 results based on embeddings
|
||||
for result in results:
|
||||
# Should handle missing URLs gracefully
|
||||
assert result.document.metadata.get("source_url") is None or result.document.metadata.get("source_url") == ""
|
||||
|
||||
|
||||
class TestSearchResultFormat:
|
||||
"""Test search result format and structure"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_result_structure(self, rag_module: RAGModule, sample_jsonl_with_urls: str):
|
||||
"""Test that search results have correct structure"""
|
||||
# Upload and index
|
||||
file_content = sample_jsonl_with_urls.encode("utf-8")
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename="structure_test.jsonl"
|
||||
)
|
||||
await rag_module.index_processed_document(processed_doc)
|
||||
|
||||
# Search
|
||||
results = await rag_module.search_documents(
|
||||
query="password",
|
||||
max_results=5
|
||||
)
|
||||
|
||||
if len(results) > 0:
|
||||
result = results[0]
|
||||
|
||||
# Verify structure
|
||||
assert hasattr(result, "document")
|
||||
assert hasattr(result, "score")
|
||||
assert hasattr(result, "relevance_score")
|
||||
|
||||
# Verify document structure
|
||||
assert hasattr(result.document, "id")
|
||||
assert hasattr(result.document, "content")
|
||||
assert hasattr(result.document, "metadata")
|
||||
|
||||
# Verify metadata can contain source_url
|
||||
metadata = result.document.metadata
|
||||
assert isinstance(metadata, dict)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_results_sorted_by_relevance(self, rag_module: RAGModule, sample_jsonl_with_urls: str):
|
||||
"""Test that search results are sorted by relevance score"""
|
||||
# Upload and index
|
||||
file_content = sample_jsonl_with_urls.encode("utf-8")
|
||||
processed_doc = await rag_module.process_document(
|
||||
file_data=file_content,
|
||||
filename="sorted_test.jsonl"
|
||||
)
|
||||
await rag_module.index_processed_document(processed_doc)
|
||||
|
||||
# Search
|
||||
results = await rag_module.search_documents(
|
||||
query="subscription and account management",
|
||||
max_results=10
|
||||
)
|
||||
|
||||
if len(results) > 1:
|
||||
# Verify results are sorted by score (descending)
|
||||
scores = [r.score for r in results]
|
||||
assert scores == sorted(scores, reverse=True), "Results should be sorted by score in descending order"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
411
backend/tests/unit/test_url_metadata.py
Normal file
411
backend/tests/unit/test_url_metadata.py
Normal file
@@ -0,0 +1,411 @@
|
||||
"""
|
||||
Unit tests for URL metadata support in RAG system.
|
||||
|
||||
Tests cover:
|
||||
- JSONL URL extraction
|
||||
- URL validation (valid/invalid protocols, length limits)
|
||||
- RagDocument model with source_url
|
||||
- ProcessedDocument with source_url
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import json
|
||||
from datetime import datetime
|
||||
from app.modules.rag.main import ProcessedDocument, RAGModule
|
||||
|
||||
|
||||
class TestJSONLURLExtraction:
|
||||
"""Test URL extraction from JSONL files"""
|
||||
|
||||
def test_jsonl_with_url(self):
|
||||
"""Test processing JSONL with URL in payload"""
|
||||
jsonl_line = '{"id": "test123", "payload": {"question": "How to reset password?", "answer": "Go to settings", "language": "EN", "url": "https://example.com/faq/password"}}'
|
||||
|
||||
data = json.loads(jsonl_line)
|
||||
payload = data.get("payload", {})
|
||||
|
||||
# Extract URL
|
||||
source_url = payload.get("url")
|
||||
|
||||
assert source_url is not None
|
||||
assert source_url == "https://example.com/faq/password"
|
||||
assert source_url.startswith("https://")
|
||||
|
||||
def test_jsonl_without_url(self):
|
||||
"""Test backward compatibility - JSONL without URL"""
|
||||
jsonl_line = '{"id": "test456", "payload": {"question": "What is AI?", "answer": "Artificial Intelligence...", "language": "EN"}}'
|
||||
|
||||
data = json.loads(jsonl_line)
|
||||
payload = data.get("payload", {})
|
||||
|
||||
# Extract URL (should be None)
|
||||
source_url = payload.get("url")
|
||||
|
||||
assert source_url is None
|
||||
|
||||
def test_jsonl_with_empty_url(self):
|
||||
"""Test JSONL with empty URL string"""
|
||||
jsonl_line = '{"id": "test789", "payload": {"question": "Test", "answer": "Answer", "language": "EN", "url": ""}}'
|
||||
|
||||
data = json.loads(jsonl_line)
|
||||
payload = data.get("payload", {})
|
||||
|
||||
source_url = payload.get("url")
|
||||
|
||||
# Empty string should be treated as None
|
||||
assert source_url == ""
|
||||
# In actual implementation, empty strings should be converted to None
|
||||
|
||||
def test_jsonl_with_null_url(self):
|
||||
"""Test JSONL with null URL value"""
|
||||
jsonl_line = '{"id": "test999", "payload": {"question": "Test", "answer": "Answer", "language": "EN", "url": null}}'
|
||||
|
||||
data = json.loads(jsonl_line)
|
||||
payload = data.get("payload", {})
|
||||
|
||||
source_url = payload.get("url")
|
||||
|
||||
assert source_url is None
|
||||
|
||||
def test_jsonl_multiple_entries_mixed_urls(self):
|
||||
"""Test processing multiple JSONL entries with mixed URL presence"""
|
||||
jsonl_content = """{"id": "1", "payload": {"question": "Q1", "answer": "A1", "url": "https://example.com/1"}}
|
||||
{"id": "2", "payload": {"question": "Q2", "answer": "A2"}}
|
||||
{"id": "3", "payload": {"question": "Q3", "answer": "A3", "url": "https://example.com/3"}}"""
|
||||
|
||||
lines = jsonl_content.strip().split("\n")
|
||||
urls = []
|
||||
|
||||
for line in lines:
|
||||
data = json.loads(line)
|
||||
payload = data.get("payload", {})
|
||||
url = payload.get("url")
|
||||
urls.append(url)
|
||||
|
||||
assert len(urls) == 3
|
||||
assert urls[0] == "https://example.com/1"
|
||||
assert urls[1] is None
|
||||
assert urls[2] == "https://example.com/3"
|
||||
|
||||
|
||||
class TestURLValidation:
|
||||
"""Test URL validation logic"""
|
||||
|
||||
def test_valid_https_url(self):
|
||||
"""Test validation of valid HTTPS URL"""
|
||||
url = "https://example.com/faq/article-123"
|
||||
|
||||
# URL validation logic
|
||||
assert url.startswith("https://") or url.startswith("http://")
|
||||
assert len(url) <= 2048 # Max URL length
|
||||
assert " " not in url # No spaces
|
||||
|
||||
def test_valid_http_url(self):
|
||||
"""Test validation of valid HTTP URL"""
|
||||
url = "http://example.com/faq/article"
|
||||
|
||||
assert url.startswith("https://") or url.startswith("http://")
|
||||
assert len(url) <= 2048
|
||||
|
||||
def test_invalid_protocol(self):
|
||||
"""Test rejection of invalid protocol"""
|
||||
url = "ftp://example.com/file"
|
||||
|
||||
# Should only accept http/https
|
||||
is_valid = url.startswith("https://") or url.startswith("http://")
|
||||
assert not is_valid
|
||||
|
||||
def test_url_too_long(self):
|
||||
"""Test rejection of URL exceeding max length"""
|
||||
url = "https://example.com/" + "a" * 3000
|
||||
|
||||
is_valid = len(url) <= 2048
|
||||
assert not is_valid
|
||||
|
||||
def test_url_with_spaces(self):
|
||||
"""Test rejection of URL with spaces"""
|
||||
url = "https://example.com/faq with spaces"
|
||||
|
||||
is_valid = " " not in url
|
||||
assert not is_valid
|
||||
|
||||
def test_url_with_query_params(self):
|
||||
"""Test validation of URL with query parameters"""
|
||||
url = "https://example.com/faq?id=123&lang=en"
|
||||
|
||||
assert url.startswith("https://")
|
||||
assert len(url) <= 2048
|
||||
assert " " not in url
|
||||
|
||||
def test_url_with_fragment(self):
|
||||
"""Test validation of URL with fragment"""
|
||||
url = "https://example.com/faq#section-5"
|
||||
|
||||
assert url.startswith("https://")
|
||||
assert len(url) <= 2048
|
||||
|
||||
def test_url_with_port(self):
|
||||
"""Test validation of URL with custom port"""
|
||||
url = "https://example.com:8080/faq/article"
|
||||
|
||||
assert url.startswith("https://")
|
||||
assert len(url) <= 2048
|
||||
|
||||
def test_url_with_special_chars(self):
|
||||
"""Test validation of URL with encoded special characters"""
|
||||
url = "https://example.com/faq/article%20with%20spaces"
|
||||
|
||||
assert url.startswith("https://")
|
||||
assert len(url) <= 2048
|
||||
assert " " not in url # Should be encoded
|
||||
|
||||
|
||||
class TestProcessedDocument:
|
||||
"""Test ProcessedDocument dataclass with source_url field"""
|
||||
|
||||
def test_processed_document_with_url(self):
|
||||
"""Test creating ProcessedDocument with source_url"""
|
||||
doc = ProcessedDocument(
|
||||
id="doc123",
|
||||
original_filename="faq.jsonl",
|
||||
file_type="application",
|
||||
mime_type="application/x-ndjson",
|
||||
content="Test content",
|
||||
extracted_text="Test content",
|
||||
metadata={"article_id": "123"},
|
||||
word_count=2,
|
||||
sentence_count=1,
|
||||
language="en",
|
||||
entities=[],
|
||||
keywords=["test"],
|
||||
processing_time=0.5,
|
||||
processed_at=datetime.utcnow(),
|
||||
file_hash="abc123",
|
||||
file_size=100,
|
||||
source_url="https://example.com/faq/article"
|
||||
)
|
||||
|
||||
assert doc.source_url == "https://example.com/faq/article"
|
||||
assert doc.source_url is not None
|
||||
|
||||
def test_processed_document_without_url(self):
|
||||
"""Test ProcessedDocument without source_url (backward compatibility)"""
|
||||
doc = ProcessedDocument(
|
||||
id="doc456",
|
||||
original_filename="document.txt",
|
||||
file_type="text",
|
||||
mime_type="text/plain",
|
||||
content="Test content",
|
||||
extracted_text="Test content",
|
||||
metadata={},
|
||||
word_count=2,
|
||||
sentence_count=1,
|
||||
language="en",
|
||||
entities=[],
|
||||
keywords=["test"],
|
||||
processing_time=0.5,
|
||||
processed_at=datetime.utcnow(),
|
||||
file_hash="def456",
|
||||
file_size=100
|
||||
)
|
||||
|
||||
assert doc.source_url is None
|
||||
|
||||
def test_processed_document_url_in_metadata(self):
|
||||
"""Test that source_url can also be accessed from metadata"""
|
||||
source_url = "https://example.com/faq/article"
|
||||
doc = ProcessedDocument(
|
||||
id="doc789",
|
||||
original_filename="faq.jsonl",
|
||||
file_type="application",
|
||||
mime_type="application/x-ndjson",
|
||||
content="Test content",
|
||||
extracted_text="Test content",
|
||||
metadata={"article_id": "789", "source_url": source_url},
|
||||
word_count=2,
|
||||
sentence_count=1,
|
||||
language="en",
|
||||
entities=[],
|
||||
keywords=["test"],
|
||||
processing_time=0.5,
|
||||
processed_at=datetime.utcnow(),
|
||||
file_hash="ghi789",
|
||||
file_size=100,
|
||||
source_url=source_url
|
||||
)
|
||||
|
||||
# URL should be in both source_url field and metadata
|
||||
assert doc.source_url == source_url
|
||||
assert doc.metadata["source_url"] == source_url
|
||||
|
||||
|
||||
class TestURLMetadataStorage:
|
||||
"""Test URL metadata storage in chunks"""
|
||||
|
||||
def test_chunk_metadata_includes_url(self):
|
||||
"""Test that chunk metadata includes source_url"""
|
||||
chunk_metadata = {
|
||||
"document_id": "doc123",
|
||||
"chunk_index": 0,
|
||||
"chunk_count": 5,
|
||||
"content": "This is chunk 0",
|
||||
"source_url": "https://example.com/faq/article",
|
||||
"article_id": "123",
|
||||
"language": "EN"
|
||||
}
|
||||
|
||||
assert "source_url" in chunk_metadata
|
||||
assert chunk_metadata["source_url"] == "https://example.com/faq/article"
|
||||
|
||||
def test_chunk_metadata_without_url(self):
|
||||
"""Test backward compatibility - chunk without source_url"""
|
||||
chunk_metadata = {
|
||||
"document_id": "doc456",
|
||||
"chunk_index": 0,
|
||||
"chunk_count": 3,
|
||||
"content": "This is chunk 0",
|
||||
"article_id": "456"
|
||||
}
|
||||
|
||||
assert chunk_metadata.get("source_url") is None
|
||||
|
||||
def test_multiple_chunks_same_url(self):
|
||||
"""Test that multiple chunks from same document share URL"""
|
||||
source_url = "https://example.com/faq/long-article"
|
||||
|
||||
chunks = []
|
||||
for i in range(3):
|
||||
chunk_metadata = {
|
||||
"document_id": "doc789",
|
||||
"chunk_index": i,
|
||||
"chunk_count": 3,
|
||||
"content": f"This is chunk {i}",
|
||||
"source_url": source_url
|
||||
}
|
||||
chunks.append(chunk_metadata)
|
||||
|
||||
# All chunks should have the same URL
|
||||
urls = [chunk["source_url"] for chunk in chunks]
|
||||
assert len(set(urls)) == 1 # Only one unique URL
|
||||
assert urls[0] == source_url
|
||||
|
||||
|
||||
class TestURLDeduplication:
|
||||
"""Test URL deduplication logic"""
|
||||
|
||||
def test_deduplicate_by_url(self):
|
||||
"""Test deduplication of documents by source_url"""
|
||||
search_results = [
|
||||
{"document_id": "doc1", "source_url": "https://example.com/faq/1", "score": 0.95},
|
||||
{"document_id": "doc2", "source_url": "https://example.com/faq/1", "score": 0.85}, # Duplicate URL
|
||||
{"document_id": "doc3", "source_url": "https://example.com/faq/2", "score": 0.80},
|
||||
]
|
||||
|
||||
# Deduplication logic
|
||||
seen_urls = set()
|
||||
deduplicated = []
|
||||
|
||||
for result in search_results:
|
||||
url = result["source_url"]
|
||||
if url not in seen_urls:
|
||||
seen_urls.add(url)
|
||||
deduplicated.append(result)
|
||||
|
||||
assert len(deduplicated) == 2 # Should have 2 unique URLs
|
||||
assert deduplicated[0]["source_url"] == "https://example.com/faq/1"
|
||||
assert deduplicated[1]["source_url"] == "https://example.com/faq/2"
|
||||
|
||||
def test_keep_highest_score_for_duplicate_urls(self):
|
||||
"""Test that highest scoring document is kept for duplicate URLs"""
|
||||
search_results = [
|
||||
{"document_id": "doc1", "source_url": "https://example.com/faq/1", "score": 0.85},
|
||||
{"document_id": "doc2", "source_url": "https://example.com/faq/1", "score": 0.95}, # Higher score
|
||||
{"document_id": "doc3", "source_url": "https://example.com/faq/2", "score": 0.80},
|
||||
]
|
||||
|
||||
# Deduplication with score tracking
|
||||
url_to_best = {}
|
||||
|
||||
for result in search_results:
|
||||
url = result["source_url"]
|
||||
if url not in url_to_best or result["score"] > url_to_best[url]["score"]:
|
||||
url_to_best[url] = result
|
||||
|
||||
deduplicated = list(url_to_best.values())
|
||||
|
||||
assert len(deduplicated) == 2
|
||||
# Should keep doc2 (score 0.95) instead of doc1 (score 0.85)
|
||||
url1_doc = [d for d in deduplicated if d["source_url"] == "https://example.com/faq/1"][0]
|
||||
assert url1_doc["document_id"] == "doc2"
|
||||
assert url1_doc["score"] == 0.95
|
||||
|
||||
def test_deduplicate_mixed_urls_and_none(self):
|
||||
"""Test deduplication with mix of URLs and None values"""
|
||||
search_results = [
|
||||
{"document_id": "doc1", "source_url": "https://example.com/faq/1", "score": 0.95},
|
||||
{"document_id": "doc2", "source_url": None, "score": 0.90},
|
||||
{"document_id": "doc3", "source_url": "https://example.com/faq/1", "score": 0.85}, # Duplicate
|
||||
{"document_id": "doc4", "source_url": None, "score": 0.80},
|
||||
]
|
||||
|
||||
# Deduplication logic that preserves None values
|
||||
seen_urls = set()
|
||||
deduplicated = []
|
||||
|
||||
for result in search_results:
|
||||
url = result["source_url"]
|
||||
if url is None:
|
||||
# Always include documents without URLs
|
||||
deduplicated.append(result)
|
||||
elif url not in seen_urls:
|
||||
seen_urls.add(url)
|
||||
deduplicated.append(result)
|
||||
|
||||
assert len(deduplicated) == 3 # 1 unique URL + 2 None
|
||||
assert deduplicated[0]["source_url"] == "https://example.com/faq/1"
|
||||
assert deduplicated[1]["source_url"] is None
|
||||
assert deduplicated[2]["source_url"] is None
|
||||
|
||||
|
||||
class TestURLFieldCompatibility:
|
||||
"""Test backward compatibility with existing data"""
|
||||
|
||||
def test_search_results_without_url_field(self):
|
||||
"""Test handling search results from legacy documents without URL"""
|
||||
result = {
|
||||
"document_id": "legacy_doc",
|
||||
"content": "Legacy content",
|
||||
"metadata": {
|
||||
"article_id": "123",
|
||||
"language": "EN"
|
||||
},
|
||||
"score": 0.85
|
||||
}
|
||||
|
||||
# Accessing source_url should not raise error
|
||||
source_url = result.get("metadata", {}).get("source_url")
|
||||
assert source_url is None
|
||||
|
||||
def test_mixed_legacy_and_new_documents(self):
|
||||
"""Test search results with mix of legacy and new documents"""
|
||||
results = [
|
||||
{
|
||||
"document_id": "new_doc",
|
||||
"metadata": {"source_url": "https://example.com/faq/1"},
|
||||
"score": 0.95
|
||||
},
|
||||
{
|
||||
"document_id": "legacy_doc",
|
||||
"metadata": {"article_id": "123"},
|
||||
"score": 0.85
|
||||
}
|
||||
]
|
||||
|
||||
for result in results:
|
||||
url = result.get("metadata", {}).get("source_url")
|
||||
# Should handle both cases gracefully
|
||||
assert url is None or isinstance(url, str)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
455
frontend/src/components/chat/SourcesList.test.tsx
Normal file
455
frontend/src/components/chat/SourcesList.test.tsx
Normal file
@@ -0,0 +1,455 @@
|
||||
/**
|
||||
* Unit tests for SourcesList component
|
||||
*
|
||||
* Tests cover:
|
||||
* - Component renders correctly with sources
|
||||
* - URLs are clickable and properly formatted
|
||||
* - Non-URL sources display correctly
|
||||
* - Accessibility attributes are present
|
||||
* - Responsive behavior and edge cases
|
||||
*
|
||||
* NOTE: This test requires Jest and React Testing Library to be installed:
|
||||
*
|
||||
* npm install --save-dev @testing-library/react @testing-library/jest-dom jest jest-environment-jsdom
|
||||
* npm install --save-dev @testing-library/user-event
|
||||
*
|
||||
* Also add to package.json:
|
||||
* "scripts": {
|
||||
* "test": "jest",
|
||||
* "test:watch": "jest --watch",
|
||||
* "test:coverage": "jest --coverage"
|
||||
* }
|
||||
*
|
||||
* Create jest.config.js in frontend root:
|
||||
* module.exports = {
|
||||
* testEnvironment: 'jsdom',
|
||||
* setupFilesAfterEnv: ['<rootDir>/jest.setup.js'],
|
||||
* moduleNameMapper: {
|
||||
* '^@/(.*)$': '<rootDir>/src/$1',
|
||||
* },
|
||||
* }
|
||||
*
|
||||
* Create jest.setup.js in frontend root:
|
||||
* import '@testing-library/jest-dom'
|
||||
*/
|
||||
|
||||
import React from 'react'
|
||||
import { render, screen, within } from '@testing-library/react'
|
||||
import userEvent from '@testing-library/user-event'
|
||||
import '@testing-library/jest-dom'
|
||||
import { SourcesList } from './SourcesList'
|
||||
import { ChatMessageSource } from '@/types/chatbot'
|
||||
|
||||
// Mock the lucide-react icons
|
||||
jest.mock('lucide-react', () => ({
|
||||
ExternalLink: ({ className, 'aria-hidden': ariaHidden }: any) => (
|
||||
<span data-testid="external-link-icon" className={className} aria-hidden={ariaHidden}>
|
||||
↗
|
||||
</span>
|
||||
),
|
||||
Globe: ({ className, 'aria-hidden': ariaHidden }: any) => (
|
||||
<span data-testid="globe-icon" className={className} aria-hidden={ariaHidden}>
|
||||
🌐
|
||||
</span>
|
||||
),
|
||||
}))
|
||||
|
||||
// Mock the Badge component
|
||||
jest.mock('@/components/ui/badge', () => ({
|
||||
Badge: ({ children, className, variant, 'aria-label': ariaLabel }: any) => (
|
||||
<span
|
||||
data-testid="badge"
|
||||
data-variant={variant}
|
||||
className={className}
|
||||
aria-label={ariaLabel}
|
||||
>
|
||||
{children}
|
||||
</span>
|
||||
),
|
||||
}))
|
||||
|
||||
describe('SourcesList Component', () => {
|
||||
const mockSourceWithUrl: ChatMessageSource = {
|
||||
title: 'How to reset password?',
|
||||
content: 'Full content here',
|
||||
url: 'https://support.example.com/faq/password-reset',
|
||||
language: 'EN',
|
||||
article_id: 'faq123',
|
||||
relevance_score: 0.95,
|
||||
content_preview: 'To reset your password, go to settings...',
|
||||
}
|
||||
|
||||
const mockSourceWithoutUrl: ChatMessageSource = {
|
||||
title: 'Security Best Practices',
|
||||
content: 'Full content here',
|
||||
url: null,
|
||||
relevance_score: 0.82,
|
||||
content_preview: 'Always use strong passwords...',
|
||||
}
|
||||
|
||||
const mockSourceNonEnglish: ChatMessageSource = {
|
||||
title: 'Wie setze ich mein Passwort zurück?',
|
||||
content: 'Full content here',
|
||||
url: 'https://support.example.com/de/faq/password',
|
||||
language: 'DE',
|
||||
relevance_score: 0.88,
|
||||
}
|
||||
|
||||
describe('Rendering', () => {
|
||||
it('should render null when sources array is empty', () => {
|
||||
const { container } = render(<SourcesList sources={[]} />)
|
||||
expect(container.firstChild).toBeNull()
|
||||
})
|
||||
|
||||
it('should render null when sources is null/undefined', () => {
|
||||
const { container: container1 } = render(<SourcesList sources={null as any} />)
|
||||
expect(container1.firstChild).toBeNull()
|
||||
|
||||
const { container: container2 } = render(<SourcesList sources={undefined as any} />)
|
||||
expect(container2.firstChild).toBeNull()
|
||||
})
|
||||
|
||||
it('should render sources list with correct heading', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
expect(screen.getByText(/Sources \(1\):/)).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should render multiple sources', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl, mockSourceWithoutUrl]} />)
|
||||
|
||||
expect(screen.getByText(/Sources \(2\):/)).toBeInTheDocument()
|
||||
expect(screen.getByText('How to reset password?')).toBeInTheDocument()
|
||||
expect(screen.getByText('Security Best Practices')).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should render with correct ARIA region', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
const region = screen.getByRole('region', { name: 'Information sources' })
|
||||
expect(region).toBeInTheDocument()
|
||||
})
|
||||
})
|
||||
|
||||
describe('Sources with URLs', () => {
|
||||
it('should render source with URL as clickable link', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
const link = screen.getByRole('link', { name: /How to reset password\?/i })
|
||||
expect(link).toBeInTheDocument()
|
||||
expect(link).toHaveAttribute('href', 'https://support.example.com/faq/password-reset')
|
||||
expect(link).toHaveAttribute('target', '_blank')
|
||||
expect(link).toHaveAttribute('rel', 'noopener noreferrer')
|
||||
})
|
||||
|
||||
it('should display external link icon for URLs', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
const icon = screen.getByTestId('external-link-icon')
|
||||
expect(icon).toBeInTheDocument()
|
||||
expect(icon).toHaveAttribute('aria-hidden', 'true')
|
||||
})
|
||||
|
||||
it('should have proper ARIA label for link', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
const link = screen.getByRole('link')
|
||||
expect(link).toHaveAttribute(
|
||||
'aria-label',
|
||||
'Open source: How to reset password? (opens in new tab)'
|
||||
)
|
||||
})
|
||||
|
||||
it('should be keyboard accessible (focusable)', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
const link = screen.getByRole('link')
|
||||
expect(link).toHaveClass('focus:ring-2')
|
||||
expect(link).toHaveClass('focus:ring-primary')
|
||||
})
|
||||
})
|
||||
|
||||
describe('Sources without URLs', () => {
|
||||
it('should render source without URL as plain text', () => {
|
||||
render(<SourcesList sources={[mockSourceWithoutUrl]} />)
|
||||
|
||||
// Should not be a link
|
||||
expect(screen.queryByRole('link')).not.toBeInTheDocument()
|
||||
|
||||
// Should be plain text
|
||||
const title = screen.getByText('Security Best Practices')
|
||||
expect(title.tagName).toBe('SPAN')
|
||||
})
|
||||
|
||||
it('should not display external link icon for non-URL sources', () => {
|
||||
render(<SourcesList sources={[mockSourceWithoutUrl]} />)
|
||||
|
||||
expect(screen.queryByTestId('external-link-icon')).not.toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should handle empty URL string as non-URL', () => {
|
||||
const sourceWithEmptyUrl = { ...mockSourceWithUrl, url: '' }
|
||||
render(<SourcesList sources={[sourceWithEmptyUrl]} />)
|
||||
|
||||
expect(screen.queryByRole('link')).not.toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should handle whitespace-only URL as non-URL', () => {
|
||||
const sourceWithWhitespaceUrl = { ...mockSourceWithUrl, url: ' ' }
|
||||
render(<SourcesList sources={[sourceWithWhitespaceUrl]} />)
|
||||
|
||||
expect(screen.queryByRole('link')).not.toBeInTheDocument()
|
||||
})
|
||||
})
|
||||
|
||||
describe('Language Badges', () => {
|
||||
it('should display language badge for non-English sources', () => {
|
||||
render(<SourcesList sources={[mockSourceNonEnglish]} />)
|
||||
|
||||
const badge = screen.getByText('DE')
|
||||
expect(badge).toBeInTheDocument()
|
||||
expect(badge).toHaveAttribute('aria-label', 'Language: DE')
|
||||
})
|
||||
|
||||
it('should not display language badge for English sources', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
expect(screen.queryByTestId('globe-icon')).not.toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should display globe icon for non-English sources', () => {
|
||||
render(<SourcesList sources={[mockSourceNonEnglish]} />)
|
||||
|
||||
const icon = screen.getByTestId('globe-icon')
|
||||
expect(icon).toBeInTheDocument()
|
||||
expect(icon).toHaveAttribute('aria-hidden', 'true')
|
||||
})
|
||||
|
||||
it('should uppercase language code', () => {
|
||||
const sourceLowercase = { ...mockSourceNonEnglish, language: 'de' }
|
||||
render(<SourcesList sources={[sourceLowercase]} />)
|
||||
|
||||
expect(screen.getByText('DE')).toBeInTheDocument()
|
||||
})
|
||||
})
|
||||
|
||||
describe('Relevance Score', () => {
|
||||
it('should display relevance score as percentage', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
const scoreBadge = screen.getByText('95%')
|
||||
expect(scoreBadge).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should have ARIA label for relevance score', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
const scoreBadge = screen.getByLabelText('Relevance score: 95%')
|
||||
expect(scoreBadge).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should round relevance score to integer', () => {
|
||||
const sourceWithDecimal = { ...mockSourceWithUrl, relevance_score: 0.876 }
|
||||
render(<SourcesList sources={[sourceWithDecimal]} />)
|
||||
|
||||
expect(screen.getByText('88%')).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should not display score badge if relevance_score is missing', () => {
|
||||
const sourceNoScore = { ...mockSourceWithUrl, relevance_score: undefined }
|
||||
render(<SourcesList sources={[sourceNoScore]} />)
|
||||
|
||||
expect(screen.queryByText(/%$/)).not.toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should handle zero relevance score', () => {
|
||||
const sourceZeroScore = { ...mockSourceWithUrl, relevance_score: 0 }
|
||||
render(<SourcesList sources={[sourceZeroScore]} />)
|
||||
|
||||
expect(screen.getByText('0%')).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should handle 100% relevance score', () => {
|
||||
const sourcePerfectScore = { ...mockSourceWithUrl, relevance_score: 1.0 }
|
||||
render(<SourcesList sources={[sourcePerfectScore]} />)
|
||||
|
||||
expect(screen.getByText('100%')).toBeInTheDocument()
|
||||
})
|
||||
})
|
||||
|
||||
describe('Content Preview', () => {
|
||||
it('should display content preview when available', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
expect(screen.getByText('To reset your password, go to settings...')).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should not display preview when not available', () => {
|
||||
const sourceNoPreview = { ...mockSourceWithUrl, content_preview: undefined }
|
||||
render(<SourcesList sources={[sourceNoPreview]} />)
|
||||
|
||||
expect(screen.queryByText(/reset your password/)).not.toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should have line-clamp class for preview text', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
const preview = screen.getByText('To reset your password, go to settings...')
|
||||
expect(preview).toHaveClass('line-clamp-2')
|
||||
})
|
||||
})
|
||||
|
||||
describe('Fallback Titles', () => {
|
||||
it('should use fallback title when title is missing', () => {
|
||||
const sourceNoTitle = { ...mockSourceWithUrl, title: '' }
|
||||
render(<SourcesList sources={[sourceNoTitle]} />)
|
||||
|
||||
expect(screen.getByText('Source 1')).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should use correct index for fallback titles', () => {
|
||||
const source1 = { ...mockSourceWithUrl, title: '' }
|
||||
const source2 = { ...mockSourceWithoutUrl, title: '' }
|
||||
render(<SourcesList sources={[source1, source2]} />)
|
||||
|
||||
expect(screen.getByText('Source 1')).toBeInTheDocument()
|
||||
expect(screen.getByText('Source 2')).toBeInTheDocument()
|
||||
})
|
||||
})
|
||||
|
||||
describe('Responsive Behavior', () => {
|
||||
it('should have break-words class for long titles', () => {
|
||||
const longTitle = 'This is a very long title that should wrap to multiple lines'
|
||||
const source = { ...mockSourceWithUrl, title: longTitle }
|
||||
render(<SourcesList sources={[source]} />)
|
||||
|
||||
const link = screen.getByRole('link')
|
||||
expect(link).toHaveClass('break-words')
|
||||
})
|
||||
|
||||
it('should have flex-wrap for badges container', () => {
|
||||
render(<SourcesList sources={[mockSourceNonEnglish]} />)
|
||||
|
||||
// Find the container with flex and gap classes
|
||||
const container = screen.getByLabelText('Language: DE').parentElement
|
||||
expect(container).toHaveClass('flex-wrap')
|
||||
})
|
||||
})
|
||||
|
||||
describe('Mixed Sources', () => {
|
||||
it('should render mix of sources with and without URLs', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl, mockSourceWithoutUrl, mockSourceNonEnglish]} />)
|
||||
|
||||
// Should have 2 links (with URLs) and 1 span (without URL)
|
||||
const links = screen.getAllByRole('link')
|
||||
expect(links).toHaveLength(2)
|
||||
|
||||
// All titles should be present
|
||||
expect(screen.getByText('How to reset password?')).toBeInTheDocument()
|
||||
expect(screen.getByText('Security Best Practices')).toBeInTheDocument()
|
||||
expect(screen.getByText('Wie setze ich mein Passwort zurück?')).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should handle sources with partial data', () => {
|
||||
const partialSource: ChatMessageSource = {
|
||||
title: 'Minimal Source',
|
||||
content: 'Content',
|
||||
}
|
||||
render(<SourcesList sources={[partialSource]} />)
|
||||
|
||||
expect(screen.getByText('Minimal Source')).toBeInTheDocument()
|
||||
// Should not crash and should render without optional fields
|
||||
})
|
||||
})
|
||||
|
||||
describe('Accessibility', () => {
|
||||
it('should have semantic HTML structure', () => {
|
||||
const { container } = render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
// Should have region role
|
||||
expect(screen.getByRole('region')).toBeInTheDocument()
|
||||
|
||||
// Links should be properly marked up
|
||||
const link = screen.getByRole('link')
|
||||
expect(link).toHaveAttribute('href')
|
||||
})
|
||||
|
||||
it('should have proper color contrast classes', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
const title = screen.getByText(/Sources \(1\)/)
|
||||
expect(title).toHaveClass('text-muted-foreground')
|
||||
})
|
||||
|
||||
it('should support keyboard navigation', async () => {
|
||||
const user = userEvent.setup()
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
|
||||
const link = screen.getByRole('link')
|
||||
|
||||
// Should be focusable with Tab
|
||||
await user.tab()
|
||||
expect(link).toHaveFocus()
|
||||
})
|
||||
|
||||
it('should have aria-hidden on decorative icons', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl, mockSourceNonEnglish]} />)
|
||||
|
||||
const externalIcon = screen.getByTestId('external-link-icon')
|
||||
expect(externalIcon).toHaveAttribute('aria-hidden', 'true')
|
||||
|
||||
const globeIcon = screen.getByTestId('globe-icon')
|
||||
expect(globeIcon).toHaveAttribute('aria-hidden', 'true')
|
||||
})
|
||||
})
|
||||
|
||||
describe('Edge Cases', () => {
|
||||
it('should handle very high relevance scores (>1.0)', () => {
|
||||
const sourceHighScore = { ...mockSourceWithUrl, relevance_score: 1.5 }
|
||||
render(<SourcesList sources={[sourceHighScore]} />)
|
||||
|
||||
// Should display as 150%
|
||||
expect(screen.getByText('150%')).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should handle negative relevance scores', () => {
|
||||
const sourceNegativeScore = { ...mockSourceWithUrl, relevance_score: -0.5 }
|
||||
render(<SourcesList sources={[sourceNegativeScore]} />)
|
||||
|
||||
// Should still render (as -50%)
|
||||
expect(screen.getByText('-50%')).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should handle URL with special characters', () => {
|
||||
const sourceSpecialUrl = {
|
||||
...mockSourceWithUrl,
|
||||
url: 'https://example.com/faq?id=123&lang=en#section',
|
||||
}
|
||||
render(<SourcesList sources={[sourceSpecialUrl]} />)
|
||||
|
||||
const link = screen.getByRole('link')
|
||||
expect(link).toHaveAttribute('href', 'https://example.com/faq?id=123&lang=en#section')
|
||||
})
|
||||
|
||||
it('should handle very long content previews', () => {
|
||||
const longPreview = 'A'.repeat(500)
|
||||
const sourceLongPreview = { ...mockSourceWithUrl, content_preview: longPreview }
|
||||
render(<SourcesList sources={[sourceLongPreview]} />)
|
||||
|
||||
const preview = screen.getByText(longPreview)
|
||||
expect(preview).toHaveClass('line-clamp-2')
|
||||
})
|
||||
})
|
||||
|
||||
describe('Source Count Display', () => {
|
||||
it('should display correct count for single source', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl]} />)
|
||||
expect(screen.getByText('Sources (1):')).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should display correct count for multiple sources', () => {
|
||||
render(<SourcesList sources={[mockSourceWithUrl, mockSourceWithoutUrl, mockSourceNonEnglish]} />)
|
||||
expect(screen.getByText('Sources (3):')).toBeInTheDocument()
|
||||
})
|
||||
})
|
||||
})
|
||||
90
frontend/src/components/chat/SourcesList.tsx
Normal file
90
frontend/src/components/chat/SourcesList.tsx
Normal file
@@ -0,0 +1,90 @@
|
||||
"use client"
|
||||
|
||||
import { ExternalLink, Globe } from "lucide-react"
|
||||
import { Badge } from "@/components/ui/badge"
|
||||
import { ChatMessageSource } from "@/types/chatbot"
|
||||
|
||||
interface SourcesListProps {
|
||||
sources: ChatMessageSource[]
|
||||
}
|
||||
|
||||
export function SourcesList({ sources }: SourcesListProps) {
|
||||
if (!sources || sources.length === 0) {
|
||||
return null
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="mt-3 space-y-2" role="region" aria-label="Information sources">
|
||||
<p className="text-xs font-medium text-muted-foreground">
|
||||
Sources ({sources.length}):
|
||||
</p>
|
||||
<div className="space-y-2">
|
||||
{sources.map((source, index) => {
|
||||
const hasUrl = source.url && source.url.trim() !== ""
|
||||
const isNonEnglish = source.language && source.language.toLowerCase() !== "en"
|
||||
const hasRelevanceScore = typeof source.relevance_score === "number"
|
||||
|
||||
return (
|
||||
<div
|
||||
key={index}
|
||||
className="flex items-start gap-2 p-3 rounded-lg bg-muted/50 dark:bg-slate-800/50 border border-border/50"
|
||||
>
|
||||
<div className="flex-1 min-w-0">
|
||||
<div className="flex items-start gap-2 flex-wrap">
|
||||
{hasUrl ? (
|
||||
<a
|
||||
href={source.url!}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="text-sm font-medium text-primary hover:underline focus:outline-none focus:ring-2 focus:ring-primary focus:ring-offset-2 rounded inline-flex items-center gap-1 break-words"
|
||||
aria-label={`Open source: ${source.title} (opens in new tab)`}
|
||||
>
|
||||
{source.title || `Source ${index + 1}`}
|
||||
<ExternalLink
|
||||
className="h-3 w-3 flex-shrink-0"
|
||||
aria-hidden="true"
|
||||
/>
|
||||
</a>
|
||||
) : (
|
||||
<span className="text-sm font-medium text-foreground break-words">
|
||||
{source.title || `Source ${index + 1}`}
|
||||
</span>
|
||||
)}
|
||||
|
||||
<div className="flex items-center gap-1.5 flex-wrap">
|
||||
{isNonEnglish && (
|
||||
<Badge
|
||||
variant="outline"
|
||||
className="text-xs px-1.5 py-0 h-5 flex items-center gap-1"
|
||||
aria-label={`Language: ${source.language}`}
|
||||
>
|
||||
<Globe className="h-3 w-3" aria-hidden="true" />
|
||||
{source.language?.toUpperCase()}
|
||||
</Badge>
|
||||
)}
|
||||
|
||||
{hasRelevanceScore && (
|
||||
<Badge
|
||||
variant="secondary"
|
||||
className="text-xs px-1.5 py-0 h-5"
|
||||
aria-label={`Relevance score: ${source.relevance_score!.toFixed(0)}%`}
|
||||
>
|
||||
{source.relevance_score!.toFixed(0)}%
|
||||
</Badge>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{source.content_preview && (
|
||||
<p className="text-xs text-muted-foreground mt-1.5 line-clamp-2 break-words">
|
||||
{source.content_preview}
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
@@ -15,6 +15,7 @@ import { chatbotApi } from "@/lib/api-client"
|
||||
import ReactMarkdown from "react-markdown"
|
||||
import remarkGfm from "remark-gfm"
|
||||
import rehypeHighlight from "rehype-highlight"
|
||||
import { SourcesList } from "@/components/chat/SourcesList"
|
||||
|
||||
interface ChatMessage {
|
||||
id: string
|
||||
@@ -265,16 +266,7 @@ export function ChatInterface({ chatbotId, chatbotName, onClose }: ChatInterface
|
||||
|
||||
{/* Sources for assistant messages */}
|
||||
{message.role === 'assistant' && message.sources && message.sources.length > 0 && (
|
||||
<div className="space-y-2">
|
||||
<p className="text-xs text-foreground/60">Sources:</p>
|
||||
<div className="space-y-1">
|
||||
{message.sources.map((source, index) => (
|
||||
<Badge key={index} variant="outline" className="text-xs">
|
||||
{source.title || `Source ${index + 1}`}
|
||||
</Badge>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
<SourcesList sources={message.sources} />
|
||||
)}
|
||||
|
||||
<div className="flex items-center justify-between text-xs text-foreground/50 dark:text-slate-400 chat-timestamp">
|
||||
|
||||
@@ -14,7 +14,11 @@ export interface ChatMessage {
|
||||
export interface ChatMessageSource {
|
||||
title: string
|
||||
content: string
|
||||
url?: string
|
||||
url?: string | null
|
||||
language?: string
|
||||
article_id?: string
|
||||
relevance_score?: number
|
||||
content_preview?: string
|
||||
metadata?: Record<string, unknown>
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user