Files
enclava/backend/app/models/rag_document.py
2025-11-20 11:11:18 +01:00

115 lines
3.9 KiB
Python

"""
RAG Document Model
Represents documents within RAG collections
"""
from sqlalchemy import (
Column,
Integer,
String,
Text,
DateTime,
Boolean,
BigInteger,
ForeignKey,
JSON,
)
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from app.db.database import Base
class RagDocument(Base):
    """ORM model for one row of the ``rag_documents`` table.

    Every document points at a row in ``rag_collections`` through
    ``collection_id``; the foreign key is declared with
    ``ondelete="CASCADE"``. Besides file details, the row records the
    processing status, the converted text, vectorization counters,
    timestamps and a soft-delete flag.
    """

    __tablename__ = "rag_documents"

    id = Column(Integer, primary_key=True, index=True)

    # --- Owning collection -------------------------------------------------
    collection_id = Column(
        Integer,
        ForeignKey("rag_collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    collection = relationship("RagCollection", back_populates="documents")

    # --- File information --------------------------------------------------
    # Sanitized name used for storage.
    filename = Column(String(255), nullable=False)
    # Name of the file as the user supplied it.
    original_filename = Column(String(255), nullable=False)
    # Location of the stored file.
    file_path = Column(String(500), nullable=False)
    # Short type label: pdf, docx, txt, etc.
    file_type = Column(String(50), nullable=False)
    # Size in bytes.
    file_size = Column(BigInteger, nullable=False)
    mime_type = Column(String(100), nullable=True)

    # --- Processing status -------------------------------------------------
    # One of 'processing', 'processed', 'error', 'indexed'.
    status = Column(String(50), default="processing", nullable=False)
    processing_error = Column(Text, nullable=True)

    # --- Content information -----------------------------------------------
    # Document content after conversion to markdown.
    converted_content = Column(Text, nullable=True)
    word_count = Column(Integer, default=0, nullable=False)
    character_count = Column(Integer, default=0, nullable=False)

    # --- Vector information ------------------------------------------------
    # Number of chunks/vectors created for this document.
    vector_count = Column(Integer, default=0, nullable=False)
    # Chunk size used for vectorization.
    chunk_size = Column(Integer, default=1000, nullable=False)

    # --- Extracted metadata ------------------------------------------------
    # Language, entities, keywords, etc.
    document_metadata = Column(JSON, nullable=True)

    # --- Timestamps --------------------------------------------------------
    created_at = Column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
    processed_at = Column(DateTime(timezone=True), nullable=True)
    indexed_at = Column(DateTime(timezone=True), nullable=True)
    updated_at = Column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
        nullable=False,
    )

    # --- Soft delete -------------------------------------------------------
    is_deleted = Column(Boolean, default=False, nullable=False)
    deleted_at = Column(DateTime(timezone=True), nullable=True)

    def to_dict(self):
        """Return this document as a plain dictionary for API responses.

        ``id`` and ``collection_id`` are rendered with ``str()``,
        ``file_size`` is exposed under the key ``"size"`` and
        ``document_metadata`` under ``"metadata"`` (an empty dict when
        unset). Timestamps become ISO-format strings, or ``None`` when
        missing. ``file_path`` and ``deleted_at`` are not part of the
        output.
        """

        def iso_or_none(moment):
            # Unset timestamps serialize as None.
            if not moment:
                return None
            return moment.isoformat()

        payload = {
            "id": str(self.id),
            "collection_id": str(self.collection_id),
        }

        parent = self.collection
        payload["collection_name"] = parent.name if parent else None

        payload.update(
            {
                "filename": self.filename,
                "original_filename": self.original_filename,
                "file_type": self.file_type,
                "size": self.file_size,
                "mime_type": self.mime_type,
                "status": self.status,
                "processing_error": self.processing_error,
                "converted_content": self.converted_content,
                "word_count": self.word_count,
                "character_count": self.character_count,
                "vector_count": self.vector_count,
                "chunk_size": self.chunk_size,
                "metadata": self.document_metadata or {},
            }
        )

        for stamp in ("created_at", "processed_at", "indexed_at", "updated_at"):
            payload[stamp] = iso_or_none(getattr(self, stamp))

        payload["is_deleted"] = self.is_deleted
        return payload

    def __repr__(self):
        """Return a short debug string with id, original filename and status."""
        return f"<RagDocument(id={self.id}, filename='{self.original_filename}', status='{self.status}')>"