""" RAG Document Model Represents documents within RAG collections """ from sqlalchemy import ( Column, Integer, String, Text, DateTime, Boolean, BigInteger, ForeignKey, JSON, ) from sqlalchemy.sql import func from sqlalchemy.orm import relationship from app.db.database import Base class RagDocument(Base): __tablename__ = "rag_documents" id = Column(Integer, primary_key=True, index=True) # Collection relationship collection_id = Column( Integer, ForeignKey("rag_collections.id", ondelete="CASCADE"), nullable=False, index=True, ) collection = relationship("RagCollection", back_populates="documents") # File information filename = Column(String(255), nullable=False) # sanitized filename for storage original_filename = Column(String(255), nullable=False) # user's original filename file_path = Column(String(500), nullable=False) # path to stored file file_type = Column(String(50), nullable=False) # pdf, docx, txt, etc. file_size = Column(BigInteger, nullable=False) # file size in bytes mime_type = Column(String(100), nullable=True) # Processing status status = Column( String(50), default="processing", nullable=False ) # 'processing', 'processed', 'error', 'indexed' processing_error = Column(Text, nullable=True) # Content information converted_content = Column(Text, nullable=True) # markdown converted content word_count = Column(Integer, default=0, nullable=False) character_count = Column(Integer, default=0, nullable=False) # Vector information vector_count = Column( Integer, default=0, nullable=False ) # number of chunks/vectors created chunk_size = Column( Integer, default=1000, nullable=False ) # chunk size used for vectorization # Metadata extracted from document document_metadata = Column( JSON, nullable=True ) # language, entities, keywords, etc. # Processing timestamps created_at = Column( DateTime(timezone=True), server_default=func.now(), nullable=False ) processed_at = Column(DateTime(timezone=True), nullable=True) indexed_at = Column(DateTime(timezone=True), nullable=True) updated_at = Column( DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False, ) # Soft delete is_deleted = Column(Boolean, default=False, nullable=False) deleted_at = Column(DateTime(timezone=True), nullable=True) def to_dict(self): """Convert model to dictionary for API responses""" return { "id": str(self.id), "collection_id": str(self.collection_id), "collection_name": self.collection.name if self.collection else None, "filename": self.filename, "original_filename": self.original_filename, "file_type": self.file_type, "size": self.file_size, "mime_type": self.mime_type, "status": self.status, "processing_error": self.processing_error, "converted_content": self.converted_content, "word_count": self.word_count, "character_count": self.character_count, "vector_count": self.vector_count, "chunk_size": self.chunk_size, "metadata": self.document_metadata or {}, "created_at": self.created_at.isoformat() if self.created_at else None, "processed_at": self.processed_at.isoformat() if self.processed_at else None, "indexed_at": self.indexed_at.isoformat() if self.indexed_at else None, "updated_at": self.updated_at.isoformat() if self.updated_at else None, "is_deleted": self.is_deleted, } def __repr__(self): return f""