enclava/backend/app/models/rag_document.py

"""
RAG Document Model
Represents documents within RAG collections
"""

from sqlalchemy import (
    Column,
    Integer,
    String,
    Text,
    DateTime,
    Boolean,
    BigInteger,
    ForeignKey,
    JSON,
)
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from app.db.database import Base


class RagDocument(Base):
    __tablename__ = "rag_documents"

    id = Column(Integer, primary_key=True, index=True)

    # Collection relationship
    collection_id = Column(
        Integer,
        ForeignKey("rag_collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    collection = relationship("RagCollection", back_populates="documents")

    # File information
    filename = Column(String(255), nullable=False)  # sanitized filename for storage
    original_filename = Column(String(255), nullable=False)  # user's original filename
    file_path = Column(String(500), nullable=False)  # path to stored file
    file_type = Column(String(50), nullable=False)  # pdf, docx, txt, etc.
    file_size = Column(BigInteger, nullable=False)  # file size in bytes
    mime_type = Column(String(100), nullable=True)

    # Processing status
    status = Column(
        String(50), default="processing", nullable=False
    )  # 'processing', 'processed', 'error', 'indexed'
    processing_error = Column(Text, nullable=True)

    # Content information
    converted_content = Column(Text, nullable=True)  # markdown converted content
    word_count = Column(Integer, default=0, nullable=False)
    character_count = Column(Integer, default=0, nullable=False)

    # Vector information
    vector_count = Column(
        Integer, default=0, nullable=False
    )  # number of chunks/vectors created
    chunk_size = Column(
        Integer, default=1000, nullable=False
    )  # chunk size used for vectorization

    # Metadata extracted from document
    document_metadata = Column(
        JSON, nullable=True
    )  # language, entities, keywords, etc.

    # Processing timestamps
    created_at = Column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    processed_at = Column(DateTime(timezone=True), nullable=True)
    indexed_at = Column(DateTime(timezone=True), nullable=True)
    updated_at = Column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
        nullable=False,
    )

    # Soft delete
    is_deleted = Column(Boolean, default=False, nullable=False)
    deleted_at = Column(DateTime(timezone=True), nullable=True)

    def to_dict(self):
        """Convert model to dictionary for API responses"""
        return {
            "id": str(self.id),
            "collection_id": str(self.collection_id),
            "collection_name": self.collection.name if self.collection else None,
            "filename": self.filename,
            "original_filename": self.original_filename,
            "file_type": self.file_type,
            "size": self.file_size,
            "mime_type": self.mime_type,
            "status": self.status,
            "processing_error": self.processing_error,
            "converted_content": self.converted_content,
            "word_count": self.word_count,
            "character_count": self.character_count,
            "vector_count": self.vector_count,
            "chunk_size": self.chunk_size,
            "metadata": self.document_metadata or {},
            "created_at": self.created_at.isoformat() if self.created_at else None,
            "processed_at": self.processed_at.isoformat()
            if self.processed_at
            else None,
            "indexed_at": self.indexed_at.isoformat() if self.indexed_at else None,
            "updated_at": self.updated_at.isoformat() if self.updated_at else None,
            "is_deleted": self.is_deleted,
        }

    def __repr__(self):
        return f"<RagDocument(id={self.id}, filename='{self.original_filename}', status='{self.status}')>"