Files
enclava/backend/app/models/rag_document.py
2025-08-19 09:50:15 +02:00

82 lines
3.7 KiB
Python

"""
RAG Document Model
Represents documents within RAG collections
"""
from sqlalchemy import Column, Integer, String, Text, DateTime, Boolean, BigInteger, ForeignKey, JSON
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from app.db.database import Base
class RagDocument(Base):
    """ORM model for one document stored inside a RAG collection.

    A row records the stored file, its processing status, the converted
    content with simple size counters, vectorization bookkeeping, extracted
    metadata, lifecycle timestamps, and soft-delete flags.
    """

    __tablename__ = "rag_documents"

    id = Column(Integer, primary_key=True, index=True)

    # ------------------------------------------------------------------
    # Owning collection
    # ------------------------------------------------------------------
    collection_id = Column(
        Integer,
        ForeignKey("rag_collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    collection = relationship("RagCollection", back_populates="documents")

    # ------------------------------------------------------------------
    # Stored file
    # ------------------------------------------------------------------
    # Sanitized filename used for storage.
    filename = Column(String(255), nullable=False)
    # Filename exactly as supplied by the user.
    original_filename = Column(String(255), nullable=False)
    # Path to the stored file.
    file_path = Column(String(500), nullable=False)
    # File kind: pdf, docx, txt, etc.
    file_type = Column(String(50), nullable=False)
    # File size in bytes.
    file_size = Column(BigInteger, nullable=False)
    mime_type = Column(String(100), nullable=True)

    # ------------------------------------------------------------------
    # Processing state
    # ------------------------------------------------------------------
    # One of 'processing', 'processed', 'error', 'indexed'.
    status = Column(String(50), default="processing", nullable=False)
    processing_error = Column(Text, nullable=True)

    # ------------------------------------------------------------------
    # Converted content
    # ------------------------------------------------------------------
    # Markdown-converted content of the document.
    converted_content = Column(Text, nullable=True)
    word_count = Column(Integer, default=0, nullable=False)
    character_count = Column(Integer, default=0, nullable=False)

    # ------------------------------------------------------------------
    # Vectorization
    # ------------------------------------------------------------------
    # Number of chunks/vectors created for this document.
    vector_count = Column(Integer, default=0, nullable=False)
    # Chunk size used for vectorization.
    chunk_size = Column(Integer, default=1000, nullable=False)

    # Metadata extracted from the document (language, entities, keywords, etc.).
    document_metadata = Column(JSON, nullable=True)

    # ------------------------------------------------------------------
    # Timestamps
    # ------------------------------------------------------------------
    created_at = Column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
    processed_at = Column(DateTime(timezone=True), nullable=True)
    indexed_at = Column(DateTime(timezone=True), nullable=True)
    updated_at = Column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
        nullable=False,
    )

    # ------------------------------------------------------------------
    # Soft delete
    # ------------------------------------------------------------------
    is_deleted = Column(Boolean, default=False, nullable=False)
    deleted_at = Column(DateTime(timezone=True), nullable=True)

    def to_dict(self):
        """Serialize the document into a plain dict for API responses.

        Identifiers are rendered as strings, timestamps as ISO-format strings
        (``None`` when unset), and absent metadata as an empty dict. The
        stored file path and ``deleted_at`` are not part of the output.
        """

        def _iso(moment):
            # Unset timestamps serialize as None instead of raising.
            return moment.isoformat() if moment else None

        owner = self.collection
        return {
            "id": str(self.id),
            "collection_id": str(self.collection_id),
            "collection_name": owner.name if owner else None,
            "filename": self.filename,
            "original_filename": self.original_filename,
            "file_type": self.file_type,
            "size": self.file_size,
            "mime_type": self.mime_type,
            "status": self.status,
            "processing_error": self.processing_error,
            "converted_content": self.converted_content,
            "word_count": self.word_count,
            "character_count": self.character_count,
            "vector_count": self.vector_count,
            "chunk_size": self.chunk_size,
            "metadata": self.document_metadata or {},
            "created_at": _iso(self.created_at),
            "processed_at": _iso(self.processed_at),
            "indexed_at": _iso(self.indexed_at),
            "updated_at": _iso(self.updated_at),
            "is_deleted": self.is_deleted,
        }

    def __repr__(self):
        """Return a short debug string with id, original filename, and status."""
        return (
            f"<RagDocument(id={self.id}, "
            f"filename='{self.original_filename}', "
            f"status='{self.status}')>"
        )