"""
|
|
RAG Document Model
|
|
Represents documents within RAG collections
|
|
"""
|
|
|
|
from sqlalchemy import Column, Integer, String, Text, DateTime, Boolean, BigInteger, ForeignKey, JSON
|
|
from sqlalchemy.sql import func
|
|
from sqlalchemy.orm import relationship
|
|
from app.db.database import Base
|
|
|
|
|
|
class RagDocument(Base):
|
|
__tablename__ = "rag_documents"
|
|
|
|
id = Column(Integer, primary_key=True, index=True)
|
|
|
|
# Collection relationship
|
|
collection_id = Column(Integer, ForeignKey("rag_collections.id", ondelete="CASCADE"), nullable=False, index=True)
|
|
collection = relationship("RagCollection", back_populates="documents")
|
|
|
|
# File information
|
|
filename = Column(String(255), nullable=False) # sanitized filename for storage
|
|
original_filename = Column(String(255), nullable=False) # user's original filename
|
|
file_path = Column(String(500), nullable=False) # path to stored file
|
|
file_type = Column(String(50), nullable=False) # pdf, docx, txt, etc.
|
|
file_size = Column(BigInteger, nullable=False) # file size in bytes
|
|
mime_type = Column(String(100), nullable=True)
|
|
|
|
# Processing status
|
|
status = Column(String(50), default='processing', nullable=False) # 'processing', 'processed', 'error', 'indexed'
|
|
processing_error = Column(Text, nullable=True)
|
|
|
|
# Content information
|
|
converted_content = Column(Text, nullable=True) # markdown converted content
|
|
word_count = Column(Integer, default=0, nullable=False)
|
|
character_count = Column(Integer, default=0, nullable=False)
|
|
|
|
# Vector information
|
|
vector_count = Column(Integer, default=0, nullable=False) # number of chunks/vectors created
|
|
chunk_size = Column(Integer, default=1000, nullable=False) # chunk size used for vectorization
|
|
|
|
# Metadata extracted from document
|
|
document_metadata = Column(JSON, nullable=True) # language, entities, keywords, etc.
|
|
|
|
# Processing timestamps
|
|
created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
|
|
processed_at = Column(DateTime(timezone=True), nullable=True)
|
|
indexed_at = Column(DateTime(timezone=True), nullable=True)
|
|
updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
|
|
|
|
# Soft delete
|
|
is_deleted = Column(Boolean, default=False, nullable=False)
|
|
deleted_at = Column(DateTime(timezone=True), nullable=True)
|
|
|
|
def to_dict(self):
|
|
"""Convert model to dictionary for API responses"""
|
|
return {
|
|
"id": str(self.id),
|
|
"collection_id": str(self.collection_id),
|
|
"collection_name": self.collection.name if self.collection else None,
|
|
"filename": self.filename,
|
|
"original_filename": self.original_filename,
|
|
"file_type": self.file_type,
|
|
"size": self.file_size,
|
|
"mime_type": self.mime_type,
|
|
"status": self.status,
|
|
"processing_error": self.processing_error,
|
|
"converted_content": self.converted_content,
|
|
"word_count": self.word_count,
|
|
"character_count": self.character_count,
|
|
"vector_count": self.vector_count,
|
|
"chunk_size": self.chunk_size,
|
|
"metadata": self.document_metadata or {},
|
|
"created_at": self.created_at.isoformat() if self.created_at else None,
|
|
"processed_at": self.processed_at.isoformat() if self.processed_at else None,
|
|
"indexed_at": self.indexed_at.isoformat() if self.indexed_at else None,
|
|
"updated_at": self.updated_at.isoformat() if self.updated_at else None,
|
|
"is_deleted": self.is_deleted
|
|
}
|
|
|
|
def __repr__(self):
|
|
return f"<RagDocument(id={self.id}, filename='{self.original_filename}', status='{self.status}')>" |