mirror of
https://github.com/aljazceru/enclava.git
synced 2025-12-17 07:24:34 +01:00
117 lines
4.0 KiB
Python
117 lines
4.0 KiB
Python
"""
|
|
RAG Document Model
|
|
Represents documents within RAG collections
|
|
"""
|
|
|
|
from sqlalchemy import (
|
|
Column,
|
|
Integer,
|
|
String,
|
|
Text,
|
|
DateTime,
|
|
Boolean,
|
|
BigInteger,
|
|
ForeignKey,
|
|
JSON,
|
|
)
|
|
from sqlalchemy.sql import func
|
|
from sqlalchemy.orm import relationship
|
|
from app.db.database import Base
|
|
|
|
|
|
class RagDocument(Base):
    """ORM model for one document stored inside a RAG collection.

    Rows live in the ``rag_documents`` table. Every document references a
    row in ``rag_collections`` through a non-nullable foreign key declared
    with ``ondelete="CASCADE"``.
    """

    __tablename__ = "rag_documents"

    id = Column(Integer, primary_key=True, index=True)

    # --- Owning collection -------------------------------------------------
    collection_id = Column(
        Integer,
        ForeignKey("rag_collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    collection = relationship("RagCollection", back_populates="documents")

    # --- File information --------------------------------------------------
    # Sanitized filename used for storage.
    filename = Column(String(255), nullable=False)
    # Filename exactly as the user supplied it.
    original_filename = Column(String(255), nullable=False)
    # Path to the stored file.
    file_path = Column(String(500), nullable=False)
    # File kind: pdf, docx, txt, etc.
    file_type = Column(String(50), nullable=False)
    # File size in bytes.
    file_size = Column(BigInteger, nullable=False)
    mime_type = Column(String(100), nullable=True)
    # Original source URL (indexed).
    source_url = Column(String(500), nullable=True, index=True)

    # --- Processing status -------------------------------------------------
    # One of: 'processing', 'processed', 'error', 'indexed'.
    status = Column(String(50), default="processing", nullable=False)
    processing_error = Column(Text, nullable=True)

    # --- Content information -----------------------------------------------
    # Markdown-converted content.
    converted_content = Column(Text, nullable=True)
    word_count = Column(Integer, default=0, nullable=False)
    character_count = Column(Integer, default=0, nullable=False)

    # --- Vector information ------------------------------------------------
    # Number of chunks/vectors created.
    vector_count = Column(Integer, default=0, nullable=False)
    # Chunk size used for vectorization.
    chunk_size = Column(Integer, default=1000, nullable=False)

    # --- Metadata extracted from the document ------------------------------
    # Language, entities, keywords, etc.
    document_metadata = Column(JSON, nullable=True)

    # --- Processing timestamps ---------------------------------------------
    created_at = Column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    processed_at = Column(DateTime(timezone=True), nullable=True)
    indexed_at = Column(DateTime(timezone=True), nullable=True)
    updated_at = Column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
        nullable=False,
    )

    # --- Soft delete -------------------------------------------------------
    is_deleted = Column(Boolean, default=False, nullable=False)
    deleted_at = Column(DateTime(timezone=True), nullable=True)

    def to_dict(self):
        """Serialize the document into a plain dict for API responses.

        ``id`` and ``collection_id`` are returned as strings, ``file_size``
        is exposed under the key ``"size"`` and ``document_metadata`` under
        ``"metadata"`` (an empty dict when unset). Timestamps are ISO-8601
        strings, or ``None`` when the value is missing. ``file_path`` and
        ``deleted_at`` are not part of the result.
        """
        parent = self.collection
        payload = {
            "id": str(self.id),
            "collection_id": str(self.collection_id),
            "collection_name": parent.name if parent else None,
            "filename": self.filename,
            "original_filename": self.original_filename,
            "file_type": self.file_type,
            "size": self.file_size,
            "mime_type": self.mime_type,
            "source_url": self.source_url,
            "status": self.status,
            "processing_error": self.processing_error,
            "converted_content": self.converted_content,
            "word_count": self.word_count,
            "character_count": self.character_count,
            "vector_count": self.vector_count,
            "chunk_size": self.chunk_size,
            "metadata": self.document_metadata or {},
        }

        # Timestamp keys are added in the same order as the attribute names
        # listed here, each rendered with isoformat() when a value is present.
        for stamp in ("created_at", "processed_at", "indexed_at", "updated_at"):
            moment = getattr(self, stamp)
            payload[stamp] = moment.isoformat() if moment else None

        payload["is_deleted"] = self.is_deleted
        return payload

    def __repr__(self):
        """Return a short debug string with the id, original filename and status."""
        return f"<RagDocument(id={self.id}, filename='{self.original_filename}', status='{self.status}')>"
|