Mirror of https://github.com/aljazceru/IngestRSS.git (synced 2025-12-17 05:54:22 +01:00)

Commit: Replace Pinecone/OpenAI with Qdrant/Ollama
@@ -23,3 +23,6 @@ APP_NAME=RSS Feed Processor
 VERSION=1.0.0
 
 STORAGE_STRATEGY=s3
+QDRANT_URL=http://localhost:6333
+QDRANT_COLLECTION_NAME=open-rss-articles
+OLLAMA_HOST=http://localhost:11434
@@ -3,9 +3,9 @@ pymongo==4.*
 python-dotenv==1.0.*
 requests==2.32.*
 constructs==10.2.69
-# Optional, yet necessary for the Pinecone SDK functionality.
-pinecone
-openai
+# Vector database and embedding libraries
+qdrant-client
+ollama
 tqdm
 prometheus-client
 redis
@@ -4,8 +4,7 @@ import sys
 import json
 from src.utils.retry_logic import retry_with_backoff
 from botocore.exceptions import ClientError
-from pinecone import Pinecone
-from pinecone import ServerlessSpec
+from qdrant_client import QdrantClient, models
 
 
 
@@ -180,21 +179,17 @@ def deploy_infrastructure():
         }
     ])

-    if os.getenv("STORAGE_STRATEGY") == 'pinecone':
-        pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
-        index_name = os.getenv("PINECONE_DB_NAME")
-        embedding_dim = os.getenv("VECTOR_EMBEDDING_DIM")
-        vector_search_metric = os.getenv("VECTOR_SEARCH_METRIC")
-
-        if index_name not in pc.list_indexes().names():
-            pc.create_index(
-                name=index_name,
-                dimension=int(embedding_dim),
-                metric=vector_search_metric,
-                spec=ServerlessSpec(
-                    cloud="aws",
-                    region="us-east-1",
-                ),
-            )
+    if os.getenv("STORAGE_STRATEGY") == 'qdrant':
+        client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))
+        collection = os.getenv("QDRANT_COLLECTION_NAME")
+        embedding_dim = int(os.getenv("VECTOR_EMBEDDING_DIM"))
+        metric = os.getenv("VECTOR_SEARCH_METRIC", "cosine").upper()
+
+        existing = [c.name for c in client.get_collections().collections]
+        if collection not in existing:
+            client.create_collection(
+                collection_name=collection,
+                vectors_config=models.VectorParams(size=embedding_dim, distance=getattr(models.Distance, metric))
+            )
 
 
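One note on the added branch: getattr(models.Distance, metric) only resolves if VECTOR_SEARCH_METRIC upper-cases to a member of qdrant-client's Distance enum (COSINE, DOT, EUCLID and, in recent clients, MANHATTAN). Below is a minimal sketch of a guard that could sit in front of create_collection; the helper name and the "cosine" fallback are assumptions, not code from this repo:

from qdrant_client import models

def resolve_distance(raw_metric: str | None) -> models.Distance:
    # Hypothetical helper: map a free-form metric string onto the Distance enum.
    name = (raw_metric or "cosine").upper()
    try:
        return models.Distance[name]  # e.g. "cosine" -> Distance.COSINE
    except KeyError:
        raise ValueError(f"Unsupported VECTOR_SEARCH_METRIC: {raw_metric!r}")

# The deploy step could then pass:
#   vectors_config=models.VectorParams(size=embedding_dim,
#                                      distance=resolve_distance(os.getenv("VECTOR_SEARCH_METRIC")))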
@@ -1,75 +1,60 @@
 import os
+import requests
 
-from pinecone import Pinecone
-from openai import OpenAI
+from qdrant_client import QdrantClient, models
 
 from utils import setup_logging
 
 logger = setup_logging()
 
+qdrant_url = os.getenv("QDRANT_URL", "http://localhost:6333")
+qdrant_api_key = os.getenv("QDRANT_API_KEY")
+collection_name = os.getenv("QDRANT_COLLECTION_NAME")
-# Set up Pinecone client
-api_key = os.getenv("PINCEONE_API_KEY")
-shards = os.getenv("PINECONE_SHARDS")
-embedding_model = os.getenv("VECTOR_EMBEDDING_MODEL")
 embedding_dim = os.getenv("VECTOR_EMBEDDING_DIM")
-vector_search_metric = os.getenv("VECTOR_SEARCH_METRIC")
-index_name = os.getenv("PINECONE_DB_NAME")
+vector_search_metric = os.getenv("VECTOR_SEARCH_METRIC", "cosine")
 
-client = OpenAI() # For Embedding Models, Not LLMs
-pc = Pinecone(api_key=api_key)
+ollama_host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
+ollama_embedding_model = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
+
+client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
 
 def get_index():
-    if index_name not in pc.list_indexes().names():
-        return KeyError(f"Index {index_name} not found")
-
-    index = pc.Index(index_name)
-    return index
+    collections = client.get_collections().collections
+    if collection_name not in [c.name for c in collections]:
+        raise KeyError(f"Collection {collection_name} not found")
+    return client
 
-def vectorize(article:str) -> list[float]:
-    response = client.embeddings.create(
-        input=article, # FIXME: This fails when article is something else, find what the 'something else' is and implement fix.
-        model=os.getenv('OPENAI_EMBEDDING_MODEL', 'text-')
-    )
-
-    return response.data[0].embedding
+def vectorize(article: str) -> list[float]:
+    response = requests.post(
+        f"{ollama_host}/api/embeddings",
+        json={"model": ollama_embedding_model, "prompt": article},
+        timeout=30,
+    )
+    response.raise_for_status()
+    return response.json().get("embedding", [])
 
 
-def upsert_vectors(index:Pinecone.Index, data:list[dict], namespace:str): # [ ] Check if the data is being upserted.
-    response = index.upsert(
-        vectors=data,
-        namespace=namespace
-    )
-    logger.info(f'Upserted Vector Response : {response.to_dict()}')
-    logger.info(f'Upserted Vector Length : {len(data[0]["values"])}')
-    logger.info(f'Upserted Vector Response Type : {type(response)}')
-    logger.info(f'Upserted Vector Response - status : {response.status_code}')
+def upsert_vectors(index: QdrantClient, data: list[dict]):
+    points = [
+        models.PointStruct(id=item["id"], vector=item["vector"], payload=item.get("payload"))
+        for item in data
+    ]
+    index.upsert(collection_name=collection_name, points=points)
 
 
-def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None): # [ ]: Make sure this is working.
+def query_vectors(index: QdrantClient, vector: list[float], top_k: int, filter_query: dict | None = None):
     if len(vector) != int(embedding_dim):
         raise ValueError("Length of vector does not match the embedding dimension")
 
-    if filter_query:
-        query = index.query(
-            namespace=namespace,
-            vector=vector,
-            filter_query=filter_query,
-            top_k=top_k,
-            include_metadata=True
-        )
-
-    else:
-        query = index.query(
-            namespace=namespace,
-            vector=vector,
-            top_k=top_k
-        )
-
-    return query
+    return index.search(
+        collection_name=collection_name,
+        query_vector=vector,
+        limit=top_k,
+        with_payload=True,
+        query_filter=filter_query,
+    )
 
 
 if __name__ == "__main__":
-    # Create a large paragraph
-    paragraph = '''This is a test.'''
-    vectorize("This is a test string")
+    paragraph = "This is a test."
+    vectorize(paragraph)
 
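For orientation, here is a minimal end-to-end sketch of the same flow the rewritten module implements: embed a string via Ollama's /api/embeddings endpoint, upsert it into the Qdrant collection, then search. It is written self-contained rather than importing the repo's module (whose path is not shown in the diff); the point id, payload, and default env values are assumptions:

import os
import requests
from qdrant_client import QdrantClient, models

# Assumed local defaults, mirroring the env vars introduced in this commit.
ollama_host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
model = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
collection = os.getenv("QDRANT_COLLECTION_NAME", "open-rss-articles")

client = QdrantClient(url=os.getenv("QDRANT_URL", "http://localhost:6333"))

def embed(text: str) -> list[float]:
    # Same legacy /api/embeddings endpoint the diff uses ("prompt" -> "embedding").
    r = requests.post(f"{ollama_host}/api/embeddings",
                      json={"model": model, "prompt": text}, timeout=30)
    r.raise_for_status()
    return r.json().get("embedding", [])

vec = embed("Qdrant replaces Pinecone in IngestRSS")

# Upsert one point; Qdrant point ids must be integers or UUIDs.
client.upsert(collection_name=collection,
              points=[models.PointStruct(id=1, vector=vec, payload={"title": "demo"})])

# Query the same vector back.
for hit in client.search(collection_name=collection, query_vector=vec,
                         limit=3, with_payload=True):
    print(hit.id, hit.score, hit.payload)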
@@ -32,39 +32,29 @@ mongo_client = MongoClient(MONGODB_URL)
 feeds_collection = mongo_client[MONGODB_DB_NAME][MONGODB_COLLECTION_NAME]
 
 ##### Article Storage #####
-def save_article(article:dict, strategy:str):
+def save_article(article: dict, strategy: str):
     if strategy == "s3":
         s3_save_article(article)
-    elif strategy == "pinecone":
-        pinecone_save_article(article)
-    elif strategy == 'both':
-        pinecone_save_article(article)
+    elif strategy == "qdrant":
+        qdrant_save_article(article)
+    elif strategy == "both":
+        qdrant_save_article(article)
         s3_save_article(article)
     else:
         raise ValueError(f"Invalid storage strategy: {strategy}")
 
 
-def pinecone_save_article(article:dict):
-    logger.info("Saving article to Pinecone")
+def qdrant_save_article(article: dict):
+    logger.info("Saving article to Qdrant")
     index = get_index()
 
-    # Expected Keys from Pinecone *MUST* include 'id' and 'values'
-    data = dict()
-    logging.info(f"Article ID into Pinecone")
-    data["id"] = article["article_id"]
-    logging.info(f"Article content into Pinecone")
-    data["values"] = vectorize(article=article["content"])
-
-    print(type(data["values"]))
-    print(data["id"])
-
-    data = [data]
-
-    namespace = os.getenv('PINECONE_NAMESPACE')
-
-    logger.info("Upserting article to Pinecone")
-    upsert_vectors(index, data, namespace)
+    data = {
+        "id": article["article_id"],
+        "vector": vectorize(article["content"]),
+        "payload": {"rss": article.get("rss"), "title": article.get("title")},
+    }
+
+    upsert_vectors(index, [data])
 
 
 def s3_save_article(article:dict):
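A hedged sketch of how the dispatcher above is typically driven, assuming STORAGE_STRATEGY comes from the environment and the article dict carries the keys the Qdrant branch reads (article_id, content, rss, title); the import path is a placeholder, not the repo's actual module name:

import os

# Placeholder import path; use the module where save_article actually lives.
from article_storage import save_article

article = {
    "article_id": 42,  # Qdrant requires integer or UUID point ids
    "title": "Example headline",
    "rss": "https://example.com/feed.xml",
    "content": "Full article text that gets embedded via Ollama...",
}

# "both" vectorizes into Qdrant and also writes the raw article to S3.
save_article(article, strategy=os.getenv("STORAGE_STRATEGY", "s3"))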
@@ -16,7 +16,7 @@ echo "Python 3.12 found. Proceeding..."
 echo "Section 2: Installing Dependencies"
 
 # Install dependencies
-python3.12 -m pip install --upgrade Pillow feedfinder2==0.0.4 python-dateutil newspaper3k==0.2.8 feedparser lxml[html5lib] lxml_html_clean lxml[html_clean] openai pinecone -t python/
+python3.12 -m pip install --upgrade Pillow feedfinder2==0.0.4 python-dateutil newspaper3k==0.2.8 feedparser lxml[html5lib] lxml_html_clean lxml[html_clean] qdrant-client ollama -t python/
 echo "Dependencies installed successfully."
 
 ####### Section 3: Creating ZIP File ########
@@ -48,20 +48,19 @@ def update_env_vars(function_name):
         # Storage Configuration
         'STORAGE_STRATEGY': os.environ.get('STORAGE_STRATEGY', 's3'),  # Default to s3 storage
 
-        # Pinecone Configuration (only used if STORAGE_STRATEGY is 'pinecone')
-        'PINECONE_API_KEY': os.environ.get('PINECONE_API_KEY'),
-        'PINECONE_DB_NAME': os.environ.get('PINECONE_DB_NAME'),
-        'PINECONE_SHARDS': os.environ.get('PINECONE_SHARDS'),
-        'PINECONE_NAMESPACE': os.environ.get('PINECONE_NAMESPACE'),
+        # Qdrant Configuration (only used if STORAGE_STRATEGY is 'qdrant')
+        'QDRANT_URL': os.environ.get('QDRANT_URL'),
+        'QDRANT_API_KEY': os.environ.get('QDRANT_API_KEY'),
+        'QDRANT_COLLECTION_NAME': os.environ.get('QDRANT_COLLECTION_NAME'),
 
         # Vector Configuration
         'VECTOR_EMBEDDING_MODEL': os.environ.get('VECTOR_EMBEDDING_MODEL'),
         'VECTOR_EMBEDDING_DIM': os.environ.get('VECTOR_EMBEDDING_DIM'),
         'VECTOR_SEARCH_METRIC': os.environ.get('VECTOR_SEARCH_METRIC'),
 
-        # OpenAI Configuration
-        'OPENAI_API_KEY': os.environ.get('OPENAI_API_KEY'),
-        "OPENAI_EMBEDDING_MODEL": os.environ.get('OPENAI_EMBEDDING_MODEL'),
+        # Ollama Configuration
+        'OLLAMA_HOST': os.environ.get('OLLAMA_HOST'),
+        'OLLAMA_EMBEDDING_MODEL': os.environ.get('OLLAMA_EMBEDDING_MODEL'),
     }
 
     return lambda_client.update_function_configuration(
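For context, the dict assembled above is what boto3 expects under Environment.Variables. A minimal sketch of the surrounding call, assuming lambda_client is a standard boto3 Lambda client and that unset entries are filtered out first (boto3 only accepts string values); the filtering step is an assumption, not shown in the diff:

import boto3

lambda_client = boto3.client("lambda")

def update_env_vars(function_name: str, env_vars: dict) -> dict:
    # Drop unset entries; every remaining value must be a string.
    variables = {k: str(v) for k, v in env_vars.items() if v is not None}
    return lambda_client.update_function_configuration(
        FunctionName=function_name,
        Environment={"Variables": variables},
    )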
@@ -90,13 +90,13 @@ def main():
     env_vars["TEST"] = get_env_value("TEST", "Enter Test Value:", options=["0", "1"], advanced=advanced_mode)
 
     # Storage Strategy
-    env_vars["STORAGE_STRATEGY"] = get_env_value("STORAGE_STRATEGY", "Choose Storage Strategy:", options=["s3", "pinecone"], advanced=advanced_mode)
-
-    # Pinecone Configuration (only if pinecone is selected)
-    if env_vars["STORAGE_STRATEGY"] == "pinecone":
-        env_vars["PINECONE_API_KEY"] = get_env_value("PINECONE_API_KEY", "Enter Pinecone API Key:", advanced=advanced_mode)
-        env_vars["PINECONE_DB_NAME"] = get_env_value("PINECONE_DB_NAME", "Enter Pinecone DB Name:", options=["open-rss-articles", "custom-rss-db"], advanced=advanced_mode)
+    env_vars["STORAGE_STRATEGY"] = get_env_value("STORAGE_STRATEGY", "Choose Storage Strategy:", options=["s3", "qdrant"], advanced=advanced_mode)
+
+    # Qdrant Configuration (only if qdrant is selected)
+    if env_vars["STORAGE_STRATEGY"] == "qdrant":
+        env_vars["QDRANT_URL"] = get_env_value("QDRANT_URL", "Enter Qdrant URL:", options=["http://localhost:6333"], advanced=advanced_mode)
+        env_vars["QDRANT_COLLECTION_NAME"] = get_env_value("QDRANT_COLLECTION_NAME", "Enter Qdrant Collection Name:", options=["open-rss-articles"], advanced=advanced_mode)
 
     # Display summary
     display_summary(env_vars)
@@ -48,13 +48,14 @@ def check_env() -> None:
 
     # Variables that are optional depending on the storage strategy
    optional_vars = {
-        "PINECONE_API_KEY": "pinecone",
-        "PINECONE_DB_NAME": "pinecone",
-        "OPENAI_API_KEY": "all",
-        "PINECONE_SHARDS": "pinecone",
-        "VECTOR_EMBEDDING_MODEL": "pinecone",
-        "VECTOR_EMBEDDING_DIM": "pinecone",
-        "VECTOR_SEARCH_METRIC": "pinecone"
+        "QDRANT_URL": "qdrant",
+        "QDRANT_API_KEY": "qdrant",
+        "QDRANT_COLLECTION_NAME": "qdrant",
+        "OLLAMA_HOST": "all",
+        "OLLAMA_EMBEDDING_MODEL": "all",
+        "VECTOR_EMBEDDING_MODEL": "qdrant",
+        "VECTOR_EMBEDDING_DIM": "qdrant",
+        "VECTOR_SEARCH_METRIC": "qdrant"
    }
 
    missing_vars: List[str] = []
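The mapping above suggests check_env treats an optional variable as required only when its tag matches the active storage strategy (or is "all"). A hedged sketch of that logic, not the repo's actual implementation:

import os
from typing import List

def missing_optional(optional_vars: dict[str, str], strategy: str) -> List[str]:
    """Return optional variables that apply to this strategy but are unset."""
    missing: List[str] = []
    for name, applies_to in optional_vars.items():
        if applies_to in (strategy, "all") and not os.getenv(name):
            missing.append(name)
    return missing

# e.g. missing_optional({"QDRANT_URL": "qdrant", "OLLAMA_HOST": "all"}, "qdrant")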
template.env
@@ -52,17 +52,16 @@ APP_NAME=RSS Feed Processor
 VERSION=1.0.0
 
 
-STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future.
+STORAGE_STRATEGY=s3 # 's3' or 'qdrant'
 
-# Only need to fill out this if your storage strategy is pinecone [ Not currently supported. ]
-PINECONE_API_KEY=***
-PINECONE_DB_NAME=open-rss-articles
-PINECONE_SHARDS=***
-PINECONE_NAMESPACE=IngestRSS-Articles
+# Only need to fill out this if your storage strategy is qdrant
+QDRANT_URL=http://localhost:6333
+QDRANT_API_KEY=***
+QDRANT_COLLECTION_NAME=open-rss-articles
 
 VECTOR_EMBEDDING_MODEL=***
 VECTOR_EMBEDDING_DIM=***
 VECTOR_SEARCH_METRIC=***
 
-OPENAI_API_KEY=sk**
-OPENAI_EMBEDDING_MODEL=text-embedding-3-large
+OLLAMA_HOST=http://localhost:11434
+OLLAMA_EMBEDDING_MODEL=nomic-embed-text
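VECTOR_EMBEDDING_DIM has to match what OLLAMA_EMBEDDING_MODEL actually produces (nomic-embed-text returns 768-dimensional vectors), otherwise the length check in query_vectors and the collection's vector size will disagree. A small sketch that probes the model once and flags a mismatch, using the same /api/embeddings endpoint as the rest of this commit; the default of 768 is an assumption:

import os
import requests

host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
model = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
expected = int(os.getenv("VECTOR_EMBEDDING_DIM", "768"))

resp = requests.post(f"{host}/api/embeddings",
                     json={"model": model, "prompt": "dimension probe"}, timeout=30)
resp.raise_for_status()
actual = len(resp.json().get("embedding", []))

if actual != expected:
    raise SystemExit(f"VECTOR_EMBEDDING_DIM={expected} but {model} returned {actual} dimensions")
print(f"{model}: {actual} dimensions, matches VECTOR_EMBEDDING_DIM")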