mirror of
https://github.com/aljazceru/IngestRSS.git
synced 2026-02-17 20:24:31 +01:00
Stuff
This commit is contained in:
@@ -3,4 +3,5 @@ python-dotenv==1.0.*
|
||||
requests==2.32.*
|
||||
constructs==10.2.69
|
||||
# Optional; only needed for the Pinecone SDK functionality.
|
||||
pinecone==5.1.*
|
||||
pinecone
|
||||
openai
|
||||
@@ -6,8 +6,7 @@ import os
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
region = os.getenv("AWS_REGION")
|
||||
index_name = os.getenv("PINECONE_DB_NAME")
|
||||
|
||||
index_name = "quickstart" # TODO: Remove this line after we are done testing with vector dbs.
|
||||
|
||||
if index_name not in pc.list_indexes().names():
|
||||
@@ -32,6 +31,17 @@ index.upsert(
|
||||
namespace="example-namespace1"
|
||||
)
|
||||
|
||||
index.upsert(
|
||||
vectors=[
|
||||
{"id": "vec2124", "values": [1.0, -2.5]},
|
||||
{"id": "vec21214", "values": [3.0, -2.0]},
|
||||
{"id": "vec31251", "values": [0.5, -1.5]},
|
||||
],
|
||||
namespace="example-namespace2"
|
||||
)
|
||||
|
||||
|
||||
|
||||
index.upsert(
|
||||
vectors=[
|
||||
{"id": "vec1", "values": [1.0, -2.5]},
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
from pinecone import Pinecone
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
# Set up Pinecone client
|
||||
api_key = os.getenv("PINCEONE_API_KEY")
|
||||
|
||||
pc = Pinecone(api_key=api_key)
|
||||
@@ -0,0 +1,83 @@
|
||||
import os

from dotenv import load_dotenv
from openai import OpenAI
# This import must stay active: `Pinecone` is instantiated below, and with the
# import commented out the module raised NameError as soon as it was loaded.
from pinecone import Pinecone

load_dotenv()

# Set up Pinecone client.
# The key is spelled PINECONE_API_KEY in the env template; the previously used
# misspelling is still honoured as a fallback so existing .env files keep working.
api_key = os.getenv("PINECONE_API_KEY") or os.getenv("PINCEONE_API_KEY")
# All of the following are raw strings (or None when unset); callers convert
# them to numbers where needed.
shards = os.getenv("PINECONE_SHARDS")
embedding_model = os.getenv("VECTOR_EMBEDDING_MODEL")
embedding_dim = os.getenv("VECTOR_EMBEDDING_DIM")
vector_search_metric = os.getenv("VECTOR_SEARCH_METRIC")
index_name = os.getenv("PINECONE_DB_NAME")

# OpenAI client used for embeddings; it is constructed without explicit
# credentials (presumably picked up from OPENAI_API_KEY -- confirm).
client = OpenAI()

pc = Pinecone(api_key=api_key)
|
||||
|
||||
def get_index():
    """Return a handle to the configured Pinecone index, creating it if absent.

    Uses the module-level ``pc`` client and the ``index_name``,
    ``embedding_dim``, ``vector_search_metric`` and ``shards`` settings.

    Returns:
        The object returned by ``pc.Index(index_name)``.

    Raises:
        ValueError: if the index name or embedding dimension is not configured,
            or a numeric setting cannot be parsed as an integer.
    """
    if not index_name:
        raise ValueError("PINECONE_DB_NAME is not set")

    if index_name not in pc.list_indexes().names():
        if embedding_dim is None:
            raise ValueError("VECTOR_EMBEDDING_DIM is not set")

        # Settings come from os.getenv and are therefore strings; the index
        # dimension (and shard count) must be sent as integers.
        create_kwargs = {
            "name": index_name,
            "dimension": int(embedding_dim),
            "metric": vector_search_metric,
        }
        if shards is not None:
            create_kwargs["shards"] = int(shards)

        # NOTE(review): recent Pinecone SDKs expect a `spec=` argument
        # (ServerlessSpec / PodSpec) on create_index, with shards configured on
        # the pod spec -- confirm against the pinned SDK version.
        pc.create_index(**create_kwargs)

    return pc.Index(index_name)
|
||||
|
||||
def vectorize(article:str) -> list[float]:
    """Embed ``article`` with the OpenAI embeddings API.

    The model is taken from the ``VECTOR_EMBEDDING_MODEL`` setting (module-level
    ``embedding_model``); when that is unset the previous hard-coded model,
    ``text-embedding-3-large``, is used so existing behaviour is preserved.

    Args:
        article: Text to embed.

    Returns:
        The embedding of the first (and only) input as a list of floats.
    """
    response = client.embeddings.create(
        input=article,
        model=embedding_model or "text-embedding-3-large",
    )

    return response.data[0].embedding
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def upsert_vectors(index:"Pinecone.Index", vectors:list[dict], namespace:str):
    """Upsert ``vectors`` into ``namespace`` of the given index.

    The ``index`` annotation is a string so that defining this function does
    not require the ``Pinecone`` name to be resolvable at import time.

    Args:
        index: Index handle exposing an ``upsert(vectors=..., namespace=...)`` method.
        vectors: Vector records to write.
        namespace: Namespace the records are written to.
    """
    index.upsert(
        vectors=vectors,
        namespace=namespace,
    )
|
||||
|
||||
|
||||
def query_vectors(index:"Pinecone.Index", namespace:str, vector:list[float], top_k:int, filter_query:dict=None):
    """Run a similarity query against ``index``.

    Args:
        index: Index handle exposing a ``query(...)`` method.
        namespace: Namespace to search.
        vector: Query embedding; its length must equal the configured
            embedding dimension (module-level ``embedding_dim``).
        top_k: Number of matches to request.
        filter_query: Optional metadata filter. When given, matches are
            requested with their metadata included.

    Returns:
        Whatever ``index.query`` returns.

    Raises:
        ValueError: if ``vector`` has the wrong length.
    """
    if len(vector) != int(embedding_dim):
        raise ValueError("Length of vector does not match the embedding dimension")

    if filter_query:
        # The index query API takes the metadata filter under the keyword
        # `filter`; the previous `filter_query=` keyword is not a query argument.
        return index.query(
            namespace=namespace,
            vector=vector,
            filter=filter_query,
            top_k=top_k,
            include_metadata=True,
        )

    return index.query(
        namespace=namespace,
        vector=vector,
        top_k=top_k,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: bind a sample paragraph, then request one embedding.
    # The paragraph itself is not passed to vectorize and the result is discarded.
    paragraph = "This is a test "
    vectorize("This is a test string")
|
||||
@@ -0,0 +1,6 @@
|
||||
|
||||
|
||||
def summarize(text:str):
    """Unfinished summarization stub.

    Only binds a partial prompt prefix; ``text`` is not used yet and the
    function returns ``None``.
    """
    prompt_prefix = "Summarize the follow"
|
||||
|
||||
|
||||
@@ -4,6 +4,9 @@ import os
|
||||
import logging
|
||||
from random import randint
|
||||
|
||||
# TODO: Move this article storage logic to a separate module inside of lambda.
|
||||
from src.article_storage.initialize import get_index, upsert_vectors, query_vectors, vectorize
|
||||
|
||||
from utils import generate_key
|
||||
|
||||
logger = logging.getLogger()
|
||||
@@ -19,12 +22,25 @@ storage_strategy = os.environ.get('STORAGE_STRATEGY')
|
||||
def save_article(article:dict, strategy:str):
    """Persist ``article`` using the storage backend named by ``strategy``.

    Args:
        article: Article record to store.
        strategy: ``"s3"`` or ``"pinecone"``.

    Raises:
        ValueError: if ``strategy`` is not a supported backend name.
    """
    if strategy == "s3":
        s3_save_article(article)
    elif strategy == "pinecone":
        # The article must be forwarded; calling the saver with no arguments
        # raised TypeError for every Pinecone save.
        pinecone_save_article(article)
    else:
        raise ValueError(f"Invalid storage strategy: {strategy}")
|
||||
|
||||
|
||||
def pinecone_save_article(article:dict):
    """Embed an article and upsert it into the Pinecone index.

    The article's ``content`` is vectorized and written as a single record
    whose id is ``article_id``, in a namespace of the form
    ``"<rss_id>-<article_id>"``. The input dict is not modified.

    Args:
        article: Article record; must contain ``article_id``, ``rss_id`` and
            ``content``.

    Raises:
        KeyError: if a required key is missing from ``article``.
    """
    logger.info("Saving article to Pinecone")
    index = get_index()

    article_id = article['article_id']
    rss_feed_id = article['rss_id']

    # Build a dedicated record instead of adding "id"/"values" to the caller's
    # dict: a vector record carries id, values and metadata, so the remaining
    # article fields are moved under "metadata" rather than sent as extra
    # top-level keys.
    # NOTE(review): only scalar fields are kept, and Pinecone limits metadata
    # size per record -- confirm that full article content fits.
    metadata = {
        key: value
        for key, value in article.items()
        if key not in ("id", "values") and isinstance(value, (str, int, float, bool))
    }
    record = {
        "id": article_id,
        "values": vectorize(article["content"]),
        "metadata": metadata,
    }

    namespace = f"{rss_feed_id}-{article_id}"

    upsert_vectors(index, [record], namespace)

    # The index handle is not guaranteed to expose a `name` attribute; fall
    # back to a placeholder so a successful save cannot fail while logging.
    index_label = getattr(index, "name", "<unknown>")
    logger.info(f"Saved article {article_id} to Pinecone index {index_label} with namespace {namespace}")
|
||||
|
||||
def dynamodb_save_article(article:dict):
    """Placeholder for a DynamoDB storage backend; currently does nothing."""
    return None
|
||||
|
||||
@@ -16,7 +16,7 @@ echo "Python 3.12 found. Proceeding..."
|
||||
echo "Section 2: Installing Dependencies"
|
||||
|
||||
# Install dependencies
|
||||
python3.12 -m pip install --upgrade Pillow feedfinder2==0.0.4 python-dateutil newspaper3k==0.2.8 feedparser lxml[html5lib] lxml_html_clean lxml[html_clean] -t python/
|
||||
python3.12 -m pip install --upgrade Pillow feedfinder2==0.0.4 python-dateutil newspaper3k==0.2.8 feedparser lxml[html5lib] lxml_html_clean lxml[html_clean] openai pinecone -t python/
|
||||
echo "Dependencies installed successfully."
|
||||
|
||||
####### Section 3: Creating ZIP File ########
|
||||
|
||||
@@ -51,4 +51,6 @@ STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future.
|
||||
|
||||
# Only fill these out if your storage strategy is pinecone [ Not currently supported. ]
|
||||
PINECONE_API_KEY=***
|
||||
PINECONE_DB_NAME=open-rss-articles
|
||||
PINECONE_DB_NAME=open-rss-articles
|
||||
|
||||
OPENAI_API_KEY=sk**
|
||||
2
todo.md
2
todo.md
@@ -8,7 +8,7 @@
|
||||
* Add in console setup python script for new project into launch.py
|
||||
|
||||
# Next Modules
|
||||
* Vector Database Storage Solution w/ Pinecone.
|
||||
* Vector Database Storage Solution w/ Pinecone.
|
||||
* Gen AI Summarization Module.
|
||||
* Duplicate Article Check Module.
|
||||
* Github Issues & Github Actions.
|
||||
|
||||
Reference in New Issue
Block a user