From 8dc42556d7656a32fd5b37fe398bc98bbc332a81 Mon Sep 17 00:00:00 2001
From: Charles-Gormley
Date: Wed, 25 Sep 2024 17:38:12 -0400
Subject: [PATCH] Stuff

---
Review notes (below the `---`, so not part of the commit message):
* embeddings/pinecone.py: import `Pinecone` (it was commented out, so the
  module raised NameError on import); read PINECONE_API_KEY, matching
  template.env, instead of the misspelled PINCEONE_API_KEY; pass the index
  dimension as an int; use VECTOR_EMBEDDING_MODEL with the previous model as
  the fallback; pass `filter=` to Index.query; drop the unused `paragraph`.
* data_storage.py: save_article() now passes the article through; the
  helpers are imported lazily from analytics.embeddings.pinecone because
  initialize.py is emptied by this patch (assumes the lambda `src` directory
  is the import root, as `from utils import ...` suggests - confirm); only
  `id`/`values` are upserted; the log line no longer reads `index.name`.
* Still open: create_index(..., shards=...) must be checked against the
  Pinecone SDK; pinecone.py shadows the `pinecone` package when it is run as
  a script; PINECONE_SHARDS and the VECTOR_* variables are not in
  template.env; the todo.md hunk is a whitespace-only change.

 requirements.txt                              |  3 +-
 src/article_storage/create_index.py           | 14 +++-
 src/article_storage/initialize.py             |  9 --
 .../src/analytics/embeddings/pinecone.py      | 83 +++++++++++++++++++
 .../src/analytics/genai/summarization.py      |  6 ++
 .../src/data_storage.py                       | 18 +++-
 .../lambda_layer/lambda_layer_cloud9.sh       |  2 +-
 template.env                                  |  4 +-
 todo.md                                       |  2 +-
 9 files changed, 125 insertions(+), 16 deletions(-)
 create mode 100644 src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
 create mode 100644 src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py

diff --git a/requirements.txt b/requirements.txt
index de57f73..2e3cc2e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ python-dotenv==1.0.*
 requests==2.32.*
 constructs==10.2.69
 # Optional, yet necessary for the Pinecone SDK functionality.
-pinecone==5.1.*
\ No newline at end of file
+pinecone
+openai
\ No newline at end of file
diff --git a/src/article_storage/create_index.py b/src/article_storage/create_index.py
index 7833fda..7a309f3 100644
--- a/src/article_storage/create_index.py
+++ b/src/article_storage/create_index.py
@@ -6,8 +6,7 @@ import os
 from dotenv import load_dotenv
 load_dotenv()
 
-region = os.getenv("AWS_REGION")
-index_name = os.getenv("PINECONE_DB_NAME")
+
 index_name = "quickstart" # TODO: Remove this line after we are done testing with vector dbs.
 
 if index_name not in pc.list_indexes().names():
@@ -32,6 +31,17 @@ index.upsert(
     namespace="example-namespace1"
 )
 
+index.upsert(
+    vectors=[
+        {"id": "vec2124", "values": [1.0, -2.5]},
+        {"id": "vec21214", "values": [3.0, -2.0]},
+        {"id": "vec31251", "values": [0.5, -1.5]},
+    ],
+    namespace="example-namespace2"
+)
+
+
+
 index.upsert(
     vectors=[
         {"id": "vec1", "values": [1.0, -2.5]},
diff --git a/src/article_storage/initialize.py b/src/article_storage/initialize.py
index cba1ece..e69de29 100644
--- a/src/article_storage/initialize.py
+++ b/src/article_storage/initialize.py
@@ -1,9 +0,0 @@
-from pinecone import Pinecone
-import os
-from dotenv import load_dotenv
-load_dotenv()
-
-# Set up Pinecone client
-api_key = os.getenv("PINCEONE_API_KEY")
-
-pc = Pinecone(api_key=api_key)
\ No newline at end of file
diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
new file mode 100644
index 0000000..fdc6765
--- /dev/null
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
@@ -0,0 +1,83 @@
+"""Pinecone index helpers and OpenAI embedding utilities for article storage."""
+import os
+
+from dotenv import load_dotenv
+from openai import OpenAI
+from pinecone import Pinecone
+
+load_dotenv()
+
+# Set up Pinecone client
+api_key = os.getenv("PINECONE_API_KEY")
+shards = os.getenv("PINECONE_SHARDS")
+embedding_model = os.getenv("VECTOR_EMBEDDING_MODEL")
+embedding_dim = os.getenv("VECTOR_EMBEDDING_DIM")
+vector_search_metric = os.getenv("VECTOR_SEARCH_METRIC")
+index_name = os.getenv("PINECONE_DB_NAME")
+
+client = OpenAI()
+
+pc = Pinecone(api_key=api_key)
+
+
+def get_index():
+    """Return a handle to the configured index, creating the index if it is missing."""
+    if index_name not in pc.list_indexes().names():
+        # NOTE(review): the current Pinecone SDK appears to require `spec=`
+        # (ServerlessSpec/PodSpec) rather than `shards=` - confirm before relying on this.
+        pc.create_index(
+            name=index_name,
+            dimension=int(embedding_dim),  # environment values are strings
+            metric=vector_search_metric,
+            shards=shards
+        )
+
+    return pc.Index(index_name)
+
+
+def vectorize(article:str) -> list[float]:
+    """Embed `article` with the OpenAI embeddings API and return the vector."""
+    response = client.embeddings.create(
+        input=article,
+        model=embedding_model or "text-embedding-3-large"  # env override, old default as fallback
+    )
+
+    return response.data[0].embedding
+
+
+def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
+    """Upsert `vectors` into `namespace` of `index`."""
+    index.upsert(
+        vectors=vectors,
+        namespace=namespace
+    )
+
+
+def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None):
+    """Query `index` for the `top_k` matches of `vector`, optionally filtered by `filter_query`.
+
+    Raises ValueError if `vector` does not have VECTOR_EMBEDDING_DIM entries.
+    """
+    if len(vector) != int(embedding_dim):
+        raise ValueError("Length of vector does not match the embedding dimension")
+
+    if filter_query:
+        query = index.query(
+            namespace=namespace,
+            vector=vector,
+            filter=filter_query,  # Index.query takes `filter`, not `filter_query`
+            top_k=top_k,
+            include_metadata=True
+        )
+    else:
+        query = index.query(
+            namespace=namespace,
+            vector=vector,
+            top_k=top_k
+        )
+
+    return query
+
+
+if __name__ == "__main__":
+    vectorize("This is a test string")
\ No newline at end of file
diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py
new file mode 100644
index 0000000..b1f298f
--- /dev/null
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py
@@ -0,0 +1,6 @@
+
+
+def summarize(text:str):
+    sub_prompt = "Summarize the follow"
+
+    
\ No newline at end of file
diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
index 0cc787f..ee6da46 100644
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
@@ -4,6 +4,9 @@ import os
 import logging
 from random import randint
 
+# NOTE: the Pinecone/OpenAI helpers are imported lazily inside
+# pinecone_save_article() so the default "s3" strategy needs no API keys.
+
 from utils import generate_key
 
 logger = logging.getLogger()
@@ -19,12 +22,25 @@ storage_strategy = os.environ.get('STORAGE_STRATEGY')
 def save_article(article:dict, strategy:str):
     if strategy == "s3":
         s3_save_article(article)
+    elif strategy == "pinecone":
+        pinecone_save_article(article)
     else:
         raise ValueError(f"Invalid storage strategy: {strategy}")
 
 
 def pinecone_save_article(article:dict):
-    pass
+    """Embed the article content and upsert it into the Pinecone index."""
+    # Imported lazily: the module builds OpenAI/Pinecone clients at import time.
+    from analytics.embeddings.pinecone import get_index, upsert_vectors, vectorize
+
+    logger.info("Saving article to Pinecone")
+    article_id = article['article_id']
+    rss_feed_id = article['rss_id']
+    # Pinecone rejects unknown top-level keys, so send only id/values.
+    vector = {"id": article_id, "values": vectorize(article["content"])}
+    namespace = f"{rss_feed_id}-{article_id}"
+    upsert_vectors(get_index(), [vector], namespace)
+    logger.info(f"Saved article {article_id} to Pinecone with namespace {namespace}")
 
 def dynamodb_save_article(article:dict):
     pass
diff --git a/src/infra/lambdas/lambda_utils/lambda_layer/lambda_layer_cloud9.sh b/src/infra/lambdas/lambda_utils/lambda_layer/lambda_layer_cloud9.sh
index 8cfa791..14b7e13 100644
--- a/src/infra/lambdas/lambda_utils/lambda_layer/lambda_layer_cloud9.sh
+++ b/src/infra/lambdas/lambda_utils/lambda_layer/lambda_layer_cloud9.sh
@@ -16,7 +16,7 @@ echo "Python 3.12 found. Proceeding..."
 echo "Section 2: Installing Dependencies"
 
 # Install dependencies
-python3.12 -m pip install --upgrade Pillow feedfinder2==0.0.4 python-dateutil newspaper3k==0.2.8 feedparser lxml[html5lib] lxml_html_clean lxml[html_clean] -t python/
+python3.12 -m pip install --upgrade Pillow feedfinder2==0.0.4 python-dateutil newspaper3k==0.2.8 feedparser lxml[html5lib] lxml_html_clean lxml[html_clean] openai pinecone -t python/
 echo "Dependencies installed successfully."
 
 ####### Section 3: Creating ZIP File ########
diff --git a/template.env b/template.env
index fd062ee..ea00255 100644
--- a/template.env
+++ b/template.env
@@ -51,4 +51,6 @@ STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future.
 
 # Only need to fill out this if your storage strategy is pinecone [ Not currently supported. ]
 PINECONE_API_KEY=***
-PINECONE_DB_NAME=open-rss-articles
\ No newline at end of file
+PINECONE_DB_NAME=open-rss-articles
+
+OPENAI_API_KEY=sk**
\ No newline at end of file
diff --git a/todo.md b/todo.md
index 7ecd395..4600ff6 100644
--- a/todo.md
+++ b/todo.md
@@ -8,7 +8,7 @@
 * Add in console setup python script for new project into launch.py
 
 # Next Modules
-* Vector Database Storage Solution w/ Pinecone. 
+* Vector Database Storage Solution w/ Pinecone.
 * Gen AI Summarization Module.
 * Duplicate Article Check Module.
 * Github Issues & Github Actions.