From 8dc42556d7656a32fd5b37fe398bc98bbc332a81 Mon Sep 17 00:00:00 2001
From: Charles-Gormley <ceg98@drexel.edu>
Date: Wed, 25 Sep 2024 17:38:12 -0400
Subject: [PATCH] Add Pinecone article storage with OpenAI embeddings

---
 requirements.txt                              |  3 +-
 src/article_storage/create_index.py           | 14 +++-
 src/article_storage/initialize.py             |  9 --
 .../src/analytics/embeddings/pinecone.py      | 83 +++++++++++++++++++
 .../src/analytics/genai/summarization.py      |  6 ++
 .../src/data_storage.py                       | 18 +++-
 .../lambda_layer/lambda_layer_cloud9.sh       |  2 +-
 template.env                                  |  4 +-
 todo.md                                       |  2 +-
 9 files changed, 125 insertions(+), 16 deletions(-)
 create mode 100644 src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
 create mode 100644 src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py

diff --git a/requirements.txt b/requirements.txt
index de57f73..2e3cc2e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ python-dotenv==1.0.*
 requests==2.32.*
 constructs==10.2.69
 # Optional, yet necessary for the Pinecone SDK functionality.
-pinecone==5.1.*
\ No newline at end of file
+pinecone
+openai
\ No newline at end of file
diff --git a/src/article_storage/create_index.py b/src/article_storage/create_index.py
index 7833fda..7a309f3 100644
--- a/src/article_storage/create_index.py
+++ b/src/article_storage/create_index.py
@@ -6,8 +6,7 @@ import os
 from dotenv import load_dotenv
 load_dotenv()
 
-region = os.getenv("AWS_REGION")
-index_name = os.getenv("PINECONE_DB_NAME")
+
 index_name = "quickstart" # TODO: Remove this line after we are done testing with vector dbs. 
 
 if index_name not in pc.list_indexes().names():
@@ -32,6 +31,17 @@ index.upsert(
     namespace="example-namespace1"
 )
 
+# Seed "example-namespace2" with three sample 2-dimensional vectors.
+index.upsert(
+    vectors=[
+        {"id": "vec2124", "values": [1.0, -2.5]},
+        {"id": "vec21214", "values": [3.0, -2.0]},
+        {"id": "vec31251", "values": [0.5, -1.5]},
+    ],
+    namespace="example-namespace2"
+)
+
+
 index.upsert(
     vectors=[
         {"id": "vec1", "values": [1.0, -2.5]},
diff --git a/src/article_storage/initialize.py b/src/article_storage/initialize.py
index cba1ece..e69de29 100644
--- a/src/article_storage/initialize.py
+++ b/src/article_storage/initialize.py
@@ -1,9 +0,0 @@
-from pinecone import Pinecone
-import os
-from dotenv import load_dotenv
-load_dotenv()
-
-# Set up Pinecone client
-api_key = os.getenv("PINCEONE_API_KEY")
-
-pc = Pinecone(api_key=api_key)
\ No newline at end of file
diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
new file mode 100644
index 0000000..fdc6765
--- /dev/null
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
@@ -0,0 +1,83 @@
+# NOTE(review): this file is named pinecone.py; when run as a script its own
+# directory leads sys.path and shadows the SDK import below - consider renaming.
+import os
+
+from dotenv import load_dotenv
+from openai import OpenAI
+from pinecone import Pinecone
+
+load_dotenv()
+
+# Pinecone and embedding settings come from the environment (see template.env).
+api_key = os.getenv("PINECONE_API_KEY")
+shards = os.getenv("PINECONE_SHARDS")
+embedding_model = os.getenv("VECTOR_EMBEDDING_MODEL")
+embedding_dim = os.getenv("VECTOR_EMBEDDING_DIM")
+vector_search_metric = os.getenv("VECTOR_SEARCH_METRIC")
+index_name = os.getenv("PINECONE_DB_NAME")
+
+client = OpenAI()  # the OpenAI SDK reads OPENAI_API_KEY from the environment
+pc = Pinecone(api_key=api_key)
+
+def get_index():
+    """Return the configured Pinecone index, creating it first if it is missing."""
+    if index_name not in pc.list_indexes().names():
+        # NOTE(review): recent SDKs expect spec=ServerlessSpec/PodSpec; confirm `shards`.
+        pc.create_index(
+            name=index_name,
+            dimension=int(embedding_dim),  # env values are strings; the API needs an int
+            metric=vector_search_metric,
+            shards=shards,
+        )
+    return pc.Index(index_name)
+
+def vectorize(article:str) -> list[float]:
+    """Embed article text with the configured OpenAI embedding model.
+
+    Uses VECTOR_EMBEDDING_MODEL when set, otherwise "text-embedding-3-large".
+    Returns the embedding of the single input as a list of floats.
+    """
+    response = client.embeddings.create(
+        input=article,
+        model=embedding_model or "text-embedding-3-large",
+    )
+    return response.data[0].embedding
+
+
+def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
+    """Upsert `vectors` into `namespace` of `index`; returns nothing."""
+    # Keyword arguments are forwarded unchanged to the index's upsert call.
+    upsert_kwargs = {"vectors": vectors, "namespace": namespace}
+    index.upsert(**upsert_kwargs)
+
+
+def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None):
+    """Return the `top_k` nearest matches for `vector` within `namespace`.
+
+    When `filter_query` is truthy it is sent as the Pinecone metadata filter and
+    metadata is included in the matches; otherwise a plain query is issued.
+    The raw response from index.query is returned unchanged.
+
+    Raises:
+        ValueError: if `vector` does not have VECTOR_EMBEDDING_DIM entries.
+    """
+    if len(vector) != int(embedding_dim):
+        raise ValueError("Length of vector does not match the embedding dimension")
+
+    if filter_query:
+        # Pinecone's query() names the metadata filter argument `filter`.
+        return index.query(
+            namespace=namespace,
+            vector=vector,
+            filter=filter_query,
+            top_k=top_k,
+            include_metadata=True,
+        )
+
+    return index.query(namespace=namespace, vector=vector, top_k=top_k)
+
+
+if __name__ == "__main__":
+    # Manual smoke test: embeds a short string (this calls the OpenAI API).
+    embedding = vectorize("This is a test string")
+    print(f"Embedding length: {len(embedding)}")
\ No newline at end of file
diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py
new file mode 100644
index 0000000..b1f298f
--- /dev/null
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py
@@ -0,0 +1,6 @@
+
+
+def summarize(text:str):
+    """Placeholder for the GenAI summarization step; currently returns None."""
+    # TODO: build the full prompt from `text` and call the model.
+    sub_prompt = "Summarize the following"
\ No newline at end of file
diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
index 0cc787f..ee6da46 100644
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
@@ -4,6 +4,9 @@ import os
 import logging
 from random import randint
 
+# Pinecone/OpenAI helpers live in analytics/embeddings/pinecone.py; src/article_storage/initialize.py is emptied by this patch.
+from analytics.embeddings.pinecone import get_index, upsert_vectors, query_vectors, vectorize
+
 from utils import generate_key
 
 logger = logging.getLogger()
@@ -19,12 +22,25 @@ storage_strategy = os.environ.get('STORAGE_STRATEGY')
 def save_article(article:dict, strategy:str):
     if strategy == "s3":
         s3_save_article(article)
+    elif strategy == "pinecone":
+        pinecone_save_article(article)  # callee requires the article; a bare call raises TypeError
     else:
         raise ValueError(f"Invalid storage strategy: {strategy}")
     
 
 def pinecone_save_article(article:dict):
-    pass
+    """Embed the article's content and upsert it into the Pinecone index.
+
+    Reads `article_id`, `rss_id` and `content`; the caller's dict is not modified.
+    """
+    logger.info("Saving article to Pinecone")
+    article_id = article["article_id"]
+    namespace = f"{article['rss_id']}-{article_id}"
+    # Pinecone rejects vector dicts with keys other than id/values/metadata/
+    # sparse_values, so build a dedicated record instead of sending the article.
+    record = {"id": article_id, "values": vectorize(article["content"])}
+    upsert_vectors(get_index(), [record], namespace)
+    logger.info(f"Saved article {article_id} to Pinecone namespace {namespace}")
 
 def dynamodb_save_article(article:dict):
     pass
diff --git a/src/infra/lambdas/lambda_utils/lambda_layer/lambda_layer_cloud9.sh b/src/infra/lambdas/lambda_utils/lambda_layer/lambda_layer_cloud9.sh
index 8cfa791..14b7e13 100644
--- a/src/infra/lambdas/lambda_utils/lambda_layer/lambda_layer_cloud9.sh
+++ b/src/infra/lambdas/lambda_utils/lambda_layer/lambda_layer_cloud9.sh
@@ -16,7 +16,7 @@ echo "Python 3.12 found. Proceeding..."
 echo "Section 2: Installing Dependencies"
 
 # Install dependencies
-python3.12 -m pip install --upgrade Pillow feedfinder2==0.0.4 python-dateutil newspaper3k==0.2.8 feedparser lxml[html5lib] lxml_html_clean lxml[html_clean] -t python/
+python3.12 -m pip install --upgrade Pillow feedfinder2==0.0.4 python-dateutil newspaper3k==0.2.8 feedparser lxml[html5lib] lxml_html_clean lxml[html_clean] openai pinecone -t python/
 echo "Dependencies installed successfully."
 
 ####### Section 3: Creating ZIP File ########
diff --git a/template.env b/template.env
index fd062ee..ea00255 100644
--- a/template.env
+++ b/template.env
@@ -51,4 +51,10 @@ STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future.
 
 # Only need to fill out this if your storage strategy is pinecone [ Not currently supported. ]
 PINECONE_API_KEY=***
-PINECONE_DB_NAME=open-rss-articles
\ No newline at end of file
+PINECONE_DB_NAME=open-rss-articles
+PINECONE_SHARDS=1
+VECTOR_EMBEDDING_MODEL=text-embedding-3-large
+VECTOR_EMBEDDING_DIM=3072
+VECTOR_SEARCH_METRIC=cosine
+
+OPENAI_API_KEY=sk**
\ No newline at end of file
diff --git a/todo.md b/todo.md
index 7ecd395..4600ff6 100644
--- a/todo.md
+++ b/todo.md
@@ -8,7 +8,7 @@
 * Add in console setup python script for new project into launch.py
 
 # Next Modules
-* Vector Database Storage Solution w/ Pinecone.
+* Vector Database Storage Solution w/ Pinecone. 
 * Gen AI Summarization Module.
 * Duplicate Article Check Module.
 * Github Issues & Github Actions.