Stuff

2026-02-17 20:24:31 +01:00 · 2024-09-25 17:38:12 -04:00
parent 048770aa45
commit 8dc42556d7
9 changed files with 125 additions and 16 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ python-dotenv==1.0.*
 requests==2.32.*
 constructs==10.2.69
 # Optional, yet necessary for the Pinecone SDK functionality.
-pinecone==5.1.*
+pinecone
+openai
--- a/src/article_storage/create_index.py
+++ b/src/article_storage/create_index.py
@@ -6,8 +6,7 @@ import os
 from dotenv import load_dotenv
 load_dotenv()

-region = os.getenv("AWS_REGION")
-index_name = os.getenv("PINECONE_DB_NAME")
+
 index_name = "quickstart" # TODO: Remove this line after we are done testing with vector dbs. 

 if index_name not in pc.list_indexes().names():
@@ -32,6 +31,17 @@ index.upsert(
    namespace="example-namespace1"
 )

+index.upsert(
+    vectors=[
+        {"id": "vec2124", "values": [1.0, -2.5]},
+        {"id": "vec21214", "values": [3.0, -2.0]},
+        {"id": "vec31251", "values": [0.5, -1.5]},
+    ],
+    namespace="example-namespace2"
+)
+
+
+
 index.upsert(
    vectors=[
        {"id": "vec1", "values": [1.0, -2.5]},
--- a/src/article_storage/initialize.py
+++ b/src/article_storage/initialize.py
@@ -1,9 +0,0 @@
-from pinecone import Pinecone
-import os
-from dotenv import load_dotenv
-load_dotenv()
-
-# Set up Pinecone client
-api_key = os.getenv("PINCEONE_API_KEY")
-
-pc = Pinecone(api_key=api_key)
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
@@ -0,0 +1,83 @@
+"""Pinecone index and OpenAI embedding helpers for article storage."""
+# NOTE(review): this file is itself named pinecone.py; if run directly as a script it may
+# shadow the installed pinecone package for the import below -- TODO confirm / rename.
+import os
+
+from dotenv import load_dotenv
+from openai import OpenAI
+from pinecone import Pinecone
+
+load_dotenv()
+
+# Client configuration is read from environment variables.
+api_key = os.getenv("PINECONE_API_KEY")
+shards = os.getenv("PINECONE_SHARDS")
+embedding_model = os.getenv("VECTOR_EMBEDDING_MODEL")
+embedding_dim = os.getenv("VECTOR_EMBEDDING_DIM")
+vector_search_metric = os.getenv("VECTOR_SEARCH_METRIC")
+index_name = os.getenv("PINECONE_DB_NAME")
+client = OpenAI()
+pc = Pinecone(api_key=api_key)
+
+def get_index():
+    """Return a handle to the configured index, creating the index first if it is absent."""
+    if index_name not in pc.list_indexes().names():
+        # NOTE(review): recent SDK versions expect a spec= argument here -- confirm shards= is accepted.
+        pc.create_index(
+            name=index_name,
+            dimension=int(embedding_dim),  # env values are strings; the dimension must be an int
+            metric=vector_search_metric,
+            shards=shards,
+        )
+    return pc.Index(index_name)
+
+def vectorize(article:str) -> list[float]:
+    """Embed ``article`` with the OpenAI embeddings API and return the first embedding."""
+    response = client.embeddings.create(
+        input=article,
+        model=embedding_model or "text-embedding-3-large",  # honour VECTOR_EMBEDDING_MODEL when set
+    )
+    return response.data[0].embedding
+    
+    
+    
+
+
+
+def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
+    """Write ``vectors`` into ``namespace`` of ``index`` with one upsert call.
+
+    The result of ``index.upsert`` is discarded; nothing is returned."""
+    index.upsert(vectors=vectors, namespace=namespace)
+
+
+def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None):
+    """Run a similarity query against ``index`` and return the raw query response.
+
+    Args:
+        index: Index handle to query.
+        namespace: Namespace to search within.
+        vector: Query vector; its length must equal VECTOR_EMBEDDING_DIM.
+        top_k: Number of matches to request.
+        filter_query: Optional metadata filter. When non-empty, it is sent as the
+            query's ``filter`` argument and metadata is included in the result.
+
+    Raises:
+        ValueError: If ``vector`` has the wrong length.
+    """
+    if len(vector) != int(embedding_dim):
+        raise ValueError("Length of vector does not match the embedding dimension")
+
+    search_args = {"namespace": namespace, "vector": vector, "top_k": top_k}
+    if filter_query:
+        # The SDK keyword for metadata filtering is ``filter``, not ``filter_query``.
+        search_args["filter"] = filter_query
+        search_args["include_metadata"] = True
+
+    return index.query(**search_args)
+
+
+if __name__ == "__main__":
+    # Manual smoke test: embeds one short string through the OpenAI client;
+    # the returned embedding is discarded.
+    vectorize("This is a test string")
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py
@@ -0,0 +1,6 @@
+
+
+def summarize(text:str): 
+    """Stub for article summarization: only assigns a prompt prefix and returns None."""
+    # TODO: complete the prompt and use ``text``; neither is used yet.
+    sub_prompt = "Summarize the follow"
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
@@ -4,6 +4,9 @@ import os
 import logging
 from random import randint

+# TODO: Move this article storage logic to a separate module inside of lambda. 
+from analytics.embeddings.pinecone import get_index, upsert_vectors, query_vectors, vectorize
+
 from utils import generate_key

 logger = logging.getLogger()
@@ -19,12 +22,25 @@ storage_strategy = os.environ.get('STORAGE_STRATEGY')
 def save_article(article:dict, strategy:str):
    if strategy == "s3":
        s3_save_article(article)
+    elif strategy == "pinecone":
+        pinecone_save_article(article)  # forward the article; the callee requires it
    else:
        raise ValueError(f"Invalid storage strategy: {strategy}")
    

 def pinecone_save_article(article:dict):
-    pass
+    """Embed article["content"] and upsert it as one vector; the caller's dict is not modified."""
+    logger.info("Saving article to Pinecone")
+    article_id = article['article_id']
+    rss_feed_id = article['rss_id']
+    # Vector dicts only take id/values/metadata keys, so the other fields go under metadata.
+    # NOTE(review): assumes the remaining fields are valid Pinecone metadata values -- TODO confirm.
+    metadata = {k: v for k, v in article.items() if k != "content" and v is not None}
+    vector = {"id": article_id, "values": vectorize(article["content"]), "metadata": metadata}
+    namespace = f"{rss_feed_id}-{article_id}"
+    index = get_index()
+    upsert_vectors(index, [vector], namespace)
+    logger.info(f"Saved article {article_id} to Pinecone index {getattr(index, 'name', 'unknown')} with namespace {namespace}")

 def dynamodb_save_article(article:dict):
    pass
--- a/src/infra/lambdas/lambda_utils/lambda_layer/lambda_layer_cloud9.sh
+++ b/src/infra/lambdas/lambda_utils/lambda_layer/lambda_layer_cloud9.sh
@@ -16,7 +16,7 @@ echo "Python 3.12 found. Proceeding..."
 echo "Section 2: Installing Dependencies"

 # Install dependencies
-python3.12 -m pip install --upgrade Pillow feedfinder2==0.0.4 python-dateutil newspaper3k==0.2.8 feedparser lxml[html5lib] lxml_html_clean lxml[html_clean] -t python/
+python3.12 -m pip install --upgrade Pillow feedfinder2==0.0.4 python-dateutil newspaper3k==0.2.8 feedparser lxml[html5lib] lxml_html_clean lxml[html_clean] openai pinecone -t python/
 echo "Dependencies installed successfully."

 ####### Section 3: Creating ZIP File ########
--- a/template.env
+++ b/template.env
@@ -51,4 +51,6 @@ STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future.

 # Only need to fill out this if your storage strategy is pinecone [ Not currently supported. ]
 PINECONE_API_KEY=***
-PINECONE_DB_NAME=open-rss-articles
+PINECONE_DB_NAME=open-rss-articles
+
+OPENAI_API_KEY=sk**
--- a/todo.md
+++ b/todo.md
@@ -8,7 +8,7 @@
 * Add in console setup python script for new project into launch.py

 # Next Modules
-* Vector Database Storage Solution w/ Pinecone.
+* Vector Database Storage Solution w/ Pinecone. 
 * Gen AI Summarization Module.
 * Duplicate Article Check Module.
 * Github Issues & Github Actions.