From 9549fcc7492b0ae2c47cdbdb898876be354b6b2f Mon Sep 17 00:00:00 2001
From: "Charles E. Gormley" <ceg98@drexel.edu>
Date: Sat, 28 Sep 2024 13:09:35 -0400
Subject: [PATCH]

---
 .gitignore                                    |  2 +-
 src/article_storage/create_index.py           | 72 -------------------
 src/article_storage/initialize.py             |  0
 .../src/analytics/embeddings/pinecone.py      | 12 +---
 .../src/analytics/genai/summarization.py      |  2 +-
 .../src/data_storage.py                       | 16 ++---
 template.env                                  |  6 +-
 todo.md                                       | 27 ++++---
 8 files changed, 30 insertions(+), 107 deletions(-)
 delete mode 100644 src/article_storage/create_index.py
 delete mode 100644 src/article_storage/initialize.py

diff --git a/.gitignore b/.gitignore
index 829f5ce..759ae33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,4 @@ repo_structure.txt
 .env
 /layer/python*
 *__pycache__*
-rss_feeds.json
+*feeds.json
diff --git a/src/article_storage/create_index.py b/src/article_storage/create_index.py
deleted file mode 100644
index 7a309f3..0000000
--- a/src/article_storage/create_index.py
+++ /dev/null
@@ -1,72 +0,0 @@
-from pinecone import Pinecone, ServerlessSpec
-
-from initialize import pc
-
-import os
-from dotenv import load_dotenv
-load_dotenv()
-
-
-index_name = "quickstart" # TODO: Remove this line after we are done testing with vector dbs. 
-
-if index_name not in pc.list_indexes().names():
-    pc.create_index(
-        name=index_name,
-        dimension=2,
-        metric="cosine",
-        spec=ServerlessSpec(
-            cloud='aws', 
-            region='us-east-1'
-        ) 
-    ) 
-
-index = pc.Index(index_name)
-
-index.upsert(
-    vectors=[
-        {"id": "vec1", "values": [1.0, 1.5]},
-        {"id": "vec2", "values": [2.0, 1.0]},
-        {"id": "vec3", "values": [0.1, 3.0]},
-    ],
-    namespace="example-namespace1"
-)
-
-index.upsert(
-    vectors=[
-        {"id": "vec2124", "values": [1.0, -2.5]},
-        {"id": "vec21214", "values": [3.0, -2.0]},
-        {"id": "vec31251", "values": [0.5, -1.5]},
-    ],
-    namespace="example-namespace2"
-)
-
-
-
-index.upsert(
-    vectors=[
-        {"id": "vec1", "values": [1.0, -2.5]},
-        {"id": "vec2", "values": [3.0, -2.0]},
-        {"id": "vec3", "values": [0.5, -1.5]},
-    ],
-    namespace="example-namespace2"
-)
-
-print(index.describe_index_stats())
-
-query_results1 = index.query(
-    namespace="example-namespace1",
-    vector=[1.0, 1.5],
-    top_k=3,
-    include_values=True
-)
-
-print(query_results1)
-
-query_results2 = index.query(
-    namespace="example-namespace2",
-    vector=[1.0,-2.5],
-    top_k=3,
-    include_values=True
-)
-
-print(query_results2)
diff --git a/src/article_storage/initialize.py b/src/article_storage/initialize.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
index fdc6765..ff2e154 100644
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
@@ -1,4 +1,4 @@
-# from pinecone import Pinecone
+from pinecone import Pinecone
 
 
 import os
@@ -38,10 +38,6 @@ def vectorize(article:str) -> list[float]:
     )
     
     return response.data[0].embedding 
-    
-    
-    
-
 
 
 def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
@@ -56,7 +52,6 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k
     if len(vector) != int(embedding_dim):
         raise ValueError("Length of vector does not match the embedding dimension")
     
-    
     if filter_query: 
         query = index.query(
             namespace=namespace,
@@ -64,8 +59,7 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k
             filter_query=filter_query,
             top_k=top_k,
             include_metadata=True
-        )
-         
+        ) 
         
     else:
         query = index.query(
@@ -79,5 +73,5 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k
 
 if __name__ == "__main__":
     # Create a large paragraph
-    paragraph = '''This is a test '''
+    paragraph = '''This is a test.'''
     vectorize("This is a test string")
\ No newline at end of file
diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py
index b1f298f..3e109e9 100644
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py
@@ -1,6 +1,6 @@
 
 
 def summarize(text:str): 
-    sub_prompt = "Summarize the follow"
+    sub_prompt = "Summarize the following passage"
     
     
\ No newline at end of file
diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
index ee6da46..71c18a9 100644
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
@@ -5,9 +5,7 @@ import logging
 from random import randint
 
 # TODO: Move this article storage logic to a separate module inside of lambda. 
-from src.article_storage.initialize import get_index, upsert_vectors, query_vectors, vectorize
-
-from utils import generate_key
+from src.analytics.embeddings.pinecone import get_index, upsert_vectors, vectorize
 
 logger = logging.getLogger()
 
@@ -23,7 +21,7 @@ def save_article(article:dict, strategy:str):
     if strategy == "s3":
         s3_save_article(article)
     elif strategy == "pinecone":
-        pinecone_save_article()
+        pinecone_save_article(article)
     else:
         raise ValueError(f"Invalid storage strategy: {strategy}")
     
@@ -31,16 +29,16 @@ def save_article(article:dict, strategy:str):
 def pinecone_save_article(article:dict):
     logger.info("Saving article to Pinecone")
     index = get_index()
-    article_id = article['article_id']
-    rss_feed_id = article['rss_id']
+
+    # Pinecone requires every upserted record to include the 'id' and 'values' keys.
     article["id"] = article["article_id"]
     article["values"] = vectorize(article["content"])
     
-    namespace = f"{rss_feed_id}-{article_id}"
+    namespace = f"IngestRSS-Articles"
     
+    logger.info("Upserting article to Pinecone")
     upsert_vectors(index, [article], namespace)
-
-    logger.info(f"Saved article {article_id} to Pinecone index {index.name} with namespace {namespace}")
+    logger.info(f"Successfully upserted article w/ article-id: {article["article_id"]} to Pinecone index {index.name} with namespace {namespace}")
 
 def dynamodb_save_article(article:dict):
     pass
diff --git a/template.env b/template.env
index ea00255..b9bdd46 100644
--- a/template.env
+++ b/template.env
@@ -17,7 +17,7 @@ DYNAMODB_TABLE_NAME=rss-feeds-table
 SQS_QUEUE_NAME=rss-feed-queue
 
 
-LAMBDA_LAYER_VERSION=3 # This is fixed.
+LAMBDA_LAYER_VERSION=4 # This is fixed.
 LAMBDA_LAYER_NAME=ingest-rss-lambda-layer-${AWS_REGION}
 LAMBDA_LAYER_ARN=arn:aws:lambda:${AWS_REGION}:966265353179:layer:${LAMBDA_LAYER_NAME}:${LAMBDA_LAYER_VERSION}
 
@@ -38,15 +38,11 @@ QUEUE_FILLER_LAMBDA_S3_KEY=RSSQueueFiller.zip
 # Logging Configuration
 LOG_LEVEL=INFO
 
-Slide 1Slide 2
-
-
 # Other Application Settings
 APP_NAME=RSS Feed Processor
 VERSION=1.0.0
 
 
-TEST=1
 STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future. 
 
 # Only need to fill out this if your storage strategy is pinecone [ Not currently supported. ]
diff --git a/todo.md b/todo.md
index 4600ff6..484a4d7 100644
--- a/todo.md
+++ b/todo.md
@@ -1,15 +1,22 @@
-# TODO Before Launch 🚀
-* Fresh AWS Account Test.
+# Testing 🧪
+* Test from a third-party AWS account. [ Today ]
+* Test Large Amounts of Feeds ( Decrease the cadence of ingesting. ) [ Today ]
+* Test out Vector Databases at Small Scale
+* Test out Vector Databases at Scale. 
+* Test out LLM Summarization at Small Scale
+* Test out LLM Summarization at Scale
 
-# Misc TODO:
-* More RSS Feeds for my own personal account.
-* Database versioning Parameter.
-* More RSS Feed Module - Easy
-* Add in console setup python script for new project into launch.py
 
-# Next Modules
-* Vector Database Storage Solution w/ Pinecone. 
-* Gen AI Summarization Module.
+# Application Modules
+* Vector Storage
+* Vector Analysis
+* LLM Summarization
+* Semantic Understanding & Reasoning 
+* AWS Budget, Pinecone Budget, & LLM Budget
+* Integration with bumblebee
+* Visualization System ( Ingesting, Clustering, etc...) 
+
+# Misc
 * Duplicate Article Check Module.
 * Github Issues & Github Actions.