2026-02-12 09:44:29 +01:00 · 2024-09-28 13:09:35 -04:00
parent 8dc42556d7
commit 9549fcc749
8 changed files with 30 additions and 107 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,4 @@ repo_structure.txt
 .env
 /layer/python*
 *__pycache__*
-rss_feeds.json
+*feeds.json
--- a/src/article_storage/create_index.py
+++ b/src/article_storage/create_index.py
@@ -1,72 +0,0 @@
-from pinecone import Pinecone, ServerlessSpec
-
-from initialize import pc
-
-import os
-from dotenv import load_dotenv
-load_dotenv()
-
-
-index_name = "quickstart" # TODO: Remove this line after we are done testing with vector dbs. 
-
-if index_name not in pc.list_indexes().names():
-    pc.create_index(
-        name=index_name,
-        dimension=2,
-        metric="cosine",
-        spec=ServerlessSpec(
-            cloud='aws', 
-            region='us-east-1'
-        ) 
-    ) 
-
-index = pc.Index(index_name)
-
-index.upsert(
-    vectors=[
-        {"id": "vec1", "values": [1.0, 1.5]},
-        {"id": "vec2", "values": [2.0, 1.0]},
-        {"id": "vec3", "values": [0.1, 3.0]},
-    ],
-    namespace="example-namespace1"
-)
-
-index.upsert(
-    vectors=[
-        {"id": "vec2124", "values": [1.0, -2.5]},
-        {"id": "vec21214", "values": [3.0, -2.0]},
-        {"id": "vec31251", "values": [0.5, -1.5]},
-    ],
-    namespace="example-namespace2"
-)
-
-
-
-index.upsert(
-    vectors=[
-        {"id": "vec1", "values": [1.0, -2.5]},
-        {"id": "vec2", "values": [3.0, -2.0]},
-        {"id": "vec3", "values": [0.5, -1.5]},
-    ],
-    namespace="example-namespace2"
-)
-
-print(index.describe_index_stats())
-
-query_results1 = index.query(
-    namespace="example-namespace1",
-    vector=[1.0, 1.5],
-    top_k=3,
-    include_values=True
-)
-
-print(query_results1)
-
-query_results2 = index.query(
-    namespace="example-namespace2",
-    vector=[1.0,-2.5],
-    top_k=3,
-    include_values=True
-)
-
-print(query_results2)
--- a/src/article_storage/initialize.py
+++ b/src/article_storage/initialize.py
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py
@@ -1,4 +1,4 @@
-# from pinecone import Pinecone
+from pinecone import Pinecone


 import os
@@ -38,10 +38,6 @@ def vectorize(article:str) -> list[float]:
    )
    
    return response.data[0].embedding 
-    
-    
-    
-


 def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
@@ -56,7 +52,6 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k
    if len(vector) != int(embedding_dim):
        raise ValueError("Length of vector does not match the embedding dimension")
    
-    
    if filter_query: 
        query = index.query(
            namespace=namespace,
@@ -64,8 +59,7 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k
            filter_query=filter_query,
            top_k=top_k,
            include_metadata=True
-        )
-         
+        ) 
        
    else:
        query = index.query(
@@ -79,5 +73,5 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k

 if __name__ == "__main__":
    # Create a large paragraph
-    paragraph = '''This is a test '''
+    paragraph = '''This is a test.'''
    vectorize("This is a test string")
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py
@@ -1,6 +1,6 @@


 def summarize(text:str): 
-    sub_prompt = "Summarize the follow"
+    sub_prompt = "Summarize the following passage"
    
    
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
@@ -5,9 +5,7 @@ import logging
 from random import randint

 # TODO: Move this article storage logic to a separate module inside of lambda. 
-from src.article_storage.initialize import get_index, upsert_vectors, query_vectors, vectorize
-
-from utils import generate_key
+from src.analytics.embeddings.pinecone import get_index, upsert_vectors, vectorize

 logger = logging.getLogger()

@@ -23,7 +21,7 @@ def save_article(article:dict, strategy:str):
    if strategy == "s3":
        s3_save_article(article)
    elif strategy == "pinecone":
-        pinecone_save_article()
+        pinecone_save_article(article)
    else:
        raise ValueError(f"Invalid storage strategy: {strategy}")
    
@@ -31,16 +29,16 @@ def save_article(article:dict, strategy:str):
 def pinecone_save_article(article:dict):
    logger.info("Saving article to Pinecone")
    index = get_index()
-    article_id = article['article_id']
-    rss_feed_id = article['rss_id']
+
+    # Expected Keys from Pinecone *MUST* include 'id' and 'values'
    article["id"] = article["article_id"]
    article["values"] = vectorize(article["content"])
    
-    namespace = f"{rss_feed_id}-{article_id}"
+    namespace = f"IngestRSS-Articles"
    
+    logger.info("Upserting article to Pinecone")
    upsert_vectors(index, [article], namespace)
-
-    logger.info(f"Saved article {article_id} to Pinecone index {index.name} with namespace {namespace}")
+    logger.info(f"Successfully upserted article w/ article-id: {article["article_id"]} to Pinecone index {index.name} with namespace {namespace}")

 def dynamodb_save_article(article:dict):
    pass
--- a/template.env
+++ b/template.env
@@ -17,7 +17,7 @@ DYNAMODB_TABLE_NAME=rss-feeds-table
 SQS_QUEUE_NAME=rss-feed-queue


-LAMBDA_LAYER_VERSION=3 # This is fixed.
+LAMBDA_LAYER_VERSION=4 # This is fixed.
 LAMBDA_LAYER_NAME=ingest-rss-lambda-layer-${AWS_REGION}
 LAMBDA_LAYER_ARN=arn:aws:lambda:${AWS_REGION}:966265353179:layer:${LAMBDA_LAYER_NAME}:${LAMBDA_LAYER_VERSION}

@@ -38,15 +38,11 @@ QUEUE_FILLER_LAMBDA_S3_KEY=RSSQueueFiller.zip
 # Logging Configuration
 LOG_LEVEL=INFO

-Slide 1Slide 2
-
-
 # Other Application Settings
 APP_NAME=RSS Feed Processor
 VERSION=1.0.0


-TEST=1
 STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future. 

 # Only need to fill out this if your storage strategy is pinecone [ Not currently supported. ]
--- a/todo.md
+++ b/todo.md
@@ -1,15 +1,22 @@
-# TODO Before Launch 🚀
-* Fresh AWS Account Test.
+# Testing🧪
+* Testing from 3rd party aws account. [ Today ]
+* Test Large Amounts of Feeds ( Decrease the cadence of ingesting. ) [ Today ]
+* Test out Vector Databases at Small Scale
+* Test out Vector Databases at Scale. 
+* Test out LLM Summarization At Small Scale
+* Test out LLM Summarization At Scale

-# Misc TODO:
-* More RSS Feeds for my own personal account.
-* Database versioning Parameter.
-* More RSS Feed Module - Easy
-* Add in console setup python script for new project into launch.py

-# Next Modules
-* Vector Database Storage Solution w/ Pinecone. 
-* Gen AI Summarization Module.
+# Application Modules
+* Vector Storage
+* Vector Analysis
+* LLM Summarization
+* Semantic Understanding & Reasoning 
+* AWS Budget, Pinecone Budget, & LLM Budget
+* Integration with bumblebee
+* Visualization System ( Ingesting, Clustering, etc...) 
+
+# Misc
 * Duplicate Article Check Module.
 * Github Issues & Github Actions.