From 9549fcc7492b0ae2c47cdbdb898876be354b6b2f Mon Sep 17 00:00:00 2001 From: "Charles E. Gormley" Date: Sat, 28 Sep 2024 13:09:35 -0400 Subject: [PATCH] --- .gitignore | 2 +- src/article_storage/create_index.py | 72 ------------------- src/article_storage/initialize.py | 0 .../src/analytics/embeddings/pinecone.py | 12 +--- .../src/analytics/genai/summarization.py | 2 +- .../src/data_storage.py | 16 ++--- template.env | 6 +- todo.md | 27 ++++--- 8 files changed, 30 insertions(+), 107 deletions(-) delete mode 100644 src/article_storage/create_index.py delete mode 100644 src/article_storage/initialize.py diff --git a/.gitignore b/.gitignore index 829f5ce..759ae33 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,4 @@ repo_structure.txt .env /layer/python* *__pycache__* -rss_feeds.json +*feeds.json diff --git a/src/article_storage/create_index.py b/src/article_storage/create_index.py deleted file mode 100644 index 7a309f3..0000000 --- a/src/article_storage/create_index.py +++ /dev/null @@ -1,72 +0,0 @@ -from pinecone import Pinecone, ServerlessSpec - -from initialize import pc - -import os -from dotenv import load_dotenv -load_dotenv() - - -index_name = "quickstart" # TODO: Remove this line after we are done testing with vector dbs. 
- -if index_name not in pc.list_indexes().names(): - pc.create_index( - name=index_name, - dimension=2, - metric="cosine", - spec=ServerlessSpec( - cloud='aws', - region='us-east-1' - ) - ) - -index = pc.Index(index_name) - -index.upsert( - vectors=[ - {"id": "vec1", "values": [1.0, 1.5]}, - {"id": "vec2", "values": [2.0, 1.0]}, - {"id": "vec3", "values": [0.1, 3.0]}, - ], - namespace="example-namespace1" -) - -index.upsert( - vectors=[ - {"id": "vec2124", "values": [1.0, -2.5]}, - {"id": "vec21214", "values": [3.0, -2.0]}, - {"id": "vec31251", "values": [0.5, -1.5]}, - ], - namespace="example-namespace2" -) - - - -index.upsert( - vectors=[ - {"id": "vec1", "values": [1.0, -2.5]}, - {"id": "vec2", "values": [3.0, -2.0]}, - {"id": "vec3", "values": [0.5, -1.5]}, - ], - namespace="example-namespace2" -) - -print(index.describe_index_stats()) - -query_results1 = index.query( - namespace="example-namespace1", - vector=[1.0, 1.5], - top_k=3, - include_values=True -) - -print(query_results1) - -query_results2 = index.query( - namespace="example-namespace2", - vector=[1.0,-2.5], - top_k=3, - include_values=True -) - -print(query_results2) diff --git a/src/article_storage/initialize.py b/src/article_storage/initialize.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py index fdc6765..ff2e154 100644 --- a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py +++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/pinecone.py @@ -1,4 +1,4 @@ -# from pinecone import Pinecone +from pinecone import Pinecone import os @@ -38,10 +38,6 @@ def vectorize(article:str) -> list[float]: ) return response.data[0].embedding - - - - def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str): @@ -56,7 +52,6 @@ def query_vectors(index:Pinecone.Index, namespace:str, 
vector:list[float], top_k if len(vector) != int(embedding_dim): raise ValueError("Length of vector does not match the embedding dimension") - if filter_query: query = index.query( namespace=namespace, @@ -64,8 +59,7 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k filter_query=filter_query, top_k=top_k, include_metadata=True - ) - + ) else: query = index.query( @@ -79,5 +73,5 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k if __name__ == "__main__": # Create a large paragraph - paragraph = '''This is a test ''' + paragraph = '''This is a test.''' vectorize("This is a test string") \ No newline at end of file diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py index b1f298f..3e109e9 100644 --- a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py +++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/genai/summarization.py @@ -1,6 +1,6 @@ def summarize(text:str): - sub_prompt = "Summarize the follow" + sub_prompt = "Summarize the following passage" \ No newline at end of file diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py index ee6da46..71c18a9 100644 --- a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py +++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py @@ -5,9 +5,7 @@ import logging from random import randint # TODO: Move this article storage logic to a separate module inside of lambda. 
-from src.article_storage.initialize import get_index, upsert_vectors, query_vectors, vectorize - -from utils import generate_key +from src.analytics.embeddings.pinecone import get_index, upsert_vectors, vectorize logger = logging.getLogger() @@ -23,7 +21,7 @@ def save_article(article:dict, strategy:str): if strategy == "s3": s3_save_article(article) elif strategy == "pinecone": - pinecone_save_article() + pinecone_save_article(article) else: raise ValueError(f"Invalid storage strategy: {strategy}") @@ -31,16 +29,16 @@ def save_article(article:dict, strategy:str): def pinecone_save_article(article:dict): logger.info("Saving article to Pinecone") index = get_index() - article_id = article['article_id'] - rss_feed_id = article['rss_id'] + + # Expected Keys from Pinecone *MUST* include 'id' and 'values' article["id"] = article["article_id"] article["values"] = vectorize(article["content"]) - namespace = f"{rss_feed_id}-{article_id}" + namespace = f"IngestRSS-Articles" + logger.info("Upserting article to Pinecone") upsert_vectors(index, [article], namespace) - - logger.info(f"Saved article {article_id} to Pinecone index {index.name} with namespace {namespace}") + logger.info(f"Successfully upserted article w/ article-id: {article["article_id"]} to Pinecone index {index.name} with namespace {namespace}") def dynamodb_save_article(article:dict): pass diff --git a/template.env b/template.env index ea00255..b9bdd46 100644 --- a/template.env +++ b/template.env @@ -17,7 +17,7 @@ DYNAMODB_TABLE_NAME=rss-feeds-table SQS_QUEUE_NAME=rss-feed-queue -LAMBDA_LAYER_VERSION=3 # This is fixed. +LAMBDA_LAYER_VERSION=4 # This is fixed. 
LAMBDA_LAYER_NAME=ingest-rss-lambda-layer-${AWS_REGION} LAMBDA_LAYER_ARN=arn:aws:lambda:${AWS_REGION}:966265353179:layer:${LAMBDA_LAYER_NAME}:${LAMBDA_LAYER_VERSION} @@ -38,15 +38,11 @@ QUEUE_FILLER_LAMBDA_S3_KEY=RSSQueueFiller.zip # Logging Configuration LOG_LEVEL=INFO -Slide 1Slide 2 - - # Other Application Settings APP_NAME=RSS Feed Processor VERSION=1.0.0 -TEST=1 STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future. # Only need to fill out this if your storage strategy is pinecone [ Not currently supported. ] diff --git a/todo.md b/todo.md index 4600ff6..484a4d7 100644 --- a/todo.md +++ b/todo.md @@ -1,15 +1,22 @@ -# TODO Before Launch 🚀 -* Fresh AWS Account Test. +# Testing🧪 +* Testing from 3rd party aws account. [ Today ] +* Test Large Amounts of Feeds ( Decrease the cadence of ingesting. ) [ Today ] +* Test out Vector Databases at Small Scale +* Test out Vector Databases at Scale. +* Test out LLM Summarization At Small Scale +* Test out LLM Summarization At Scale -# Misc TODO: -* More RSS Feeds for my own personal account. -* Database versioning Parameter. -* More RSS Feed Module - Easy -* Add in console setup python script for new project into launch.py -# Next Modules -* Vector Database Storage Solution w/ Pinecone. -* Gen AI Summarization Module. +# Application Modules +* Vector Storage +* Vector Analysis +* LLM Summarization +* Semantic Understanding & Reasoning +* AWS Budget, Pinecone Budget, & LLM Budget +* Integration with bumblebee +* Visualization System ( Ingesting, Clustering, etc...) + +# Misc * Duplicate Article Check Module. * Github Issues & Github Actions.