This commit is contained in:
Charles E. Gormley
2024-09-28 13:09:35 -04:00
parent 8dc42556d7
commit 9549fcc749
8 changed files with 30 additions and 107 deletions

2
.gitignore vendored
View File

@@ -2,4 +2,4 @@ repo_structure.txt
.env
/layer/python*
*__pycache__*
rss_feeds.json
*feeds.json

View File

@@ -1,72 +0,0 @@
# Scratch script exercising a Pinecone index end to end: create the index if
# it is missing, upsert a few 2-dimensional vectors into two namespaces, print
# the index statistics, then run one similarity query per namespace.
# NOTE(review): this listing shows every line at column 0 (e.g. the body of the
# `if` below) — indentation appears to have been lost; confirm against the
# original file before running.
from pinecone import Pinecone, ServerlessSpec
# `pc` comes from the project's `initialize` module — presumably an already
# configured Pinecone client; verify there.
from initialize import pc
import os
from dotenv import load_dotenv
# Load settings from a .env file into the process environment.
load_dotenv()
index_name = "quickstart" # TODO: Remove this line after we are done testing with vector dbs.
# Create the index only when no index with this name exists yet.
if index_name not in pc.list_indexes().names():
pc.create_index(
name=index_name,
# Must match the length of the vectors upserted and queried below (2 values each).
dimension=2,
metric="cosine",
spec=ServerlessSpec(
cloud='aws',
region='us-east-1'
)
)
# Handle to the index used by every upsert/query call below.
index = pc.Index(index_name)
# First batch: three vectors in "example-namespace1".
index.upsert(
vectors=[
{"id": "vec1", "values": [1.0, 1.5]},
{"id": "vec2", "values": [2.0, 1.0]},
{"id": "vec3", "values": [0.1, 3.0]},
],
namespace="example-namespace1"
)
# Second batch: three vectors in "example-namespace2".
index.upsert(
vectors=[
{"id": "vec2124", "values": [1.0, -2.5]},
{"id": "vec21214", "values": [3.0, -2.0]},
{"id": "vec31251", "values": [0.5, -1.5]},
],
namespace="example-namespace2"
)
# Third batch: same namespace and same values as the second batch, but stored
# under the ids vec1..vec3 (the ids also used in "example-namespace1").
index.upsert(
vectors=[
{"id": "vec1", "values": [1.0, -2.5]},
{"id": "vec2", "values": [3.0, -2.0]},
{"id": "vec3", "values": [0.5, -1.5]},
],
namespace="example-namespace2"
)
# Print the index statistics after the upserts.
print(index.describe_index_stats())
# Query namespace 1 with the same values that were stored as "vec1" there;
# request the 3 best matches and include their stored values in the response.
query_results1 = index.query(
namespace="example-namespace1",
vector=[1.0, 1.5],
top_k=3,
include_values=True
)
print(query_results1)
# Same query shape against namespace 2, using values that were upserted there
# (stored under both "vec2124" and "vec1").
query_results2 = index.query(
namespace="example-namespace2",
vector=[1.0,-2.5],
top_k=3,
include_values=True
)
print(query_results2)

View File

@@ -1,4 +1,4 @@
# from pinecone import Pinecone
from pinecone import Pinecone
import os
@@ -38,10 +38,6 @@ def vectorize(article:str) -> list[float]:
)
return response.data[0].embedding
def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
@@ -56,7 +52,6 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k
if len(vector) != int(embedding_dim):
raise ValueError("Length of vector does not match the embedding dimension")
if filter_query:
query = index.query(
namespace=namespace,
@@ -64,8 +59,7 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k
filter_query=filter_query,
top_k=top_k,
include_metadata=True
)
)
else:
query = index.query(
@@ -79,5 +73,5 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k
if __name__ == "__main__":
# Create a large paragraph
paragraph = '''This is a test '''
paragraph = '''This is a test.'''
vectorize("This is a test string")

View File

@@ -1,6 +1,6 @@
def summarize(text:str):
sub_prompt = "Summarize the follow"
sub_prompt = "Summarize the following passage"

View File

@@ -5,9 +5,7 @@ import logging
from random import randint
# TODO: Move this article storage logic to a separate module inside of lambda.
from src.article_storage.initialize import get_index, upsert_vectors, query_vectors, vectorize
from utils import generate_key
from src.analytics.embeddings.pinecone import get_index, upsert_vectors, vectorize
logger = logging.getLogger()
@@ -23,7 +21,7 @@ def save_article(article:dict, strategy:str):
if strategy == "s3":
s3_save_article(article)
elif strategy == "pinecone":
pinecone_save_article()
pinecone_save_article(article)
else:
raise ValueError(f"Invalid storage strategy: {strategy}")
@@ -31,16 +29,16 @@ def save_article(article:dict, strategy:str):
def pinecone_save_article(article:dict):
logger.info("Saving article to Pinecone")
index = get_index()
article_id = article['article_id']
rss_feed_id = article['rss_id']
# Expected Keys from Pinecone *MUST* include 'id' and 'values'
article["id"] = article["article_id"]
article["values"] = vectorize(article["content"])
namespace = f"{rss_feed_id}-{article_id}"
namespace = f"IngestRSS-Articles"
logger.info("Upserting article to Pinecone")
upsert_vectors(index, [article], namespace)
logger.info(f"Saved article {article_id} to Pinecone index {index.name} with namespace {namespace}")
logger.info(f"Successfully upserted article w/ article-id: {article["article_id"]} to Pinecone index {index.name} with namespace {namespace}")
def dynamodb_save_article(article:dict):
pass

View File

@@ -17,7 +17,7 @@ DYNAMODB_TABLE_NAME=rss-feeds-table
SQS_QUEUE_NAME=rss-feed-queue
LAMBDA_LAYER_VERSION=3 # This is fixed.
LAMBDA_LAYER_VERSION=4 # This is fixed.
LAMBDA_LAYER_NAME=ingest-rss-lambda-layer-${AWS_REGION}
LAMBDA_LAYER_ARN=arn:aws:lambda:${AWS_REGION}:966265353179:layer:${LAMBDA_LAYER_NAME}:${LAMBDA_LAYER_VERSION}
@@ -38,15 +38,11 @@ QUEUE_FILLER_LAMBDA_S3_KEY=RSSQueueFiller.zip
# Logging Configuration
LOG_LEVEL=INFO
Slide 1Slide 2
# Other Application Settings
APP_NAME=RSS Feed Processor
VERSION=1.0.0
TEST=1
STORAGE_STRATEGY=s3 # 's3' or 'pinecone'; others will be supported in the future.
# Only fill this out if your storage strategy is pinecone [ Not currently supported. ]

27
todo.md
View File

@@ -1,15 +1,22 @@
# TODO Before Launch 🚀
* Fresh AWS Account Test.
# Testing🧪
* Test from a third-party AWS account. [ Today ]
* Test Large Amounts of Feeds ( Decrease the cadence of ingesting. ) [ Today ]
* Test out Vector Databases at Small Scale
* Test out Vector Databases at Scale.
* Test out LLM Summarization At Small Scale
* Test out LLM Summarization At Scale
# Misc TODO:
* More RSS Feeds for my own personal account.
* Database versioning Parameter.
* More RSS Feed Module - Easy
* Add a console setup Python script for new projects to launch.py
# Next Modules
* Vector Database Storage Solution w/ Pinecone.
* Gen AI Summarization Module.
# Application Modules
* Vector Storage
* Vector Analysis
* LLM Summarization
* Semantic Understanding & Reasoning
* AWS Budget, Pinecone Budget, & LLM Budget
* Integration with bumblebee
* Visualization System ( Ingesting, Clustering, etc...)
# Misc
* Duplicate Article Check Module.
* GitHub Issues & GitHub Actions.