mirror of
https://github.com/aljazceru/IngestRSS.git
synced 2026-02-12 09:44:29 +01:00
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -2,4 +2,4 @@ repo_structure.txt
|
||||
.env
|
||||
/layer/python*
|
||||
*__pycache__*
|
||||
rss_feeds.json
|
||||
*feeds.json
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
from pinecone import Pinecone, ServerlessSpec
|
||||
|
||||
from initialize import pc
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
|
||||
index_name = "quickstart" # TODO: Remove this line after we are done testing with vector dbs.
|
||||
|
||||
if index_name not in pc.list_indexes().names():
|
||||
pc.create_index(
|
||||
name=index_name,
|
||||
dimension=2,
|
||||
metric="cosine",
|
||||
spec=ServerlessSpec(
|
||||
cloud='aws',
|
||||
region='us-east-1'
|
||||
)
|
||||
)
|
||||
|
||||
index = pc.Index(index_name)
|
||||
|
||||
index.upsert(
|
||||
vectors=[
|
||||
{"id": "vec1", "values": [1.0, 1.5]},
|
||||
{"id": "vec2", "values": [2.0, 1.0]},
|
||||
{"id": "vec3", "values": [0.1, 3.0]},
|
||||
],
|
||||
namespace="example-namespace1"
|
||||
)
|
||||
|
||||
index.upsert(
|
||||
vectors=[
|
||||
{"id": "vec2124", "values": [1.0, -2.5]},
|
||||
{"id": "vec21214", "values": [3.0, -2.0]},
|
||||
{"id": "vec31251", "values": [0.5, -1.5]},
|
||||
],
|
||||
namespace="example-namespace2"
|
||||
)
|
||||
|
||||
|
||||
|
||||
index.upsert(
|
||||
vectors=[
|
||||
{"id": "vec1", "values": [1.0, -2.5]},
|
||||
{"id": "vec2", "values": [3.0, -2.0]},
|
||||
{"id": "vec3", "values": [0.5, -1.5]},
|
||||
],
|
||||
namespace="example-namespace2"
|
||||
)
|
||||
|
||||
print(index.describe_index_stats())
|
||||
|
||||
query_results1 = index.query(
|
||||
namespace="example-namespace1",
|
||||
vector=[1.0, 1.5],
|
||||
top_k=3,
|
||||
include_values=True
|
||||
)
|
||||
|
||||
print(query_results1)
|
||||
|
||||
query_results2 = index.query(
|
||||
namespace="example-namespace2",
|
||||
vector=[1.0,-2.5],
|
||||
top_k=3,
|
||||
include_values=True
|
||||
)
|
||||
|
||||
print(query_results2)
|
||||
@@ -1,4 +1,4 @@
|
||||
# from pinecone import Pinecone
|
||||
from pinecone import Pinecone
|
||||
|
||||
|
||||
import os
|
||||
@@ -38,10 +38,6 @@ def vectorize(article:str) -> list[float]:
|
||||
)
|
||||
|
||||
return response.data[0].embedding
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
|
||||
@@ -56,7 +52,6 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k
|
||||
if len(vector) != int(embedding_dim):
|
||||
raise ValueError("Length of vector does not match the embedding dimension")
|
||||
|
||||
|
||||
if filter_query:
|
||||
query = index.query(
|
||||
namespace=namespace,
|
||||
@@ -64,8 +59,7 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k
|
||||
filter_query=filter_query,
|
||||
top_k=top_k,
|
||||
include_metadata=True
|
||||
)
|
||||
|
||||
)
|
||||
|
||||
else:
|
||||
query = index.query(
|
||||
@@ -79,5 +73,5 @@ def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Create a large paragraph
|
||||
paragraph = '''This is a test '''
|
||||
paragraph = '''This is a test.'''
|
||||
vectorize("This is a test string")
|
||||
@@ -1,6 +1,6 @@
|
||||
|
||||
|
||||
def summarize(text:str):
|
||||
sub_prompt = "Summarize the follow"
|
||||
sub_prompt = "Summarize the following passage"
|
||||
|
||||
|
||||
@@ -5,9 +5,7 @@ import logging
|
||||
from random import randint
|
||||
|
||||
# TODO: Move this article storage logic to a separate module inside of lambda.
|
||||
from src.article_storage.initialize import get_index, upsert_vectors, query_vectors, vectorize
|
||||
|
||||
from utils import generate_key
|
||||
from src.analytics.embeddings.pinecone import get_index, upsert_vectors, vectorize
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
@@ -23,7 +21,7 @@ def save_article(article:dict, strategy:str):
|
||||
if strategy == "s3":
|
||||
s3_save_article(article)
|
||||
elif strategy == "pinecone":
|
||||
pinecone_save_article()
|
||||
pinecone_save_article(article)
|
||||
else:
|
||||
raise ValueError(f"Invalid storage strategy: {strategy}")
|
||||
|
||||
@@ -31,16 +29,16 @@ def save_article(article:dict, strategy:str):
|
||||
def pinecone_save_article(article:dict):
|
||||
logger.info("Saving article to Pinecone")
|
||||
index = get_index()
|
||||
article_id = article['article_id']
|
||||
rss_feed_id = article['rss_id']
|
||||
|
||||
# Expected Keys from Pinecone *MUST* include 'id' and 'values'
|
||||
article["id"] = article["article_id"]
|
||||
article["values"] = vectorize(article["content"])
|
||||
|
||||
namespace = f"{rss_feed_id}-{article_id}"
|
||||
namespace = f"IngestRSS-Articles"
|
||||
|
||||
logger.info("Upserting article to Pinecone")
|
||||
upsert_vectors(index, [article], namespace)
|
||||
|
||||
logger.info(f"Saved article {article_id} to Pinecone index {index.name} with namespace {namespace}")
|
||||
logger.info(f"Successfully upserted article w/ article-id: {article["article_id"]} to Pinecone index {index.name} with namespace {namespace}")
|
||||
|
||||
def dynamodb_save_article(article:dict):
|
||||
pass
|
||||
|
||||
@@ -17,7 +17,7 @@ DYNAMODB_TABLE_NAME=rss-feeds-table
|
||||
SQS_QUEUE_NAME=rss-feed-queue
|
||||
|
||||
|
||||
LAMBDA_LAYER_VERSION=3 # This is fixed.
|
||||
LAMBDA_LAYER_VERSION=4 # This is fixed.
|
||||
LAMBDA_LAYER_NAME=ingest-rss-lambda-layer-${AWS_REGION}
|
||||
LAMBDA_LAYER_ARN=arn:aws:lambda:${AWS_REGION}:966265353179:layer:${LAMBDA_LAYER_NAME}:${LAMBDA_LAYER_VERSION}
|
||||
|
||||
@@ -38,15 +38,11 @@ QUEUE_FILLER_LAMBDA_S3_KEY=RSSQueueFiller.zip
|
||||
# Logging Configuration
|
||||
LOG_LEVEL=INFO
|
||||
|
||||
Slide 1Slide 2
|
||||
|
||||
|
||||
# Other Application Settings
|
||||
APP_NAME=RSS Feed Processor
|
||||
VERSION=1.0.0
|
||||
|
||||
|
||||
TEST=1
|
||||
STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future.
|
||||
|
||||
# Only need to fill this out if your storage strategy is pinecone [ Not currently supported. ]
|
||||
|
||||
27
todo.md
27
todo.md
@@ -1,15 +1,22 @@
|
||||
# TODO Before Launch 🚀
|
||||
* Fresh AWS Account Test.
|
||||
# Testing🧪
|
||||
* Testing from 3rd party aws account. [ Today ]
|
||||
* Test Large Amounts of Feeds ( Decrease the cadence of ingesting. ) [ Today ]
|
||||
* Test out Vector Databases at Small Scale
|
||||
* Test out Vector Databases at Scale.
|
||||
* Test out LLM Summarization At Small Scale
|
||||
* Test out LLM Summarization At Scale
|
||||
|
||||
# Misc TODO:
|
||||
* More RSS Feeds for my own personal account.
|
||||
* Database versioning Parameter.
|
||||
* More RSS Feed Module - Easy
|
||||
* Add in console setup python script for new project into launch.py
|
||||
|
||||
# Next Modules
|
||||
* Vector Database Storage Solution w/ Pinecone.
|
||||
* Gen AI Summarization Module.
|
||||
# Application Modules
|
||||
* Vector Storage
|
||||
* Vector Analysis
|
||||
* LLM Summarization
|
||||
* Semantic Understanding & Reasoning
|
||||
* AWS Budget, Pinecone Budget, & LLM Budget
|
||||
* Integration with bumblebee
|
||||
* Visualization System ( Ingesting, Clustering, etc...)
|
||||
|
||||
# Misc
|
||||
* Duplicate Article Check Module.
|
||||
* Github Issues & Github Actions.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user