This commit is contained in:
Charles-Gormley
2024-09-25 17:38:12 -04:00
parent 048770aa45
commit 8dc42556d7
9 changed files with 125 additions and 16 deletions

View File

@@ -3,4 +3,5 @@ python-dotenv==1.0.*
requests==2.32.*
constructs==10.2.69
# Optional, yet necessary for the Pinecone SDK functionality.
pinecone==5.1.*
pinecone
openai

View File

@@ -6,8 +6,7 @@ import os
from dotenv import load_dotenv
load_dotenv()
region = os.getenv("AWS_REGION")
index_name = os.getenv("PINECONE_DB_NAME")
index_name = "quickstart" # TODO: Remove this line after we are done testing with vector dbs.
if index_name not in pc.list_indexes().names():
@@ -32,6 +31,17 @@ index.upsert(
namespace="example-namespace1"
)
index.upsert(
vectors=[
{"id": "vec2124", "values": [1.0, -2.5]},
{"id": "vec21214", "values": [3.0, -2.0]},
{"id": "vec31251", "values": [0.5, -1.5]},
],
namespace="example-namespace2"
)
index.upsert(
vectors=[
{"id": "vec1", "values": [1.0, -2.5]},

View File

@@ -1,9 +0,0 @@
# Minimal Pinecone client bootstrap: load .env, read the API key, build the client.
from pinecone import Pinecone
import os
from dotenv import load_dotenv
load_dotenv()
# Set up Pinecone client
# NOTE(review): "PINCEONE_API_KEY" looks like a misspelling of PINECONE_API_KEY;
# if the environment only defines the correctly spelled name, api_key is None here — confirm.
api_key = os.getenv("PINCEONE_API_KEY")
pc = Pinecone(api_key=api_key)

View File

@@ -0,0 +1,83 @@
import os

from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone
load_dotenv()

# Set up Pinecone client
# The key used to be read from the misspelled "PINCEONE_API_KEY". The correctly
# spelled name is tried first; the misspelled one is kept as a fallback so any
# environment that already exports it keeps working.
api_key = os.getenv("PINECONE_API_KEY") or os.getenv("PINCEONE_API_KEY")

# The remaining settings are raw environment strings (None when unset);
# consumers convert them where a number is required.
shards = os.getenv("PINECONE_SHARDS")
embedding_model = os.getenv("VECTOR_EMBEDDING_MODEL")
embedding_dim = os.getenv("VECTOR_EMBEDDING_DIM")
vector_search_metric = os.getenv("VECTOR_SEARCH_METRIC")
index_name = os.getenv("PINECONE_DB_NAME")

# Shared API clients, created once at import time.
client = OpenAI()
pc = Pinecone(api_key=api_key)
def get_index():
    """Return a handle to the configured Pinecone index, creating it if absent.

    Uses the module-level client ``pc`` and the module-level settings
    ``index_name``, ``embedding_dim``, ``vector_search_metric`` and ``shards``.

    Returns:
        The object returned by ``pc.Index(index_name)``.

    Raises:
        ValueError: If the index must be created and ``embedding_dim`` is
            unset or is not an integer string.
    """
    if index_name not in pc.list_indexes().names():
        # The dimension comes from os.getenv() as a string (or None); the
        # index dimension has to be an integer.
        try:
            dimension = int(embedding_dim)
        except (TypeError, ValueError) as err:
            raise ValueError(
                f"VECTOR_EMBEDDING_DIM must be an integer, got {embedding_dim!r}"
            ) from err

        # NOTE(review): current Pinecone clients expect a `spec=` argument
        # (ServerlessSpec / PodSpec, with shards set on the PodSpec) rather
        # than a top-level `shards=` - confirm against the installed SDK.
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=vector_search_metric,
            shards=shards,
        )

    return pc.Index(index_name)
def vectorize(article: str) -> list[float]:
    """Embed ``article`` with the OpenAI embeddings API.

    The model is taken from the module-level ``embedding_model`` setting
    (``VECTOR_EMBEDDING_MODEL``); when that is unset, the previously
    hard-coded ``text-embedding-3-large`` is used.

    Args:
        article: Text to embed.

    Returns:
        The embedding of the first (only) item in the API response.
    """
    response = client.embeddings.create(
        input=article,
        model=embedding_model or "text-embedding-3-large",
    )
    return response.data[0].embedding
def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
    """Write ``vectors`` into ``namespace`` of ``index`` via ``index.upsert``.

    Args:
        index: Index handle to write to.
        vectors: Vector records forwarded unchanged to ``index.upsert``.
        namespace: Namespace forwarded unchanged to ``index.upsert``.
    """
    upsert_kwargs = {"vectors": vectors, "namespace": namespace}
    index.upsert(**upsert_kwargs)
def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None):
    """Run a similarity query against ``index``.

    Args:
        index: Index handle to query.
        namespace: Namespace to search in.
        vector: Query vector; its length must equal the module-level
            ``embedding_dim`` setting.
        top_k: Number of matches to request.
        filter_query: Optional metadata filter. When given (and non-empty),
            it is applied and metadata is included in the response.

    Returns:
        Whatever ``index.query`` returns.

    Raises:
        ValueError: If ``vector`` does not have ``embedding_dim`` entries.
    """
    if len(vector) != int(embedding_dim):
        raise ValueError("Length of vector does not match the embedding dimension")

    query_kwargs = {"namespace": namespace, "vector": vector, "top_k": top_k}
    if filter_query:
        # Index.query takes the metadata filter under the keyword `filter`;
        # the previous `filter_query=` keyword is not a query parameter.
        query_kwargs["filter"] = filter_query
        query_kwargs["include_metadata"] = True

    return index.query(**query_kwargs)
if __name__ == "__main__":
    # Manual smoke test: embed one short string when the module is run directly.
    # Create a large paragraph
    paragraph = '''This is a test '''

    vectorize("This is a test string")

View File

@@ -0,0 +1,6 @@
def summarize(text:str):
    """Stub for article summarization.

    Only builds the start of a prompt for now; ``text`` is not used yet and
    the function returns None.
    """
    prompt_prefix = "Summarize the follow"
    return None

View File

@@ -4,6 +4,9 @@ import os
import logging
from random import randint
# TODO: Move this article storage logic to a separate module inside of lambda.
from src.article_storage.initialize import get_index, upsert_vectors, query_vectors, vectorize
from utils import generate_key
logger = logging.getLogger()
@@ -19,12 +22,25 @@ storage_strategy = os.environ.get('STORAGE_STRATEGY')
def save_article(article:dict, strategy:str):
    """Persist ``article`` using the storage backend named by ``strategy``.

    Args:
        article: Article record to store.
        strategy: ``"s3"`` or ``"pinecone"``.

    Raises:
        ValueError: If ``strategy`` is not a supported backend name.
    """
    if strategy == "s3":
        s3_save_article(article)
    elif strategy == "pinecone":
        # The article must be forwarded; pinecone_save_article requires it.
        pinecone_save_article(article)
    else:
        raise ValueError(f"Invalid storage strategy: {strategy}")
def pinecone_save_article(article:dict):
    """Embed an article's content and upsert it into the Pinecone index.

    The vector id is the article id, the values are the embedding of
    ``article["content"]``, and the namespace is ``"<rss_id>-<article_id>"``.
    The caller's ``article`` dict is not modified.

    Args:
        article: Article record; must contain ``article_id``, ``rss_id`` and
            ``content``.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    logger.info("Saving article to Pinecone")
    index = get_index()

    article_id = article['article_id']
    rss_feed_id = article['rss_id']

    # A Pinecone vector dict may only carry id / values / sparse_values /
    # metadata, so the article's own fields travel as metadata instead of
    # being upserted as top-level keys. Only plain scalar values are kept.
    # NOTE(review): Pinecone caps metadata size per vector; very long
    # `content` may need to be truncated or stored elsewhere - confirm.
    metadata = {
        key: value
        for key, value in article.items()
        if key not in ("id", "values") and isinstance(value, (str, int, float, bool))
    }
    vector = {
        "id": article_id,
        "values": vectorize(article["content"]),
        "metadata": metadata,
    }

    namespace = f"{rss_feed_id}-{article_id}"
    upsert_vectors(index, [vector], namespace)

    # The index handle may not expose a `name` attribute; avoid failing the
    # save on a log line.
    index_label = getattr(index, "name", "<unknown>")
    logger.info(f"Saved article {article_id} to Pinecone index {index_label} with namespace {namespace}")
def dynamodb_save_article(article:dict):
    """Placeholder for a DynamoDB storage backend; currently does nothing."""
    return None

View File

@@ -16,7 +16,7 @@ echo "Python 3.12 found. Proceeding..."
echo "Section 2: Installing Dependencies"
# Install dependencies
python3.12 -m pip install --upgrade Pillow feedfinder2==0.0.4 python-dateutil newspaper3k==0.2.8 feedparser lxml[html5lib] lxml_html_clean lxml[html_clean] -t python/
python3.12 -m pip install --upgrade Pillow feedfinder2==0.0.4 python-dateutil newspaper3k==0.2.8 feedparser lxml[html5lib] lxml_html_clean lxml[html_clean] openai pinecone -t python/
echo "Dependencies installed successfully."
####### Section 3: Creating ZIP File ########

View File

@@ -51,4 +51,6 @@ STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future.
# Only fill these out if your storage strategy is pinecone [ Not currently supported. ]
PINECONE_API_KEY=***
PINECONE_DB_NAME=open-rss-articles
PINECONE_DB_NAME=open-rss-articles
OPENAI_API_KEY=sk**

View File

@@ -8,7 +8,7 @@
* Add in console setup python script for new project into launch.py
# Next Modules
* Vector Database Storage Solution w/ Pinecone.
* Vector Database Storage Solution w/ Pinecone.
* Gen AI Summarization Module.
* Duplicate Article Check Module.
* Github Issues & Github Actions.