Charles-Gormley
2024-11-30 16:14:33 -05:00
parent 111e985afc
commit 72e75d4d7d
10 changed files with 123 additions and 11125 deletions

delete.py (new file)

@@ -0,0 +1,33 @@
import requests
import openai

def mock_bank_api_call(data):
    # Mock response from the bank API
    bank_response = {
        "status": "success",
        "account_balance": 1000,
        "currency": "USD"
    }
    return bank_response

def process_data_with_openai(data):
    # Call the mock bank API
    bank_data = mock_bank_api_call(data)

    # Prepare the prompt for OpenAI API
    prompt = f"Bank API returned the following data: {bank_data}. Process this data."

    # Call the OpenAI API
    openai.api_key = 'your-openai-api-key'
    response = openai.Completion.create(
        engine="davinci",
        prompt=prompt,
        max_tokens=50
    )
    return response.choices[0].text.strip()

# Example usage
data = {"account_id": "12345"}
result = process_data_with_openai(data)
print(result)
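The file above targets the legacy (pre-1.0) openai SDK. A minimal sketch of the same flow against the 1.x client interface, reusing mock_bank_api_call from above; the model name is illustrative and the key is assumed to come from the OPENAI_API_KEY environment variable:

import os
from openai import OpenAI

def process_data_with_openai_v1(data: dict) -> str:
    # Same mock bank call as above; only the OpenAI call changes.
    bank_data = mock_bank_api_call(data)
    prompt = f"Bank API returned the following data: {bank_data}. Process this data."

    # The 1.x SDK reads OPENAI_API_KEY from the environment by default.
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # illustrative model choice, not from the commit
        messages=[{"role": "user", "content": prompt}],
        max_tokens=50,
    )
    return response.choices[0].message.content.strip()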


@@ -365,6 +365,27 @@
"df.sample(20)[\"content\"].values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def pinecone_save_article(article:dict):\n",
"    logger.info(\"Saving article to Pinecone\")\n",
"    index = get_index()\n",
"\n",
"    # Expected Keys from Pinecone *MUST* include 'id' and 'values'\n",
"    article[\"id\"] = article[\"article_id\"]\n",
"    article[\"values\"] = vectorize(article[\"content\"])\n",
"    \n",
"    namespace = \"IngestRSS-Articles\"\n",
"    \n",
"    logger.info(\"Upserting article to Pinecone\")\n",
"    upsert_vectors(index, [article], namespace)\n",
"    logger.info(f\"Successfully upserted article w/ article-id: {article['article_id']} to Pinecone index {index.name} with namespace {namespace}\")"
]
},
{
"cell_type": "code",
"execution_count": null,

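A quick way to exercise the notebook cell above, assuming get_index, vectorize, and upsert_vectors are already imported in the notebook; the sample article fields are placeholders, and the fetch call assumes the Pinecone client's Index.fetch helper:

sample_article = {
    "article_id": "demo-001",  # placeholder id
    "content": "Example article body used only to test the upsert path.",
    "title": "Demo article",
}

pinecone_save_article(sample_article)

# Optional sanity check that the vector landed in the namespace.
index = get_index()
print(index.fetch(ids=["demo-001"], namespace="IngestRSS-Articles"))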
File diff suppressed because it is too large.


@@ -82,7 +82,7 @@ def get_or_create_kms_key():
for key in response['Keys']:
try:
tags = kms_client.list_resource_tags(KeyId=key['KeyId'])['Tags']
if any(tag['TagKey'] == tag_key and tag['TagValue'] == tag_value for tag in tags) and any(tag['TagKey'] == 'region' and tag['TagValue'] == os.getenv("AWS_REGION") for tag in tags): # TODO: This is inefficient and should be fixed and more readable.
if any(tag['TagKey'] == tag_key and tag['TagValue'] == tag_value for tag in tags) and any(tag['TagKey'] == 'region' and tag['TagValue'] == os.getenv("AWS_REGION") for tag in tags): # FIXME: This is inefficient and should be fixed and more readable.
print(f"Found existing KMS key with ID: {key['KeyId']}")
return key['KeyId']
except ClientError:
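One way the flagged condition could be made more readable and efficient is to collapse the tag list into a dict before testing; a sketch reusing kms_client, tag_key, tag_value, and key from the function above and the TagKey/TagValue response shape it already relies on:

tags = kms_client.list_resource_tags(KeyId=key['KeyId'])['Tags']
tag_map = {tag['TagKey']: tag['TagValue'] for tag in tags}

# Two dict lookups instead of two any() scans over the tag list.
if tag_map.get(tag_key) == tag_value and tag_map.get('region') == os.getenv('AWS_REGION'):
    print(f"Found existing KMS key with ID: {key['KeyId']}")
    return key['KeyId']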


@@ -128,8 +128,6 @@ def create_function(lambda_client, function_name, runtime, role, handler, zip_fi
raise
def get_pillow_layer_arn():
url = f"https://api.klayers.cloud/api/v2/p{os.getenv('PYTHON_VERSION')}/layers/latest/{os.getenv('AWS_REGION')}/json"
try:
response = requests.get(url)


@@ -29,20 +29,19 @@ def get_index():
def vectorize(article:str) -> list[float]:
response = client.embeddings.create(
input=article,
model="text-embedding-3-large"
model=os.getenv('OPENAI_EMBEDDING_MODEL')
)
return response.data[0].embedding
def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str): # [ ] Check if the data is being upserted.
index.upsert(
vectors=vectors,
namespace=namespace
)
def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None):
def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None): # [ ]: Make sure this is working.
if len(vector) != int(embedding_dim):
raise ValueError("Length of vector does not match the embedding dimension")
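One hedged way to close the "# [ ] Check if the data is being upserted." item is to read the ids back right after the upsert; this assumes the Pinecone client's Index.fetch response exposes a vectors mapping keyed by id (true for recent client versions):

def verify_upsert(index: Pinecone.Index, ids: list[str], namespace: str) -> bool:
    # Fetch the ids that were just upserted and confirm they all came back.
    fetched = index.fetch(ids=ids, namespace=namespace)
    missing = [i for i in ids if i not in fetched.vectors]
    if missing:
        print(f"Vectors missing after upsert: {missing}")
    return not missing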


@@ -4,8 +4,7 @@ import os
import logging
from random import randint
from datetime import datetime
# TODO: Move this article storage logic to a separate module inside of lambda.
# TODO: Get better at handling loading of local modules inside of the Lambda.
from analytics.embeddings.vector_db import get_index, upsert_vectors, vectorize
logger = logging.getLogger()
@@ -35,7 +34,8 @@ def pinecone_save_article(article:dict):
article["id"] = article["article_id"]
article["values"] = vectorize(article["content"])
namespace = f"IngestRSS-Articles"
namespace = os.getenv('PINECONE_NAMESPACE')
logger.info("Upserting article to Pinecone")
upsert_vectors(index, [article], namespace)
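Since the namespace is now read from the environment, a small hedged guard (not part of the commit) avoids upserting into a None namespace when PINECONE_NAMESPACE is unset; the fallback is simply the old hard-coded value:

namespace = os.getenv('PINECONE_NAMESPACE')
if not namespace:
    namespace = "IngestRSS-Articles"  # fall back to the previous hard-coded value
    logger.warning("PINECONE_NAMESPACE not set; defaulting to %s", namespace)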


@@ -10,17 +10,67 @@ def update_env_vars(function_name):
lambda_client = boto3.client('lambda')
env_vars = {
'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
# AWS Configuration
'AWS_REGION': os.environ.get('AWS_REGION'),
'AWS_ACCOUNT_ID': os.environ.get('AWS_ACCOUNT_ID'),
'AWS_ACCESS_KEY_ID': os.environ.get('AWS_ACCESS_KEY_ID'),
'AWS_SECRET_ACCESS_KEY': os.environ.get('AWS_SECRET_ACCESS_KEY'),
'AWS_DEFAULT_REGION': os.environ.get('AWS_DEFAULT_REGION'),
# Lambda Configuration
'LAMBDA_FUNCTION_NAME': os.environ.get('LAMBDA_FUNCTION_NAME'),
'STACK_BASE': os.environ.get('STACK_BASE'),
'LAMBDA_EXECUTION_ROLE_NAME': os.environ.get('LAMBDA_EXECUTION_ROLE_NAME'),
'LAMBDA_ROLE_ARN': os.environ.get('LAMBDA_ROLE_ARN'),
'LAMBDA_LAYER_VERSION': os.environ.get('LAMBDA_LAYER_VERSION'),
'LAMBDA_LAYER_NAME': os.environ.get('LAMBDA_LAYER_NAME'),
'LAMBDA_LAYER_ARN': os.environ.get('LAMBDA_LAYER_ARN'),
'LAMBDA_RUNTIME': os.environ.get('LAMBDA_RUNTIME'),
'LAMBDA_TIMEOUT': os.environ.get('LAMBDA_TIMEOUT', '300'), # Reasonable default timeout
'LAMBDA_MEMORY': os.environ.get('LAMBDA_MEMORY', '512'), # Reasonable default memory
# S3 Configuration
'S3_BUCKET_NAME': os.environ.get('S3_BUCKET_NAME'),
'S3_LAMBDA_ZIPPED_BUCKET_NAME': os.environ.get('S3_LAMBDA_ZIPPED_BUCKET_NAME'),
'S3_LAYER_BUCKET_NAME': os.environ.get('S3_LAYER_BUCKET_NAME'),
'S3_LAYER_KEY_NAME': os.environ.get('S3_LAYER_KEY_NAME'),
# DynamoDB Configuration
'DYNAMODB_TABLE_NAME': os.environ.get('DYNAMODB_TABLE_NAME'),
'LOG_LEVEL': os.environ.get('LOG_LEVEL', 'INFO'),
'STORAGE_STRATEGY': os.environ.get('STORAGE_STRATEGY'),
'DYNAMODB_TABLE_ARN': os.environ.get('DYNAMODB_TABLE_ARN'),
# SQS Configuration
'SQS_QUEUE_NAME': os.environ.get('SQS_QUEUE_NAME'),
'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
'SQS_QUEUE_ARN': os.environ.get('SQS_QUEUE_ARN'),
# Queue Filler Lambda Configuration
'QUEUE_FILLER_LAMBDA_NAME': os.environ.get('QUEUE_FILLER_LAMBDA_NAME'),
'QUEUE_FILLER_LAMBDA_S3_KEY': os.environ.get('QUEUE_FILLER_LAMBDA_S3_KEY'),
# Python Configuration
'PYTHON_VERSION': os.environ.get('PYTHON_VERSION', '3.12'), # Default Python version
# Application Settings
'APP_NAME': os.environ.get('APP_NAME', 'RSS Feed Processor'), # Default app name is fine
'VERSION': os.environ.get('VERSION', '1.0.0'), # Default version is fine
'LOG_LEVEL': os.environ.get('LOG_LEVEL', 'INFO'), # Default to INFO logging
# Storage Configuration
'STORAGE_STRATEGY': os.environ.get('STORAGE_STRATEGY', 's3'), # Default to s3 storage
# Pinecone Configuration (only used if STORAGE_STRATEGY is 'pinecone')
'PINECONE_API_KEY': os.environ.get('PINECONE_API_KEY'),
'PINECONE_DB_NAME': os.environ.get('PINECONE_DB_NAME'),
'PINECONE_SHARDS': os.environ.get('PINECONE_SHARDS'),
'PINECONE_NAMESPACE': os.environ.get('PINECONE_NAMESPACE'),
# Vector Configuration
'VECTOR_EMBEDDING_MODEL': os.environ.get('VECTOR_EMBEDDING_MODEL'),
'VECTOR_EMBEDDING_DIM': os.environ.get('VECTOR_EMBEDDING_DIM'),
'VECTOR_SEARCH_METRIC': os.environ.get('VECTOR_SEARCH_METRIC'),
'PINECONE_DB_NAME': os.environ.get('PINECONE_DB_NAME'),
# OpenAI Configuration
'OPENAI_API_KEY': os.environ.get('OPENAI_API_KEY')
}
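A hedged sketch of pushing a dict like this to the function with boto3 (the commit does not show this step), reusing env_vars, lambda_client, and function_name from above. None values must be dropped because Lambda environment values have to be strings, and reserved runtime names such as AWS_REGION and the credential variables cannot be set on a function, so they are filtered out here:

# Lambda environment values must be strings, and reserved runtime keys are rejected.
RESERVED_KEYS = {'AWS_REGION', 'AWS_DEFAULT_REGION', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'}
clean_vars = {k: v for k, v in env_vars.items() if v is not None and k not in RESERVED_KEYS}

lambda_client.update_function_configuration(
    FunctionName=function_name,
    Environment={'Variables': clean_vars}
)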


@@ -50,11 +50,12 @@ STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future.
# You only need to fill this out if your storage strategy is pinecone. [ Not currently supported. ]
PINECONE_API_KEY=***
PINECONE_DB_NAME=open-rss-articles
PINECONE_API_KEY=***
PINECONE_SHARDS=***
PINECONE_NAMESPACE=IngestRSS-Articles
VECTOR_EMBEDDING_MODEL=***
VECTOR_EMBEDDING_DIM=***
VECTOR_SEARCH_METRIC=***
PINECONE_DB_NAME=***
OPENAI_API_KEY=sk**
OPENAI_EMBEDDING_MODEL=text-embedding-3-large
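A minimal sketch of consuming these values in a local script, assuming python-dotenv is available; the variable names match the template above:

import os
from dotenv import load_dotenv

load_dotenv()  # reads the .env file from the current working directory

storage_strategy = os.getenv("STORAGE_STRATEGY", "s3")
if storage_strategy == "pinecone":
    # These are only required when the pinecone strategy is selected.
    pinecone_key = os.environ["PINECONE_API_KEY"]
    embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-large")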

todo.md

@@ -1,18 +1,9 @@
# Before Public Launch
# After Public Launch
* Monthly Kaggle Dataset Publishing.
* Test out Vector Databases at Small Scale.
* Test out Vector Databases at Scale.
* Check in to ensure text cleaning processes are working. [ ]
* Fix up ReadMe
* Publish RoadMap.
* Tips on where to gather RSS Feeds.
* Public Launch Posts
* Reddit
* Twitter
* Kaggle
* Protocol for annotating data.
* Promotional Material
* Stuff that is not news