Mirror of https://github.com/aljazceru/IngestRSS.git (synced 2025-12-18 06:24:21 +01:00)

Commit: Update
delete.py (Normal file, 33 lines added)
@@ -0,0 +1,33 @@
+import requests
+import openai
+
+def mock_bank_api_call(data):
+    # Mock response from the bank API
+    bank_response = {
+        "status": "success",
+        "account_balance": 1000,
+        "currency": "USD"
+    }
+    return bank_response
+
+def process_data_with_openai(data):
+    # Call the mock bank API
+    bank_data = mock_bank_api_call(data)
+
+    # Prepare the prompt for OpenAI API
+    prompt = f"Bank API returned the following data: {bank_data}. Process this data."
+
+    # Call the OpenAI API
+    openai.api_key = 'your-openai-api-key'
+    response = openai.Completion.create(
+        engine="davinci",
+        prompt=prompt,
+        max_tokens=50
+    )
+
+    return response.choices[0].text.strip()
+
+# Example usage
+data = {"account_id": "12345"}
+result = process_data_with_openai(data)
+print(result)
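The file above targets the pre-1.0 `openai` SDK (`openai.Completion.create` with the `davinci` engine), which current releases of the library no longer expose. A rough equivalent against the 1.x client, offered as a sketch rather than part of the commit (the model name and reading the key from `OPENAI_API_KEY` are assumptions):

```python
import os
from openai import OpenAI  # 1.x-style client


def mock_bank_api_call(data: dict) -> dict:
    # Same hard-coded response as the mock above
    return {"status": "success", "account_balance": 1000, "currency": "USD"}


def process_data_with_openai(data: dict) -> str:
    bank_data = mock_bank_api_call(data)
    prompt = f"Bank API returned the following data: {bank_data}. Process this data."

    # The 1.x SDK can also pick OPENAI_API_KEY up from the environment on its own.
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # assumed model; any chat-capable model works here
        messages=[{"role": "user", "content": prompt}],
        max_tokens=50,
    )
    return response.choices[0].message.content.strip()


print(process_data_with_openai({"account_id": "12345"}))
```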
@@ -365,6 +365,27 @@
 "df.sample(20)[\"content\"].values"
 ]
 },
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "def pinecone_save_article(article:dict):\n",
+  "    logger.info(\"Saving article to Pinecone\")\n",
+  "    index = get_index()\n",
+  "\n",
+  "    # Expected Keys from Pinecone *MUST* include 'id' and 'values'\n",
+  "    article[\"id\"] = article[\"article_id\"]\n",
+  "    article[\"values\"] = vectorize(article[\"content\"])\n",
+  "    \n",
+  "    namespace = f\"IngestRSS-Articles\"\n",
+  "    \n",
+  "    logger.info(\"Upserting article to Pinecone\")\n",
+  "    upsert_vectors(index, [article], namespace)\n",
+  "    logger.info(f\"Successfully upserted article w/ article-id: {article[\"article_id\"]} to Pinecone index {index.name} with namespace {namespace}\")"
+ ]
+},
 {
  "cell_type": "code",
  "execution_count": null,
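The new notebook cell mirrors `pinecone_save_article` from the Lambda code: it copies `article_id` into `id` and attaches the embedding as `values` before upserting. A minimal sketch of the record shape Pinecone's upsert expects, assuming the v3+ `pinecone` client and a 3072-dimensional index (the placeholder values and the metadata field are illustrative, not from the commit):

```python
import os
from pinecone import Pinecone  # assumes the v3+ pinecone client

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index(os.environ["PINECONE_DB_NAME"])

record = {
    "id": "article-12345",             # mapped from article["article_id"]
    "values": [0.0] * 3072,            # placeholder; real code uses vectorize(article["content"])
    "metadata": {"title": "Example"},  # optional extra fields belong under metadata
}
index.upsert(vectors=[record], namespace="IngestRSS-Articles")
```

The notebook cell passes the whole article dict through as the record; the documented upsert shape keeps anything beyond `id` and `values` under `metadata`.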
rss_feeds.json (11095 changes)
File diff suppressed because it is too large
@@ -82,7 +82,7 @@ def get_or_create_kms_key():
     for key in response['Keys']:
         try:
             tags = kms_client.list_resource_tags(KeyId=key['KeyId'])['Tags']
-            if any(tag['TagKey'] == tag_key and tag['TagValue'] == tag_value for tag in tags) and any(tag['TagKey'] == 'region' and tag['TagValue'] == os.getenv("AWS_REGION") for tag in tags): # TODO: This is inefficient and should be fixed and more readable.
+            if any(tag['TagKey'] == tag_key and tag['TagValue'] == tag_value for tag in tags) and any(tag['TagKey'] == 'region' and tag['TagValue'] == os.getenv("AWS_REGION") for tag in tags): # FIXME: This is inefficient and should be fixed and more readable.
                 print(f"Found existing KMS key with ID: {key['KeyId']}")
                 return key['KeyId']
         except ClientError:
@@ -128,8 +128,6 @@ def create_function(lambda_client, function_name, runtime, role, handler, zip_fi
         raise

 def get_pillow_layer_arn():
-
-
     url = f"https://api.klayers.cloud/api/v2/p{os.getenv('PYTHON_VERSION')}/layers/latest/{os.getenv('AWS_REGION')}/json"
     try:
         response = requests.get(url)
@@ -29,20 +29,19 @@ def get_index():
 def vectorize(article:str) -> list[float]:
     response = client.embeddings.create(
         input=article,
-        model="text-embedding-3-large"
+        model=os.getenv('OPENAI_EMBEDDING_MODEL')
     )

     return response.data[0].embedding


-def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
+def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str): # [ ] Check if the data is being upserted.
     index.upsert(
         vectors=vectors,
         namespace=namespace
     )

-def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None):
+def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None): # [ ]: Make sure this is working.

     if len(vector) != int(embedding_dim):
         raise ValueError("Length of vector does not match the embedding dimension")
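This hunk drops the hard-coded embedding model in favour of an `OPENAI_EMBEDDING_MODEL` environment variable. A minimal sketch of how `vectorize` and the dimension check in `query_vectors` line up under that convention (the fallback defaults and the client construction here are assumptions, not shown in the diff):

```python
import os
from openai import OpenAI  # assumes the 1.x openai SDK behind client.embeddings.create

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Keep the model and the index dimension in sync through the environment.
embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-large")  # assumed fallback
embedding_dim = int(os.getenv("VECTOR_EMBEDDING_DIM", "3072"))                   # assumed fallback


def vectorize(article: str) -> list[float]:
    response = client.embeddings.create(input=article, model=embedding_model)
    vector = response.data[0].embedding
    if len(vector) != embedding_dim:
        raise ValueError("Length of vector does not match the embedding dimension")
    return vector
```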
@@ -4,8 +4,7 @@ import os
 import logging
 from random import randint
 from datetime import datetime
-# TODO: Move this article storage logic to a separate module inside of lambda.
-# TODO: Get better at handling loading local moduels insdie of the lambdda.
 from analytics.embeddings.vector_db import get_index, upsert_vectors, vectorize

 logger = logging.getLogger()

@@ -35,7 +34,8 @@ def pinecone_save_article(article:dict):

     article["id"] = article["article_id"]
     article["values"] = vectorize(article["content"])

-    namespace = f"IngestRSS-Articles"
+    namespace = os.getenv('PINECONE_NAMESPACE')

     logger.info("Upserting article to Pinecone")
     upsert_vectors(index, [article], namespace)
@@ -10,17 +10,67 @@ def update_env_vars(function_name):
     lambda_client = boto3.client('lambda')

     env_vars = {
-        'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
+        # AWS Configuration
+        'AWS_REGION': os.environ.get('AWS_REGION'),
+        'AWS_ACCOUNT_ID': os.environ.get('AWS_ACCOUNT_ID'),
+        'AWS_ACCESS_KEY_ID': os.environ.get('AWS_ACCESS_KEY_ID'),
+        'AWS_SECRET_ACCESS_KEY': os.environ.get('AWS_SECRET_ACCESS_KEY'),
+        'AWS_DEFAULT_REGION': os.environ.get('AWS_DEFAULT_REGION'),
+
+        # Lambda Configuration
+        'LAMBDA_FUNCTION_NAME': os.environ.get('LAMBDA_FUNCTION_NAME'),
+        'STACK_BASE': os.environ.get('STACK_BASE'),
+        'LAMBDA_EXECUTION_ROLE_NAME': os.environ.get('LAMBDA_EXECUTION_ROLE_NAME'),
+        'LAMBDA_ROLE_ARN': os.environ.get('LAMBDA_ROLE_ARN'),
+        'LAMBDA_LAYER_VERSION': os.environ.get('LAMBDA_LAYER_VERSION'),
+        'LAMBDA_LAYER_NAME': os.environ.get('LAMBDA_LAYER_NAME'),
+        'LAMBDA_LAYER_ARN': os.environ.get('LAMBDA_LAYER_ARN'),
+        'LAMBDA_RUNTIME': os.environ.get('LAMBDA_RUNTIME'),
+        'LAMBDA_TIMEOUT': os.environ.get('LAMBDA_TIMEOUT', '300'), # Reasonable default timeout
+        'LAMBDA_MEMORY': os.environ.get('LAMBDA_MEMORY', '512'), # Reasonable default memory
+
+        # S3 Configuration
         'S3_BUCKET_NAME': os.environ.get('S3_BUCKET_NAME'),
+        'S3_LAMBDA_ZIPPED_BUCKET_NAME': os.environ.get('S3_LAMBDA_ZIPPED_BUCKET_NAME'),
+        'S3_LAYER_BUCKET_NAME': os.environ.get('S3_LAYER_BUCKET_NAME'),
+        'S3_LAYER_KEY_NAME': os.environ.get('S3_LAYER_KEY_NAME'),
+
+        # DynamoDB Configuration
         'DYNAMODB_TABLE_NAME': os.environ.get('DYNAMODB_TABLE_NAME'),
-        'LOG_LEVEL': os.environ.get('LOG_LEVEL', 'INFO'),
-        'STORAGE_STRATEGY': os.environ.get('STORAGE_STRATEGY'),
+        'DYNAMODB_TABLE_ARN': os.environ.get('DYNAMODB_TABLE_ARN'),
+
+        # SQS Configuration
+        'SQS_QUEUE_NAME': os.environ.get('SQS_QUEUE_NAME'),
+        'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
+        'SQS_QUEUE_ARN': os.environ.get('SQS_QUEUE_ARN'),
+
+        # Queue Filler Lambda Configuration
+        'QUEUE_FILLER_LAMBDA_NAME': os.environ.get('QUEUE_FILLER_LAMBDA_NAME'),
+        'QUEUE_FILLER_LAMBDA_S3_KEY': os.environ.get('QUEUE_FILLER_LAMBDA_S3_KEY'),
+
+        # Python Configuration
+        'PYTHON_VERSION': os.environ.get('PYTHON_VERSION', '3.12'), # Default Python version
+
+        # Application Settings
+        'APP_NAME': os.environ.get('APP_NAME', 'RSS Feed Processor'), # Default app name is fine
+        'VERSION': os.environ.get('VERSION', '1.0.0'), # Default version is fine
+        'LOG_LEVEL': os.environ.get('LOG_LEVEL', 'INFO'), # Default to INFO logging
+
+        # Storage Configuration
+        'STORAGE_STRATEGY': os.environ.get('STORAGE_STRATEGY', 's3'), # Default to s3 storage
+
+        # Pinecone Configuration (only used if STORAGE_STRATEGY is 'pinecone')
         'PINECONE_API_KEY': os.environ.get('PINECONE_API_KEY'),
+        'PINECONE_DB_NAME': os.environ.get('PINECONE_DB_NAME'),
         'PINECONE_SHARDS': os.environ.get('PINECONE_SHARDS'),
+        'PINECONE_NAMESPACE': os.environ.get('PINECONE_NAMESPACE'),
+
+        # Vector Configuration
         'VECTOR_EMBEDDING_MODEL': os.environ.get('VECTOR_EMBEDDING_MODEL'),
         'VECTOR_EMBEDDING_DIM': os.environ.get('VECTOR_EMBEDDING_DIM'),
         'VECTOR_SEARCH_METRIC': os.environ.get('VECTOR_SEARCH_METRIC'),
-        'PINECONE_DB_NAME': os.environ.get('PINECONE_DB_NAME'),
+
+        # OpenAI Configuration
         'OPENAI_API_KEY': os.environ.get('OPENAI_API_KEY')
     }

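A dict like this is typically pushed to the function with boto3's `update_function_configuration`; a minimal sketch follows, with the None-filtering and reserved-key handling added here as assumptions rather than something shown in the diff. Note that Lambda reserves several of the AWS_* keys above (AWS_REGION, AWS_DEFAULT_REGION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY), so the API will reject an update that tries to set them.

```python
import boto3

# Keys the Lambda runtime reserves and will not accept in Environment.Variables.
RESERVED_KEYS = {"AWS_REGION", "AWS_DEFAULT_REGION", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"}


def apply_env_vars(function_name: str, env_vars: dict) -> None:
    # Drop unset values and reserved keys before updating the function configuration.
    variables = {k: v for k, v in env_vars.items() if v is not None and k not in RESERVED_KEYS}
    lambda_client = boto3.client("lambda")
    lambda_client.update_function_configuration(
        FunctionName=function_name,
        Environment={"Variables": variables},
    )
```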
@@ -50,11 +50,12 @@ STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future.
 # Only need to fill out this if your storage strategy is pinecone [ Not currently supported. ]
 PINECONE_API_KEY=***
 PINECONE_DB_NAME=open-rss-articles
-PINECONE_API_KEY=***
 PINECONE_SHARDS=***
+PINECONE_NAMESPACE=IngestRSS-Articles

 VECTOR_EMBEDDING_MODEL=***
 VECTOR_EMBEDDING_DIM=***
 VECTOR_SEARCH_METRIC=***
-PINECONE_DB_NAME=***

 OPENAI_API_KEY=sk**
+OPENAI_EMBEDDING_MODEL=text-embedding-3-large
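The new PINECONE_NAMESPACE and OPENAI_EMBEDDING_MODEL entries are read through `os.getenv` in the code changed above; for local runs they would typically come from this .env file. A small sketch using python-dotenv (the library choice is an assumption, it is not referenced in this diff):

```python
import os
from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # loads key=value pairs from .env into the process environment

print(os.getenv("OPENAI_EMBEDDING_MODEL"))  # e.g. text-embedding-3-large
print(os.getenv("PINECONE_NAMESPACE"))      # e.g. IngestRSS-Articles
```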
todo.md (13 changes)
@@ -1,18 +1,9 @@
-# Before Public Launch
+# After Public Launch
+* Monthly Kaggle Dataset Publishing.

 * Test out Vector Databases at Small Scale.
 * Test out Vector Databases at Scale.

-* check in to ensure text cleaning processes are working. [ ]
-
-* Fix up ReadMe
-* Publish RoadMap.
-* Tips on where to gather RSS Feeds.
-* Public Launch Posts
-* Reddit
-* Twitter
-* Kaggle
-
 * Protocol for annotating data.
 * Promotional Material
 * Stuff that is not news