mirror of https://github.com/aljazceru/IngestRSS.git · synced 2025-12-17 22:14:20 +01:00
Update
33 · delete.py · Normal file
@@ -0,0 +1,33 @@
+import requests
+import openai
+
+def mock_bank_api_call(data):
+    # Mock response from the bank API
+    bank_response = {
+        "status": "success",
+        "account_balance": 1000,
+        "currency": "USD"
+    }
+    return bank_response
+
+def process_data_with_openai(data):
+    # Call the mock bank API
+    bank_data = mock_bank_api_call(data)
+
+    # Prepare the prompt for OpenAI API
+    prompt = f"Bank API returned the following data: {bank_data}. Process this data."
+
+    # Call the OpenAI API
+    openai.api_key = 'your-openai-api-key'
+    response = openai.Completion.create(
+        engine="davinci",
+        prompt=prompt,
+        max_tokens=50
+    )
+
+    return response.choices[0].text.strip()
+
+# Example usage
+data = {"account_id": "12345"}
+result = process_data_with_openai(data)
+print(result)
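Note on the new file above: openai.Completion.create(engine=...) is the legacy pre-1.0 OpenAI API and is rejected by current openai releases. A minimal sketch of the equivalent call against the >=1.0 client, assuming a chat-capable placeholder model and reusing the mock from delete.py:

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def process_data_with_openai_v1(data: dict) -> str:
    bank_data = mock_bank_api_call(data)  # mock defined in delete.py above
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # assumed placeholder model, not from the commit
        messages=[{
            "role": "user",
            "content": f"Bank API returned the following data: {bank_data}. Process this data.",
        }],
        max_tokens=50,
    )
    return response.choices[0].message.content.strip()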
@@ -365,6 +365,27 @@
 "df.sample(20)[\"content\"].values"
 ]
 },
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "def pinecone_save_article(article:dict):\n",
+  "    logger.info(\"Saving article to Pinecone\")\n",
+  "    index = get_index()\n",
+  "\n",
+  "    # Records upserted to Pinecone *MUST* include 'id' and 'values'\n",
+  "    article[\"id\"] = article[\"article_id\"]\n",
+  "    article[\"values\"] = vectorize(article[\"content\"])\n",
+  "\n",
+  "    namespace = \"IngestRSS-Articles\"\n",
+  "\n",
+  "    logger.info(\"Upserting article to Pinecone\")\n",
+  "    upsert_vectors(index, [article], namespace)\n",
+  "    logger.info(f\"Successfully upserted article w/ article-id: {article['article_id']} to Pinecone index {index.name} with namespace {namespace}\")"
+ ]
+},
 {
 "cell_type": "code",
 "execution_count": null,
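The cell above relies on each record carrying 'id' and 'values'. A minimal sketch of the record shape Pinecone's upsert consumes (field names per the Pinecone client; the example values are assumptions, not from the commit):

# Hypothetical record: 'id' and 'values' are required,
# anything else travels under 'metadata'.
record = {
    "id": "article-123",
    "values": [0.01, -0.02, 0.03],  # embedding, e.g. from vectorize()
    "metadata": {"title": "Example article", "source": "rss"},
}
index.upsert(vectors=[record], namespace="IngestRSS-Articles")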
11095 · rss_feeds.json · File diff suppressed because it is too large
@@ -82,7 +82,7 @@ def get_or_create_kms_key():
     for key in response['Keys']:
         try:
             tags = kms_client.list_resource_tags(KeyId=key['KeyId'])['Tags']
-            if any(tag['TagKey'] == tag_key and tag['TagValue'] == tag_value for tag in tags) and any(tag['TagKey'] == 'region' and tag['TagValue'] == os.getenv("AWS_REGION") for tag in tags): # TODO: This is inefficient and should be fixed and more readable.
+            if any(tag['TagKey'] == tag_key and tag['TagValue'] == tag_value for tag in tags) and any(tag['TagKey'] == 'region' and tag['TagValue'] == os.getenv("AWS_REGION") for tag in tags): # FIXME: This is inefficient and should be made more readable.
                 print(f"Found existing KMS key with ID: {key['KeyId']}")
                 return key['KeyId']
         except ClientError:
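One way to address the FIXME above, as a sketch rather than part of the commit: fold the tag list into a dict once, after which the two membership tests read directly.

# Assumed helper, not in the repo: same logic as the flagged condition.
def tags_match(tags: list[dict], tag_key: str, tag_value: str) -> bool:
    tag_map = {t['TagKey']: t['TagValue'] for t in tags}
    return (tag_map.get(tag_key) == tag_value
            and tag_map.get('region') == os.getenv("AWS_REGION"))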
@@ -128,8 +128,6 @@ def create_function(lambda_client, function_name, runtime, role, handler, zip_fi
         raise

 def get_pillow_layer_arn():
     url = f"https://api.klayers.cloud/api/v2/p{os.getenv('PYTHON_VERSION')}/layers/latest/{os.getenv('AWS_REGION')}/json"
     try:
         response = requests.get(url)
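For context, a hedged sketch of how the Klayers response could be turned into a Pillow layer ARN; the response schema is an assumption based on the Klayers v2 API, not taken from this diff:

# Assumed shape: a JSON list of layer entries with 'package' and 'arn' fields.
layers = response.json()
pillow = next((layer for layer in layers if layer.get("package") == "Pillow"), None)
pillow_arn = pillow["arn"] if pillow else None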
@@ -29,20 +29,19 @@ def get_index():
 def vectorize(article:str) -> list[float]:
     response = client.embeddings.create(
         input=article,
-        model="text-embedding-3-large"
+        model=os.getenv('OPENAI_EMBEDDING_MODEL')
     )

     return response.data[0].embedding


-def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
+def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str): # [ ] Check if the data is being upserted.
     index.upsert(
         vectors=vectors,
         namespace=namespace
     )


-def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None):
+def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None): # [ ] Make sure this is working.

     if len(vector) != int(embedding_dim):
         raise ValueError("Length of vector does not match the embedding dimension")
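A hedged usage sketch tying these helpers together; the function names come from the diff, while the namespace source and top_k value are assumptions:

# Assumes get_index(), vectorize() and query_vectors() as defined above.
vec = vectorize("Example article text")
results = query_vectors(index=get_index(),
                        namespace=os.getenv('PINECONE_NAMESPACE'),
                        vector=vec,
                        top_k=5)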
@@ -4,8 +4,7 @@ import os
 import logging
 from random import randint
 from datetime import datetime
 # TODO: Move this article storage logic to a separate module inside of the Lambda.
 # TODO: Get better at loading local modules inside of the Lambda.

 from analytics.embeddings.vector_db import get_index, upsert_vectors, vectorize

 logger = logging.getLogger()
@@ -35,7 +34,8 @@ def pinecone_save_article(article:dict):
     article["id"] = article["article_id"]
     article["values"] = vectorize(article["content"])

-    namespace = f"IngestRSS-Articles"
+    namespace = os.getenv('PINECONE_NAMESPACE')

     logger.info("Upserting article to Pinecone")
     upsert_vectors(index, [article], namespace)
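If the old literal is worth preserving, a hedged alternative (an assumption, not what the commit does) would keep it as a fallback:

namespace = os.getenv('PINECONE_NAMESPACE', 'IngestRSS-Articles')  # fallback default is an assumption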
@@ -10,17 +10,67 @@ def update_env_vars(function_name):
     lambda_client = boto3.client('lambda')

     env_vars = {
-        'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
+        # AWS Configuration
+        'AWS_REGION': os.environ.get('AWS_REGION'),
+        'AWS_ACCOUNT_ID': os.environ.get('AWS_ACCOUNT_ID'),
+        'AWS_ACCESS_KEY_ID': os.environ.get('AWS_ACCESS_KEY_ID'),
+        'AWS_SECRET_ACCESS_KEY': os.environ.get('AWS_SECRET_ACCESS_KEY'),
+        'AWS_DEFAULT_REGION': os.environ.get('AWS_DEFAULT_REGION'),
+
+        # Lambda Configuration
+        'LAMBDA_FUNCTION_NAME': os.environ.get('LAMBDA_FUNCTION_NAME'),
+        'STACK_BASE': os.environ.get('STACK_BASE'),
+        'LAMBDA_EXECUTION_ROLE_NAME': os.environ.get('LAMBDA_EXECUTION_ROLE_NAME'),
+        'LAMBDA_ROLE_ARN': os.environ.get('LAMBDA_ROLE_ARN'),
+        'LAMBDA_LAYER_VERSION': os.environ.get('LAMBDA_LAYER_VERSION'),
+        'LAMBDA_LAYER_NAME': os.environ.get('LAMBDA_LAYER_NAME'),
+        'LAMBDA_LAYER_ARN': os.environ.get('LAMBDA_LAYER_ARN'),
+        'LAMBDA_RUNTIME': os.environ.get('LAMBDA_RUNTIME'),
+        'LAMBDA_TIMEOUT': os.environ.get('LAMBDA_TIMEOUT', '300'),  # Reasonable default timeout
+        'LAMBDA_MEMORY': os.environ.get('LAMBDA_MEMORY', '512'),  # Reasonable default memory
+
+        # S3 Configuration
+        'S3_BUCKET_NAME': os.environ.get('S3_BUCKET_NAME'),
+        'S3_LAMBDA_ZIPPED_BUCKET_NAME': os.environ.get('S3_LAMBDA_ZIPPED_BUCKET_NAME'),
+        'S3_LAYER_BUCKET_NAME': os.environ.get('S3_LAYER_BUCKET_NAME'),
+        'S3_LAYER_KEY_NAME': os.environ.get('S3_LAYER_KEY_NAME'),
+
+        # DynamoDB Configuration
+        'DYNAMODB_TABLE_NAME': os.environ.get('DYNAMODB_TABLE_NAME'),
+        'DYNAMODB_TABLE_ARN': os.environ.get('DYNAMODB_TABLE_ARN'),
+
+        # SQS Configuration
+        'SQS_QUEUE_NAME': os.environ.get('SQS_QUEUE_NAME'),
+        'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
+        'SQS_QUEUE_ARN': os.environ.get('SQS_QUEUE_ARN'),
+
+        # Queue Filler Lambda Configuration
+        'QUEUE_FILLER_LAMBDA_NAME': os.environ.get('QUEUE_FILLER_LAMBDA_NAME'),
+        'QUEUE_FILLER_LAMBDA_S3_KEY': os.environ.get('QUEUE_FILLER_LAMBDA_S3_KEY'),
+
+        # Python Configuration
+        'PYTHON_VERSION': os.environ.get('PYTHON_VERSION', '3.12'),  # Default Python version
+
+        # Application Settings
+        'APP_NAME': os.environ.get('APP_NAME', 'RSS Feed Processor'),  # Default app name is fine
+        'VERSION': os.environ.get('VERSION', '1.0.0'),  # Default version is fine
+        'LOG_LEVEL': os.environ.get('LOG_LEVEL', 'INFO'),  # Default to INFO logging
+
+        # Storage Configuration
+        'STORAGE_STRATEGY': os.environ.get('STORAGE_STRATEGY', 's3'),  # Default to s3 storage
+
+        # Pinecone Configuration (only used if STORAGE_STRATEGY is 'pinecone')
+        'PINECONE_API_KEY': os.environ.get('PINECONE_API_KEY'),
+        'PINECONE_DB_NAME': os.environ.get('PINECONE_DB_NAME'),
+        'PINECONE_SHARDS': os.environ.get('PINECONE_SHARDS'),
+        'PINECONE_NAMESPACE': os.environ.get('PINECONE_NAMESPACE'),
+
+        # Vector Configuration
+        'VECTOR_EMBEDDING_MODEL': os.environ.get('VECTOR_EMBEDDING_MODEL'),
+        'VECTOR_EMBEDDING_DIM': os.environ.get('VECTOR_EMBEDDING_DIM'),
+        'VECTOR_SEARCH_METRIC': os.environ.get('VECTOR_SEARCH_METRIC'),
+
+        # OpenAI Configuration
+        'OPENAI_API_KEY': os.environ.get('OPENAI_API_KEY')
     }
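A hedged sketch of the step that presumably follows this hunk: Lambda environment values must be strings and boto3 rejects None, so unset variables need dropping before the update call. This is an assumption about the surrounding function, not shown in the diff:

env_vars = {k: v for k, v in env_vars.items() if v is not None}
lambda_client.update_function_configuration(
    FunctionName=function_name,
    Environment={'Variables': env_vars},
)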
@@ -50,11 +50,12 @@ STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future.
 # Only need to fill this out if your storage strategy is pinecone [ Not currently supported. ]
 PINECONE_API_KEY=***
 PINECONE_DB_NAME=open-rss-articles
 PINECONE_API_KEY=***
 PINECONE_SHARDS=***
 PINECONE_NAMESPACE=IngestRSS-Articles

 VECTOR_EMBEDDING_MODEL=***
 VECTOR_EMBEDDING_DIM=***
 VECTOR_SEARCH_METRIC=***
 PINECONE_DB_NAME=***

 OPENAI_API_KEY=sk**
+OPENAI_EMBEDDING_MODEL=text-embedding-3-large
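For reference, a minimal sketch of loading this sample file in local development; python-dotenv is an assumption here, the repo may wire environment variables differently:

from dotenv import load_dotenv
import os

load_dotenv()  # reads .env from the working directory
print(os.getenv("OPENAI_EMBEDDING_MODEL"))  # -> text-embedding-3-large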
13 · todo.md
@@ -1,18 +1,9 @@
-# Before Public Launch
+# After Public Launch
 * Monthly Kaggle Dataset Publishing.

-* Test out Vector Databases at Small Scale.
+* Test out Vector Databases at Scale.

 * check in to ensure text cleaning processes are working. [ ]

-* Fix up ReadMe
-* Publish RoadMap.
-* Tips on where to gather RSS Feeds.
-* Public Launch Posts
-    * Reddit
-    * Twitter
-    * Kaggle

 * Protocol for annotating data.
-* Promotional Material
-    * Stuff that is not news