mirror of
https://github.com/aljazceru/IngestRSS.git
synced 2025-12-18 22:44:27 +01:00
Switch RSS feed storage to MongoDB
This commit is contained in:
@@ -48,7 +48,12 @@ def main():
|
|||||||
if os.path.exists(rss_feeds_file):
|
if os.path.exists(rss_feeds_file):
|
||||||
with open(rss_feeds_file, 'r') as f:
|
with open(rss_feeds_file, 'r') as f:
|
||||||
rss_feeds = json.load(f)
|
rss_feeds = json.load(f)
|
||||||
upload_rss_feeds(rss_feeds, os.getenv('DYNAMODB_TABLE_NAME'))
|
upload_rss_feeds(
|
||||||
|
rss_feeds,
|
||||||
|
os.getenv('MONGODB_URL'),
|
||||||
|
os.getenv('MONGODB_DB_NAME'),
|
||||||
|
os.getenv('MONGODB_COLLECTION_NAME', 'rss_feeds')
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
print(f"WARNING: {rss_feeds_file} not found. Skipping RSS feed upload.")
|
print(f"WARNING: {rss_feeds_file} not found. Skipping RSS feed upload.")
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
boto3==1.35.*
|
boto3==1.35.*
|
||||||
|
pymongo==4.*
|
||||||
python-dotenv==1.0.*
|
python-dotenv==1.0.*
|
||||||
requests==2.32.*
|
requests==2.32.*
|
||||||
constructs==10.2.69
|
constructs==10.2.69
|
||||||
|
|||||||
@@ -1,58 +1,43 @@
|
|||||||
import json
|
import json
|
||||||
import boto3
|
import os
|
||||||
from boto3.dynamodb.conditions import Key
|
from pymongo import MongoClient
|
||||||
from botocore.exceptions import ClientError
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
# Set up logging
|
# Set up logging
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def upload_rss_feeds(rss_feeds, table_name):
|
def upload_rss_feeds(rss_feeds, mongo_url, db_name, collection_name):
|
||||||
dynamodb = boto3.resource('dynamodb')
|
client = MongoClient(mongo_url)
|
||||||
table = dynamodb.Table(table_name)
|
collection = client[db_name][collection_name]
|
||||||
|
|
||||||
logger.info(f"Uploading RSS feeds to table: {table_name}")
|
logger.info(f"Uploading RSS feeds to MongoDB collection: {collection_name}")
|
||||||
|
|
||||||
try:
|
|
||||||
# Get the table's key schema
|
|
||||||
key_schema = table.key_schema
|
|
||||||
partition_key = next(key['AttributeName'] for key in key_schema if key['KeyType'] == 'HASH')
|
|
||||||
except ClientError as e:
|
|
||||||
logger.error(f"Error getting table schema: {e.response['Error']['Message']}")
|
|
||||||
return
|
|
||||||
|
|
||||||
new_items = 0
|
new_items = 0
|
||||||
existing_items = 0
|
existing_items = 0
|
||||||
|
|
||||||
for feed in rss_feeds:
|
for feed in rss_feeds:
|
||||||
# Check if the item already exists
|
url = feed.get('u')
|
||||||
try:
|
dt = int(feed.get('dt', 0))
|
||||||
response = table.get_item(Key={partition_key: feed['u']})
|
result = collection.update_one(
|
||||||
except ClientError as e:
|
{'url': url},
|
||||||
logger.error(f"Error checking for existing item: {e.response['Error']['Message']}")
|
{'$setOnInsert': {'url': url, 'dt': dt}},
|
||||||
continue
|
upsert=True
|
||||||
|
)
|
||||||
if 'Item' not in response:
|
if result.upserted_id:
|
||||||
# Item doesn't exist, insert new item
|
|
||||||
item = {partition_key: feed['u'], 'dt': 0}
|
|
||||||
feed['dt'] = int(feed['dt'])
|
|
||||||
item.update()
|
|
||||||
|
|
||||||
try:
|
|
||||||
table.put_item(Item=item)
|
|
||||||
new_items += 1
|
new_items += 1
|
||||||
except ClientError as e:
|
|
||||||
logger.error(f"Error inserting new item: {e.response['Error']['Message']}")
|
|
||||||
else:
|
else:
|
||||||
existing_items += 1
|
existing_items += 1
|
||||||
|
|
||||||
logger.info(f"Upload complete. {new_items} new items inserted. {existing_items} items already existed.")
|
logger.info(
|
||||||
|
f"Upload complete. {new_items} new items inserted. {existing_items} items already existed."
|
||||||
|
)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
table_name = 'rss-feeds-table'
|
mongo_url = os.getenv('MONGODB_URL', 'mongodb://localhost:27017')
|
||||||
rss_feed_path = 'rss_feeds.json'
|
db_name = os.getenv('MONGODB_DB_NAME', 'ingestrss')
|
||||||
with open(rss_feed_path) as f:
|
collection_name = os.getenv('MONGODB_COLLECTION_NAME', 'rss_feeds')
|
||||||
|
with open('rss_feeds.json') as f:
|
||||||
rss_feeds = json.load(f)
|
rss_feeds = json.load(f)
|
||||||
logger.info(f"Loaded RSS feeds: {rss_feeds}")
|
logger.info(f"Loaded RSS feeds: {rss_feeds}")
|
||||||
upload_rss_feeds(rss_feeds, table_name)
|
upload_rss_feeds(rss_feeds, mongo_url, db_name, collection_name)
|
||||||
|
|||||||
@@ -4,18 +4,24 @@ import os
|
|||||||
import logging
|
import logging
|
||||||
from random import randint
|
from random import randint
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from pymongo import MongoClient
|
||||||
|
|
||||||
from analytics.embeddings.vector_db import get_index, upsert_vectors, vectorize
|
from analytics.embeddings.vector_db import get_index, upsert_vectors, vectorize
|
||||||
|
|
||||||
logger = logging.getLogger()
|
logger = logging.getLogger()
|
||||||
|
|
||||||
s3 = boto3.client('s3')
|
s3 = boto3.client('s3')
|
||||||
dynamodb = boto3.resource('dynamodb')
|
|
||||||
|
|
||||||
CONTENT_BUCKET = os.getenv("S3_BUCKET_NAME", os.getenv("CONTENT_BUCKET"))
|
CONTENT_BUCKET = os.getenv("S3_BUCKET_NAME", os.getenv("CONTENT_BUCKET"))
|
||||||
DYNAMODB_TABLE = os.getenv("DYNAMODB_TABLE_NAME")
|
|
||||||
storage_strategy = os.environ.get('STORAGE_STRATEGY')
|
storage_strategy = os.environ.get('STORAGE_STRATEGY')
|
||||||
|
|
||||||
|
MONGODB_URL = os.getenv("MONGODB_URL")
|
||||||
|
MONGODB_DB_NAME = os.getenv("MONGODB_DB_NAME")
|
||||||
|
MONGODB_COLLECTION_NAME = os.getenv("MONGODB_COLLECTION_NAME", "rss_feeds")
|
||||||
|
|
||||||
|
mongo_client = MongoClient(MONGODB_URL)
|
||||||
|
feeds_collection = mongo_client[MONGODB_DB_NAME][MONGODB_COLLECTION_NAME]
|
||||||
|
|
||||||
##### Article Storage #####
|
##### Article Storage #####
|
||||||
def save_article(article:dict, strategy:str):
|
def save_article(article:dict, strategy:str):
|
||||||
if strategy == "s3":
|
if strategy == "s3":
|
||||||
@@ -96,12 +102,13 @@ def s3_save_article(article:dict):
|
|||||||
###### Feed Storage ######
|
###### Feed Storage ######
|
||||||
def update_rss_feed(feed: dict, last_pub_dt: int):
|
def update_rss_feed(feed: dict, last_pub_dt: int):
|
||||||
try:
|
try:
|
||||||
table = dynamodb.Table(DYNAMODB_TABLE)
|
feeds_collection.update_one(
|
||||||
table.update_item(
|
{"url": feed["u"]},
|
||||||
Key={'url': feed['u']},
|
{"$set": {"dt": last_pub_dt}},
|
||||||
UpdateExpression='SET dt = :val',
|
upsert=True,
|
||||||
ExpressionAttributeValues={':val': last_pub_dt}
|
)
|
||||||
|
logger.info(
|
||||||
|
f"Updated RSS feed in MongoDB: {feed['u']} with dt: {last_pub_dt}"
|
||||||
)
|
)
|
||||||
logger.info(f"Updated RSS feed in DynamoDB: {feed['u']} with dt: {feed['dt']}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to update RSS feed: {str(e)}")
|
logger.error(f"Failed to update RSS feed: {str(e)}")
|
||||||
@@ -4,15 +4,20 @@ import boto3
|
|||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import logging
|
import logging
|
||||||
|
from pymongo import MongoClient
|
||||||
|
|
||||||
logger = logging.getLogger()
|
logger = logging.getLogger()
|
||||||
logger.setLevel("INFO")
|
logger.setLevel("INFO")
|
||||||
|
|
||||||
dynamodb = boto3.resource('dynamodb')
|
|
||||||
sqs = boto3.client('sqs')
|
sqs = boto3.client('sqs')
|
||||||
|
|
||||||
SQS_QUEUE_URL = os.environ['SQS_QUEUE_URL']
|
SQS_QUEUE_URL = os.environ['SQS_QUEUE_URL']
|
||||||
DYNAMODB_TABLE_NAME = os.environ['DYNAMODB_TABLE_NAME']
|
MONGODB_URL = os.environ['MONGODB_URL']
|
||||||
|
MONGODB_DB_NAME = os.environ['MONGODB_DB_NAME']
|
||||||
|
MONGODB_COLLECTION_NAME = os.environ.get('MONGODB_COLLECTION_NAME', 'rss_feeds')
|
||||||
|
|
||||||
|
mongo_client = MongoClient(MONGODB_URL)
|
||||||
|
feeds_collection = mongo_client[MONGODB_DB_NAME][MONGODB_COLLECTION_NAME]
|
||||||
|
|
||||||
class DecimalEncoder(json.JSONEncoder):
|
class DecimalEncoder(json.JSONEncoder):
|
||||||
def default(self, obj):
|
def default(self, obj):
|
||||||
@@ -21,13 +26,10 @@ class DecimalEncoder(json.JSONEncoder):
|
|||||||
return super(DecimalEncoder, self).default(obj)
|
return super(DecimalEncoder, self).default(obj)
|
||||||
|
|
||||||
def handler(event, context):
|
def handler(event, context):
|
||||||
table = dynamodb.Table(DYNAMODB_TABLE_NAME)
|
|
||||||
messages_sent = 0
|
messages_sent = 0
|
||||||
|
|
||||||
# Scan the DynamoDB table
|
# Iterate over all feeds in MongoDB
|
||||||
response = table.scan()
|
for item in feeds_collection.find({}):
|
||||||
|
|
||||||
for item in response['Items']:
|
|
||||||
rss_url = item.get('url')
|
rss_url = item.get('url')
|
||||||
rss_dt = item.get('dt')
|
rss_dt = item.get('dt')
|
||||||
|
|
||||||
|
|||||||
@@ -34,6 +34,9 @@ def check_env() -> None:
|
|||||||
"LAMBDA_RUNTIME",
|
"LAMBDA_RUNTIME",
|
||||||
"LAMBDA_TIMEOUT",
|
"LAMBDA_TIMEOUT",
|
||||||
"LAMBDA_MEMORY",
|
"LAMBDA_MEMORY",
|
||||||
|
"MONGODB_URL",
|
||||||
|
"MONGODB_DB_NAME",
|
||||||
|
"MONGODB_COLLECTION_NAME",
|
||||||
"QUEUE_FILLER_LAMBDA_NAME",
|
"QUEUE_FILLER_LAMBDA_NAME",
|
||||||
"QUEUE_FILLER_LAMBDA_S3_KEY",
|
"QUEUE_FILLER_LAMBDA_S3_KEY",
|
||||||
"LOG_LEVEL",
|
"LOG_LEVEL",
|
||||||
|
|||||||
@@ -34,6 +34,11 @@ LAMBDA_RUNTIME=python${PYTHON_VERSION}
|
|||||||
LAMBDA_TIMEOUT=300
|
LAMBDA_TIMEOUT=300
|
||||||
LAMBDA_MEMORY=512
|
LAMBDA_MEMORY=512
|
||||||
|
|
||||||
|
# MongoDB settings
|
||||||
|
MONGODB_URL=mongodb://localhost:27017
|
||||||
|
MONGODB_DB_NAME=ingestrss
|
||||||
|
MONGODB_COLLECTION_NAME=rss_feeds
|
||||||
|
|
||||||
QUEUE_FILLER_LAMBDA_NAME=RSSQueueFiller
|
QUEUE_FILLER_LAMBDA_NAME=RSSQueueFiller
|
||||||
QUEUE_FILLER_LAMBDA_S3_KEY=RSSQueueFiller.zip
|
QUEUE_FILLER_LAMBDA_S3_KEY=RSSQueueFiller.zip
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user