Mirror of https://github.com/aljazceru/IngestRSS.git (synced 2025-12-18 14:34:26 +01:00)

Commit: fixes claude
@@ -16,7 +16,7 @@ services:
     volumes:
       - mongo-data:/data/db
     healthcheck:
-      test: ["CMD", "mongo", "--eval", "db.adminCommand('ping')"]
+      test: ["CMD", "mongosh", "--eval", "db.adminCommand('ping')"]
       interval: 10s
       timeout: 5s
       retries: 5
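
Note: current MongoDB images bundle the mongosh shell and no longer ship the legacy mongo binary, which is why the healthcheck command changes. The same ping can be reproduced from Python with pymongo; a minimal sketch, with the service name and port taken from the compose settings above:

    from pymongo import MongoClient

    # "mongodb" is the compose service name; 27017 is the default port used above.
    client = MongoClient("mongodb://mongodb:27017", serverSelectionTimeoutMS=5000)
    print(client.admin.command("ping"))  # {'ok': 1.0} when the server is healthy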
@@ -24,8 +24,8 @@ services:
     image: minio/minio
     command: server /data --console-address ":9001"
     environment:
-      MINIO_ACCESS_KEY: minioadmin
-      MINIO_SECRET_KEY: minioadmin
+      MINIO_ROOT_USER: minioadmin
+      MINIO_ROOT_PASSWORD: minioadmin
     ports:
       - "9000:9000"
      - "9001:9001"
@@ -53,7 +53,7 @@ services:
       MONGODB_URL: mongodb://mongodb:27017
       MONGODB_DB_NAME: ingestrss
       MONGODB_COLLECTION_NAME: rss_feeds
-      MINIO_ENDPOINT: http://minio:9000
+      MINIO_ENDPOINT: minio:9000
       MINIO_ACCESS_KEY: minioadmin
       MINIO_SECRET_KEY: minioadmin
       MINIO_BUCKET: ingestrss
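
Note: the MinIO Python SDK expects MINIO_ENDPOINT as a bare host:port pair and takes the scheme from its secure flag, so dropping the http:// prefix avoids connection errors. A minimal sketch of building a client from these variables (how the repo actually wires this up is an assumption):

    import os
    from minio import Minio

    # MINIO_ENDPOINT must be host:port only; the scheme comes from `secure`.
    client = Minio(
        os.getenv("MINIO_ENDPOINT", "minio:9000"),
        access_key=os.getenv("MINIO_ACCESS_KEY", "minioadmin"),
        secret_key=os.getenv("MINIO_SECRET_KEY", "minioadmin"),
        secure=False,  # the local compose stack serves MinIO over plain HTTP
    )
    bucket = os.getenv("MINIO_BUCKET", "ingestrss")
    if not client.bucket_exists(bucket):
        client.make_bucket(bucket)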
launch.py (52 changed lines)
@@ -6,10 +6,6 @@ import boto3
 from dotenv import load_dotenv
 import logging
 import argparse
 import subprocess
-from src.infra.lambdas.RSSQueueFiller.deploy_sqs_filler_lambda import deploy_sqs_filler
-
-from src.utils.check_env import check_env
-

 def check_local_env() -> None:
@@ -45,7 +41,7 @@ def start_docker_containers() -> None:
         raise


-print("🗞️ 💵 ⚖️ IngestRSS⚖️ 💵 🗞️".center(100, "-"))
+print("RSS Feed Processor".center(100, "-"))

 parser = argparse.ArgumentParser(description="Launch IngestRSS")
 parser.add_argument(
@@ -59,7 +55,41 @@ load_dotenv(override=True)

 if args.local:
     check_local_env()
+    # Upload RSS feeds to MongoDB for local deployment
+    from src.feed_management.upload_rss_feeds import upload_rss_feeds
+
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    rss_feeds_file = os.path.join(current_dir, "rss_feeds.json")
+
+    if os.path.exists(rss_feeds_file):
+        with open(rss_feeds_file, 'r') as f:
+            rss_feeds = json.load(f)
+        upload_rss_feeds(
+            rss_feeds,
+            os.getenv('MONGODB_URL'),
+            os.getenv('MONGODB_DB_NAME'),
+            os.getenv('MONGODB_COLLECTION_NAME', 'rss_feeds')
+        )
+        print("RSS feeds uploaded to MongoDB")
+
+    start_docker_containers()
+    print("Local RSS Feed Processor started successfully!")
+    print("Services running:")
+    print("- MongoDB: localhost:27017")
+    print("- Redis: localhost:6379")
+    print("- MinIO: localhost:9000 (console: localhost:9001)")
+    print("- Worker and Scheduler containers are processing feeds")
+    sys.exit(0)
+
+else:
+    # Only import AWS modules for cloud deployment
+    from src.utils.check_env import check_env
+    from src.infra.deploy_infrastructure import deploy_infrastructure
+    from src.infra.lambdas.RSSFeedProcessorLambda.deploy_rss_feed_lambda import deploy_lambda
+    from src.infra.lambdas.lambda_utils.update_lambda_env_vars import update_env_vars
+    from src.feed_management.upload_rss_feeds import upload_rss_feeds
+    from src.infra.lambdas.RSSQueueFiller.deploy_sqs_filler_lambda import deploy_sqs_filler
+
 check_env()

 # Set up logging
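
Note: the local branch now seeds MongoDB directly from rss_feeds.json before starting the containers. The uploader itself is not shown in this diff; a plausible minimal sketch of what upload_rss_feeds might do, assuming feed documents keyed by the URL field 'u' used elsewhere in the code:

    from pymongo import MongoClient, UpdateOne

    def upload_rss_feeds(feeds, mongodb_url, db_name, collection_name="rss_feeds"):
        """Upsert feed documents keyed by URL so repeated launches stay idempotent."""
        collection = MongoClient(mongodb_url)[db_name][collection_name]
        ops = [UpdateOne({"u": feed["u"]}, {"$setOnInsert": feed}, upsert=True) for feed in feeds]
        if ops:
            collection.bulk_write(ops)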
@@ -71,15 +101,9 @@ lambda_client = boto3.client("lambda")
 current_dir = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(current_dir)

-from src.infra.deploy_infrastructure import deploy_infrastructure
-from src.infra.lambdas.RSSFeedProcessorLambda.deploy_rss_feed_lambda import deploy_lambda
-from src.infra.lambdas.lambda_utils.update_lambda_env_vars import update_env_vars
-from src.feed_management.upload_rss_feeds import upload_rss_feeds
-
-def main():
-    if "--local" in sys.argv:
-        subprocess.run(["docker", "compose", "up", "-d"], check=False)
-        return
+def main(local_mode=False):
+    if local_mode:
+        return  # Already handled above

     # Deploy infrastructure
     deploy_infrastructure()
@@ -7,7 +7,11 @@ constructs==10.2.69
 qdrant-client
 ollama
 tqdm
-prometheus-clien
+prometheus-client
+redis
+minio
+schedule==1.*
+feedparser
 newspaper3k
 python-dateutil
 lxml
@@ -3,7 +3,13 @@ import requests

 from qdrant_client import QdrantClient, models

-from utils import setup_logging
+try:
+    from ...utils import setup_logging
+except ImportError:
+    # Fallback for when running standalone
+    import logging
+    def setup_logging():
+        return logging.getLogger(__name__)

 logger = setup_logging()

@@ -26,6 +32,7 @@ def get_index():
     return client


 def vectorize(article: str) -> list[float]:
+    try:
         response = requests.post(
             f"{ollama_host}/api/embeddings",
             json={"model": ollama_embedding_model, "prompt": article},
@@ -33,6 +40,11 @@ def vectorize(article: str) -> list[float]:
         )
         response.raise_for_status()
         return response.json().get("embedding", [])
+    except requests.RequestException as e:
+        logger.error(f"Error generating embedding: {e}")
+        # Return a zero vector of the expected dimension as fallback
+        dim = int(embedding_dim) if embedding_dim else 384  # Default dimension
+        return [0.0] * dim


 def upsert_vectors(index: QdrantClient, data: list[dict]):
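
Note: the new except branch returns a zero vector so a single failed embedding call no longer aborts the batch, at the cost of storing a meaningless vector. A quick standalone check of the Ollama embeddings endpoint used above (the host and model name here are assumed defaults, not values from the repo):

    import requests

    OLLAMA_HOST = "http://localhost:11434"   # assumed local Ollama default
    MODEL = "all-minilm"                     # assumed 384-dimensional embedding model

    resp = requests.post(
        f"{OLLAMA_HOST}/api/embeddings",
        json={"model": MODEL, "prompt": "hello world"},
        timeout=30,
    )
    resp.raise_for_status()
    print(len(resp.json().get("embedding", [])))  # should match the configured embedding dimension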
@@ -44,7 +56,7 @@ def upsert_vectors(index: QdrantClient, data: list[dict]):


 def query_vectors(index: QdrantClient, vector: list[float], top_k: int, filter_query: dict | None = None):
-    if len(vector) != int(embedding_dim):
+    if embedding_dim and len(vector) != int(embedding_dim):
         raise ValueError("Length of vector does not match the embedding dimension")
     return index.search(
         collection_name=collection_name,
@@ -6,10 +6,16 @@ import logging
 from datetime import datetime
 from pymongo import MongoClient

-from analytics.embeddings.vector_db import get_index, upsert_vectors, vectorize
-
 logger = logging.getLogger()

+# Try to import vector DB components, but make them optional
+try:
+    from .analytics.embeddings.vector_db import get_index, upsert_vectors, vectorize
+    VECTOR_DB_AVAILABLE = True
+except ImportError:
+    VECTOR_DB_AVAILABLE = False
+    logger.warning("Vector DB components not available. Qdrant storage will not work.")
+
 s3 = boto3.client('s3')

 CONTENT_BUCKET = os.getenv("S3_BUCKET_NAME", os.getenv("CONTENT_BUCKET"))
@@ -36,8 +42,13 @@ def save_article(article: dict, strategy: str):
     if strategy == "s3":
         s3_save_article(article)
     elif strategy == "qdrant":
+        if VECTOR_DB_AVAILABLE:
             qdrant_save_article(article)
+        else:
+            logger.error("Qdrant storage requested but vector DB components not available")
+            raise ValueError("Vector DB components not available for Qdrant storage")
     elif strategy == "both":
+        if VECTOR_DB_AVAILABLE:
             qdrant_save_article(article)
             s3_save_article(article)
+        else:
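
Note: together with the optional import above, this guard makes Qdrant storage degrade explicitly: the module still imports without the embeddings stack, and only a request for the qdrant or both strategy raises. A compressed illustration of the pattern (the flag and function names mirror the diff; the rest is a sketch, not the repo's exact code):

    import logging

    logger = logging.getLogger()

    try:
        # Optional dependency: remember whether the embeddings stack imported cleanly.
        from analytics.embeddings.vector_db import get_index, upsert_vectors, vectorize  # noqa: F401
        VECTOR_DB_AVAILABLE = True
    except ImportError:
        VECTOR_DB_AVAILABLE = False
        logger.warning("Vector DB components not available. Qdrant storage will not work.")

    def save_article(article: dict, strategy: str) -> None:
        if strategy in ("qdrant", "both") and not VECTOR_DB_AVAILABLE:
            raise ValueError("Vector DB components not available for Qdrant storage")
        # ... dispatch to qdrant_save_article / s3_save_article as in the diff ...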
@@ -62,7 +73,7 @@ def s3_save_article(article:dict):

     now = datetime.now()
     article_id = article['article_id']
     logger.info(f"Content ")

     if not article_id:
         logger.error(f"Missing rss_id or article_id in article: {article}")
         return

@@ -4,9 +4,9 @@ from dateutil import parser
 import queue
 import threading
 import logging
-from utils import generate_key
-from article_extractor import extract_article
-from article_cleaning import clean_text
+from .utils import generate_key
+from .article_extractor import extract_article
+from .article_cleaning import clean_text

 logger = logging.getLogger()

@@ -38,6 +38,7 @@ def extract_feed_threading(rss: dict, output_queue, stop_thread):
     feed_url = rss['u']
     last_date = rss['dt']
     max_date = last_date
+    entry = None  # Initialize entry variable

     try:
         feed = feedparser.parse(feed_url)
@@ -45,11 +46,11 @@ def extract_feed_threading(rss: dict, output_queue, stop_thread):
             if stop_thread.is_set():
                 break

-            pub_date = parse_pub_date(entry['published'])
+            pub_date = parse_pub_date(entry.get('published', ''))

             if pub_date > last_date:
                 title, text = extract_article(entry.link)
-                title, text = clean_text(title), clean_text(text)
+                title, text = clean_text(title or ''), clean_text(text or '')
                 article = {
                     'link': entry.link,
                     'rss': feed_url,
@@ -71,7 +72,9 @@ def extract_feed_threading(rss: dict, output_queue, stop_thread):
                 }
                 output_queue.put(output)
     except Exception as e:
-        logger.error(f"Feed: {entry}")
+        logger.error(f"Feed URL: {feed_url}")
+        if entry:
+            logger.error(f"Current entry: {entry.get('link', 'unknown')}")
         logger.error(f"Feed failed due to error: {e}")

 def extract_feed(rss: dict):
@@ -83,7 +86,7 @@ def extract_feed(rss: dict):
     try:
         feed = feedparser.parse(feed_url)
         for entry in feed['entries']:
-            pub_date = parse_pub_date(entry['published'])
+            pub_date = parse_pub_date(entry.get('published', ''))

             if pub_date > last_date:
                 title, text = extract_article(entry.link)
@@ -106,16 +109,15 @@ def extract_feed(rss: dict):
             'max_date': max_date,
             'feed': rss
         }
-        print(output)
         return output
     except Exception as e:
-        logger.error(f"Feed: {entry}")
         logger.error(f"Feed URL: {feed_url}")
         logger.error(f"Feed failed due to error: {e}")

-def parse_pub_date(entry:dict):
-
-    if 'published' in entry:
-        date_string = entry['published']
+def parse_pub_date(date_string: str) -> int:
+    """Parse publication date from various formats"""
+    if not date_string:
+        return int(datetime.now().timestamp())
+
     try:
         return int(datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %z").timestamp())
@@ -125,7 +127,7 @@ def parse_pub_date(entry:dict):
     except ValueError:
         try:
             return int(parser.parse(date_string).timestamp())
-        except ValueError:
+        except (ValueError, TypeError):
            pass

     return int(datetime.now().timestamp())  # Return current time if no date is found
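
Note: parse_pub_date now takes the raw date string instead of the whole entry and always falls back to the current timestamp, which is what lets callers pass entry.get('published', '') safely. A small usage sketch of the function as reconstructed above:

    from datetime import datetime
    from dateutil import parser

    def parse_pub_date(date_string: str) -> int:
        """Parse publication date from various formats."""
        if not date_string:
            return int(datetime.now().timestamp())
        try:
            return int(datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %z").timestamp())
        except ValueError:
            try:
                return int(parser.parse(date_string).timestamp())
            except (ValueError, TypeError):
                pass
        return int(datetime.now().timestamp())

    print(parse_pub_date("Mon, 06 Jan 2025 12:00:00 +0000"))  # RFC 822 style feeds
    print(parse_pub_date("2025-01-06T12:00:00Z"))             # ISO 8601 handled by dateutil
    print(parse_pub_date(""))                                 # missing date falls back to "now"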
@@ -1,16 +1,28 @@
 import json
 import os
 import logging
 import boto3
 from decimal import Decimal
 from pymongo import MongoClient
 from datetime import datetime
+import redis

 logger = logging.getLogger()
-logger.setLevel("INFO")
+logger.setLevel(logging.INFO)

+# For AWS deployment - SQS
+try:
+    sqs = boto3.client('sqs')
+    SQS_QUEUE_URL = os.environ.get('SQS_QUEUE_URL', '')
+    AWS_DEPLOYMENT = bool(SQS_QUEUE_URL)
+except Exception:
+    AWS_DEPLOYMENT = False
+
+# For local deployment - Redis
+if not AWS_DEPLOYMENT:
+    redis_client = redis.Redis.from_url(os.environ.get('REDIS_URL', 'redis://localhost:6379'))
+    REDIS_QUEUE_NAME = os.environ.get('REDIS_QUEUE_NAME', 'rss-feed-queue')

-SQS_QUEUE_URL = os.environ['SQS_QUEUE_URL']
 MONGODB_URL = os.environ['MONGODB_URL']
 MONGODB_DB_NAME = os.environ['MONGODB_DB_NAME']
 MONGODB_COLLECTION_NAME = os.environ.get('MONGODB_COLLECTION_NAME', 'rss_feeds')
@@ -40,19 +52,25 @@ def handler(event, context):
            'u': rss_url,
            'dt': rss_dt
        }
-        logger.debug("message", message)
+        logger.debug(f"Message: {message}")

+        try:
+            if AWS_DEPLOYMENT:
+                # Send to SQS for AWS deployment
                sqs.send_message(
                    QueueUrl=SQS_QUEUE_URL,
                    MessageBody=json.dumps(message, cls=DecimalEncoder)
                )
+            else:
+                # Send to Redis for local deployment
+                redis_client.lpush(REDIS_QUEUE_NAME, json.dumps(message, cls=DecimalEncoder))
            messages_sent += 1
        except Exception as e:
-            logger.error(f"Error sending message to SQS: {str(e)}")
+            logger.error(f"Error sending message to queue: {str(e)}")

-    logger.info(f"Sent {messages_sent} messages to SQS at {datetime.now().isoformat()}")
+    logger.info(f"Sent {messages_sent} messages to queue at {datetime.now().isoformat()}")

    return {
        "statusCode": 200,
-        "body": json.dumps(f"Sent {messages_sent} RSS URLs to Redis"),
+        "body": json.dumps(f"Sent {messages_sent} RSS URLs to queue"),
    }
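
Note: in local mode the handler only pushes messages onto a Redis list; something on the worker side has to pop them again. A minimal sketch of that consumer, assuming the queue name and the {'u': ..., 'dt': ...} message shape produced above (the repo's actual worker wiring may differ):

    import json
    import os
    import redis

    redis_client = redis.Redis.from_url(os.environ.get('REDIS_URL', 'redis://localhost:6379'))
    REDIS_QUEUE_NAME = os.environ.get('REDIS_QUEUE_NAME', 'rss-feed-queue')

    def pop_feed(timeout: int = 5):
        """Block up to `timeout` seconds for the next {'u': url, 'dt': last_date} message."""
        item = redis_client.brpop(REDIS_QUEUE_NAME, timeout=timeout)
        if item is None:
            return None
        _key, raw = item
        return json.loads(raw)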
@@ -3,28 +3,30 @@ import sys
 import time
 import logging

-# Ensure project root is in the Python path so imports work when executed
-CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-PROJECT_ROOT = os.path.abspath(os.path.join(CURRENT_DIR, "..", ".."))
+# Add the project root to Python path
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 if PROJECT_ROOT not in sys.path:
     sys.path.insert(0, PROJECT_ROOT)

 from src.infra.lambdas.RSSFeedProcessorLambda.src.lambda_function import lambda_handler

-logging.basicConfig(level=os.environ.get("LOG_LEVEL", "INFO"))
+logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
 logger = logging.getLogger(__name__)

+SLEEP_SECONDS = int(os.getenv("WORKER_SLEEP_SECONDS", "5"))

-def main() -> None:
-    """Continuously run the existing Lambda handler as a local worker."""
-    logger.info("Starting local RSS worker")
+def main():
+    logger.info("Starting worker loop")
     while True:
         try:
-            lambda_handler({}, None)
+            result = lambda_handler({}, None)
+            if result.get('statusCode') == 200:
+                logger.debug("Worker iteration completed successfully")
+            else:
+                logger.warning(f"Worker iteration returned non-200 status: {result}")
         except Exception as exc:
-            logger.error("Worker iteration failed", exc_info=exc)
-            time.sleep(1)
+            logger.exception("Worker iteration failed: %s", exc)
+        time.sleep(SLEEP_SECONDS)

 if __name__ == "__main__":
     main()