finished:

- eventbridge
- sqs
- lambda bugs
Charles E. Gormley
2024-09-07 20:37:58 -04:00
parent 6b0862a871
commit ba6f5752e7
10 changed files with 130 additions and 36 deletions

View File

@@ -0,0 +1,15 @@
+Parameters:
+  LambdaFunctionArn:
+    Type: String
+    Description: ARN of the RSS Feed Processor Lambda function
+
+Resources:
+  EventBridgeSchedule:
+    Type: AWS::Events::Rule
+    Properties:
+      Name: rss-feed-processor-schedule
+      Description: Runs the RSS Feed Processor Lambda function every hour
+      ScheduleExpression: rate(1 hour) # TODO: Turn this into a variable.
+      Targets:
+        - Arn: !Ref LambdaFunctionArn
+          Id: rss-feed-processor-lambda
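
The TODO on ScheduleExpression is easy to close once the template is driven from the deploy script: declare a ScheduleExpression parameter in eventbridge.yaml and pass it through. A minimal sketch, assuming the deploy_cloudformation helper used later in this commit and a hypothetical SCHEDULE_EXPRESSION environment variable:

import os

# Sketch only: assumes eventbridge.yaml declares a ScheduleExpression
# parameter (Type: String, Default: rate(1 hour)).
deploy_cloudformation('eventbridge.yaml', 'EventBridgeSchedule',
    parameters=[
        {'ParameterKey': 'LambdaFunctionArn',
         'ParameterValue': lambda_function_arn},  # hypothetical variable holding the ARN
        {'ParameterKey': 'ScheduleExpression',
         'ParameterValue': os.getenv('SCHEDULE_EXPRESSION', 'rate(1 hour)')}
    ])

Note that the rule as written also needs a resource-based permission on the function (AWS::Lambda::Permission with Principal events.amazonaws.com) before EventBridge can actually invoke the target.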

View File

@@ -11,7 +11,8 @@ Resources:
     Type: AWS::SQS::Queue
     Properties:
       QueueName: !Ref SQSQueueName
-      VisibilityTimeout: 300
+      VisibilityTimeout: 300 # Should be set to roughly the mean plus three standard deviations of your Lambda runtime distribution.
+      ReceiveMessageWaitTimeSeconds: 20
       RedrivePolicy:
         deadLetterTargetArn: !GetAtt RSSFeedDLQ.Arn
         maxReceiveCount: 3
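
On the visibility-timeout comment: the idea is that a message should stay invisible long enough for a slow-but-successful invocation to finish, so mean plus three standard deviations is a reasonable heavy-tail cutoff. A minimal sketch, assuming you have recent runtime samples in seconds (for example from the function's CloudWatch Duration metric):

import math
import statistics

def suggest_visibility_timeout(runtimes_s):
    # Visibility timeout (seconds) covering roughly mean + 3 standard deviations.
    return math.ceil(statistics.mean(runtimes_s) + 3 * statistics.stdev(runtimes_s))

# suggest_visibility_timeout([12.0, 15.2, 11.4, 40.1, 13.7]) -> 56

AWS's guidance for SQS-triggered Lambdas is blunter: make the queue's visibility timeout a multiple of the function timeout (the docs have suggested six times it), so whichever bound is larger should win.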

View File

@@ -176,6 +176,14 @@ def deploy_infrastructure():
         ]
     )
+    deploy_cloudformation('eventbridge.yaml', 'EventBridgeSchedule',
+        parameters=[
+            {
+                'ParameterKey': 'LambdaFunctionArn',
+                'ParameterValue': f"arn:aws:lambda:{os.getenv('AWS_REGION')}:{os.getenv('AWS_ACCOUNT_ID')}:function:{os.getenv('QUEUE_FILLER_LAMBDA_NAME')}"
+            }
+        ])
     # TODO: Figure out KMS Stuff, but for now just do it in the console. I would like to get the rest of the cloudformation working
     # before I start messing with KMS keys.
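
deploy_cloudformation itself is not part of this diff. For context, a minimal create-or-update helper along the lines of what it presumably does (the waiters and error handling here are assumptions, not the repo's implementation):

import boto3
from botocore.exceptions import ClientError

def deploy_cloudformation(template_file, stack_name, parameters=None):
    cf = boto3.client('cloudformation')
    with open(template_file) as f:
        kwargs = {
            'StackName': stack_name,
            'TemplateBody': f.read(),
            'Parameters': parameters or []
        }
    try:
        # Create the stack, falling back to an update if it already exists.
        cf.create_stack(**kwargs)
        cf.get_waiter('stack_create_complete').wait(StackName=stack_name)
    except ClientError as e:
        if e.response['Error']['Code'] != 'AlreadyExistsException':
            raise
        try:
            cf.update_stack(**kwargs)
            cf.get_waiter('stack_update_complete').wait(StackName=stack_name)
        except ClientError as ue:
            # CloudFormation treats a no-op update as an error; ignore that case.
            if 'No updates are to be performed' not in str(ue):
                raise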

View File

@@ -76,6 +76,28 @@ def update_function_configuration(lambda_client, function_name, handler, role, t
         logging.info(f"Function {function_name} is currently being updated. Retrying...")
         raise e
+
+@retry_with_backoff()
+def configure_sqs_trigger(lambda_client, function_name, queue_arn):
+    event_source_mapping = {
+        'FunctionName': function_name,
+        'EventSourceArn': queue_arn,
+        'BatchSize': 1,
+        'MaximumBatchingWindowInSeconds': 0,
+        'ScalingConfig': {
+            'MaximumConcurrency': 50
+        }
+    }
+    try:
+        response = lambda_client.create_event_source_mapping(**event_source_mapping)
+        print(f"SQS trigger configured successfully for {function_name}")
+    except ClientError as e:
+        if e.response['Error']['Code'] == 'ResourceConflictException':
+            print(f"SQS trigger already exists for {function_name}. Updating configuration...")
+            # If you want to update an existing trigger, you'd need to list the existing mappings and update them.
+            # This is left as an exercise as it requires additional error handling and logic.
+        else:
+            raise e

 @retry_with_backoff()
 def create_function(lambda_client, function_name, runtime, role, handler, zip_file, timeout, memory, layers, kms_key_id):
@@ -162,6 +184,13 @@ def deploy_lambda():
print(f"Lambda function '{LAMBDA_NAME}' not found. Creating new function...")
create_function(lambda_client, LAMBDA_NAME, LAMBDA_RUNTIME, LAMBDA_ROLE_ARN, LAMBDA_HANDLER, deployment_package, LAMBDA_TIMEOUT, LAMBDA_MEMORY, layers, kms_key_id)
# Configure SQS trigger
queue_arn = os.getenv('SQS_QUEUE_ARN') # Make sure to set this environment variable
if queue_arn:
configure_sqs_trigger(lambda_client, LAMBDA_NAME, queue_arn)
else:
print("Warning: SQS_QUEUE_ARN not set. Skipping SQS trigger configuration.")
print("Lambda deployment completed successfully!")
except Exception as e:
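
The update path that configure_sqs_trigger leaves as an exercise is small enough to sketch: on ResourceConflictException, look up the existing mapping for this function/queue pair and re-assert the desired settings (a sketch, not the repo's code):

def update_sqs_trigger(lambda_client, function_name, queue_arn):
    # Find the mapping(s) already connecting this queue to this function.
    mappings = lambda_client.list_event_source_mappings(
        FunctionName=function_name,
        EventSourceArn=queue_arn
    )['EventSourceMappings']
    for mapping in mappings:
        lambda_client.update_event_source_mapping(
            UUID=mapping['UUID'],
            BatchSize=1,
            MaximumBatchingWindowInSeconds=0,
            ScalingConfig={'MaximumConcurrency': 50}
        )

update_event_source_mapping can raise ResourceInUseException while a previous change is still applying, which is exactly the kind of transient failure the file's @retry_with_backoff decorator is there to absorb.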

View File

@@ -3,11 +3,6 @@ import os
 # SQS Configuration
 SQS_QUEUE_URL = os.environ['SQS_QUEUE_URL']

-# S3 Configuration
-CONTENT_BUCKET = os.environ['CONTENT_BUCKET']
-# DynamoDB Configuration
-DYNAMODB_TABLE = os.environ['DYNAMODB_TABLE']

 # Logging Configuration
 LOG_LEVEL = os.environ.get('LOG_LEVEL', 'INFO')

View File

@@ -11,7 +11,7 @@ logger = logging.getLogger()
 s3 = boto3.client('s3')
 dynamodb = boto3.resource('dynamodb')

-CONTENT_BUCKET = os.getenv("S3_BUCKET_NAME")
+CONTENT_BUCKET = os.getenv("S3_BUCKET_NAME", os.getenv("CONTENT_BUCKET"))
 DYNAMODB_TABLE = os.getenv("DYNAMODB_TABLE_NAME")
 storage_strategy = os.environ.get('STORAGE_STRATEGY')
@@ -30,20 +30,29 @@ def dynamodb_save_article(article:dict):
     pass

 def s3_save_article(article:dict):
     logger.info("Saving article to S3")
     rss_feed_id = article['rss_id']
     article_id = article['article_id']
+    logger.info(f"Content ")
+    if not rss_feed_id or not article_id:
+        logger.error(f"Missing rss_id or article_id in article: {article}")
+        return
+    file_path = f"/tmp/{rss_feed_id}-{article_id}-article.json"
+    file_key = f"articles/{rss_feed_id}/{article_id}/article.json"
+    # Save article to /tmp json file
+    with open(file_path, "w") as f:
+        json.dump(article, f)
     try:
-        key = f"articles/{rss_feed_id}/{article_id}/article.json"
-        s3.put_object(
-            Bucket=CONTENT_BUCKET,
-            Key=key,
-            Body=json.dumps(article)
-        )
-        logger.info(f"Saved article to S3: {key}")
+        s3.upload_file(file_path, CONTENT_BUCKET, file_key)
+        logger.info(f"Saved article {article_id} to S3 bucket {CONTENT_BUCKET}")
     except Exception as e:
-        logger.error(f"Failed to save article: {str(e)}")
+        logger.error(f"Failed to save article with error: {str(e)}. \n Article: {article} \n Article Type: {type(article)}")

 ###### Feed Storage ######
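
dynamodb_save_article above is still a pass stub. Given the article dict assembled in feed_processor, a plausible fill-in (the table's key schema is an assumption; it isn't shown in this commit):

def dynamodb_save_article(article: dict):
    logger.info("Saving article to DynamoDB")
    table = dynamodb.Table(DYNAMODB_TABLE)
    # Assumes the table is keyed on rss_id (partition) and article_id (sort).
    table.put_item(Item=article)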

View File

@@ -12,7 +12,7 @@ logger = logging.getLogger()
 def process_feed(feed: dict):
     output_queue = queue.Queue()
     stop_thread = threading.Event()
-    thread = threading.Thread(target=extract_feed, args=(feed, output_queue, stop_thread))
+    thread = threading.Thread(target=extract_feed_threading, args=(feed, output_queue, stop_thread))
     thread.daemon = True
     thread.start()
@@ -32,7 +32,7 @@ def process_feed(feed: dict):
logger.info(f"Thread Failed: {feed['u']}")
return None
def extract_feed(rss: dict, output_queue, stop_thread):
def extract_feed_threading(rss: dict, output_queue, stop_thread):
articles = []
feed_url = rss['u']
last_date = rss['dt']
@@ -72,6 +72,44 @@ def extract_feed(rss: dict, output_queue, stop_thread):
logger.error(f"Feed: {entry}")
logger.error(f"Feed failed due to error: {e}")
def extract_feed(rss: dict):
articles = []
feed_url = rss['u']
last_date = rss['dt']
max_date = last_date
try:
feed = feedparser.parse(feed_url)
for entry in feed['entries']:
pub_date = parse_pub_date(entry['published'])
if pub_date > last_date:
title, text = extract_article(entry.link)
article = {
'link': entry.link,
'rss': feed_url,
'title': title,
'content': text,
'unixTime': pub_date,
'rss_id': generate_key(feed_url),
'article_id': generate_key(entry.link),
'llm_summary': None,
'embedding': None
}
articles.append(article)
max_date = max(max_date, pub_date)
output = {
'articles': articles,
'max_date': max_date,
'feed': rss
}
print(output)
return output
except Exception as e:
logger.error(f"Feed: {entry}")
logger.error(f"Feed failed due to error: {e}")
def parse_pub_date(date_string):
try:
return int(datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %z").timestamp())
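
For reference, the feed dicts these functions consume carry the feed URL under 'u' and the last-seen publish time (unix seconds) under 'dt', so a direct call looks like this (values hypothetical):

feed = {'u': 'https://example.com/rss.xml', 'dt': 1725753600}
result = extract_feed(feed)
if result:
    print(f"{len(result['articles'])} new articles, newest at {result['max_date']}")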

View File

@@ -8,6 +8,7 @@ from exceptions import RSSProcessingError, ArticleExtractionError, DataStorageEr
 from metrics import record_processed_articles, record_processing_time, record_extraction_errors
 import boto3
 import os
+from feed_processor import extract_feed

 # Set up logging
 logger = setup_logging()
@@ -21,8 +22,6 @@ def lambda_handler(event, context):
logger.info("Starting RSS feed processing")
start_time = time.time()
try:
# Receive message from SQS
event_source = event["Records"][0]["eventSource"]
@@ -36,7 +35,8 @@ def lambda_handler(event, context):
         receipt_handle = event["Records"][0]['receiptHandle']

         # Process the feed
-        result = process_feed(feed)
+        result = extract_feed(feed)
+        print(type(result))
         logger.info(f"Process Feed Result Dictionary: {result}")
         last_pub_dt = result['max_date']
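
A hand-built, single-record SQS event is enough to exercise this path locally. The body shape here is an assumption, since the lines that parse the feed out of the record are elided from this diff:

import json

test_event = {
    "Records": [{
        "eventSource": "aws:sqs",
        "receiptHandle": "local-test-handle",
        "body": json.dumps({"u": "https://example.com/rss.xml", "dt": 1725753600})
    }]
}
lambda_handler(test_event, None)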

View File

@@ -11,8 +11,8 @@ def update_env_vars(function_name):
     env_vars = {
         'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
-        'CONTENT_BUCKET': os.environ.get('S3_BUCKET_NAME'),
-        'DYNAMODB_TABLE': os.environ.get('DYNAMODB_TABLE_NAME'),
+        'S3_BUCKET_NAME': os.environ.get('S3_BUCKET_NAME'),
+        'DYNAMODB_TABLE_NAME': os.environ.get('DYNAMODB_TABLE_NAME'),
         'LOG_LEVEL': os.environ.get('LOG_LEVEL', 'INFO'),
         'STORAGE_STRATEGY': os.environ.get('STORAGE_STRATEGY')
     }
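
The rest of update_env_vars is not shown, but applying the dict is a single call; the tail of the function presumably does something like:

lambda_client = boto3.client('lambda')
lambda_client.update_function_configuration(
    FunctionName=function_name,
    Environment={'Variables': env_vars}
)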

todo.md
View File

@@ -1,15 +1,13 @@
+# TODO: Fix nonetype error.
+# TODO: Try to fix forbidden url issue.
-# Current Sub-Modules
 * TODO: Make sure SQS queue can't go over the concurrency limit of the account.
-* TODO: Automate eventbridge.
-* TODO: Eventbridge set up ( make sure this works )
-* TODO More RSS Feed Module
-* TODO: Add in console setup python script for new project into launch.py
+# TODO: Add in console setup python script for new project into launch.py
+# TODO: Eventbridge set up ( make sure this works )
+# TODO: Automate eventbridge.
+# TODO: Make sure ingestion of articles actually works

 # Modules
 * More RSS Feed Module
-* Update Lambda Layer Creation Script to be comprehensive.
+* Update Lambda Layer Creation Script to be more comprehensive.
# Future Modules
* Gen AI Summarization Module
@@ -17,6 +15,7 @@
 * Duplicate Article Check Module.
 * Semantic Storage Module
 * API Module ( Semantic Search, Retrieval )
+* Architecture Diagram
# Future Use Cases
* Betting Market Prediction
@@ -24,6 +23,6 @@
* News Aggregation
* Newsletter Tooling
# Over-caffeinated Ideas
* Make it solarpunk themed.
* Write a serverless manifesto for personal projects and where you would like to see the serverless world go.