This commit is contained in:
Charles-Gormley
2024-08-25 09:11:49 -04:00
parent a31e5fdfe0
commit 049cb6a6b2
37 changed files with 1117 additions and 1 deletions

3
.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
repo_structure.txt
.env
/layer/python*

128
README.md
View File

@@ -1 +1,127 @@
# OpenRSS Feed Processor
OpenRSS is an AWS-based RSS feed processing system that automatically fetches, processes, and stores articles from specified RSS feeds.
## Project Structure
```
OpenRSS/
├── src/
│ ├── infra/
│ │ ├── cloudformation/
│ │ │ ├── s3.yaml
│ │ │ ├── dynamo.yaml
│ │ │ └── sqs.yaml
│ │ └── deploy_infrastructure.py
│ ├── lambda_function/
│ │ ├── src/
│ │ │ ├── lambda_function.py
│ │ │ ├── feed_processor.py
│ │ │ ├── article_extractor.py
│ │ │ ├── data_storage.py
│ │ │ ├── utils.py
│ │ │ ├── config.py
│ │ │ ├── exceptions.py
│ │ │ └── metrics.py
│ │ ├── tests/
│ │ │ └── test_lambda_function.py
│ │ ├── layers/
│ │ │ └── requirements.txt
│ │ ├── deploy_lambda.py
│ │ └── update_env_vars.py
│ └── utils/
│       ├── create_lambda_layer.py
│       ├── retry_logic.py
│       └── upload_rss_feeds.py
├── launch.py
├── rss_feeds.json
├── requirements.txt
└── README.md
```
## Prerequisites
- Python 3.10+ (the Lambda function and layer are built for the `python3.10` runtime)
- AWS CLI configured with appropriate permissions
- An AWS account with necessary services (S3, DynamoDB, SQS, Lambda) enabled
## Setup
1. Clone the repository:
```
git clone https://github.com/yourusername/OpenRSS.git
cd OpenRSS
```
2. Install the required dependencies:
```
pip install -r requirements.txt
```
3. Create a `.env` file in the root directory with the following content:
```
AWS_ACCESS_KEY_ID=your_access_key_here
AWS_SECRET_ACCESS_KEY=your_secret_key_here
AWS_REGION=us-east-1
```
4. Update the `rss_feeds.json` file with the RSS feeds you want to process.
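Each entry in `rss_feeds.json` uses a compact schema: `u` is the feed URL and `dt` is the Unix timestamp of the most recently processed article (`0` for a feed that has never been processed). As a sketch, a new feed can be appended from Python like this (the URL below is just a placeholder):
```
import json

# Load the existing feed list, append a new entry, and write it back.
# The "u"/"dt" keys mirror the entries shipped in rss_feeds.json.
with open("rss_feeds.json") as f:
    feeds = json.load(f)

feeds.append({"u": "https://example.com/feed.xml", "dt": 0})

with open("rss_feeds.json", "w") as f:
    json.dump(feeds, f, indent=2)
```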
## Usage
To deploy the infrastructure and start the RSS feed processor:
```
python launch.py
```
This script will:
1. Deploy the necessary AWS infrastructure (S3, DynamoDB, SQS) using CloudFormation.
2. Create and upload the Lambda layer.
3. Deploy the Lambda function.
4. Upload the RSS feeds to DynamoDB.
5. Trigger an initial execution of the Lambda function.
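The deploy script registers the function as `RSSFeedProcessor` (see `src/lambda_function/deploy_lambda.py`). If you want to kick off an extra run manually, a minimal boto3 sketch:
```
import boto3

# Invoke the deployed function once, asynchronously.
lambda_client = boto3.client("lambda")
response = lambda_client.invoke(
    FunctionName="RSSFeedProcessor",
    InvocationType="Event",  # use "RequestResponse" to wait for the result
)
print(response["StatusCode"])  # 202 for asynchronous invocations
```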
## Infrastructure
The project uses the following AWS services:
- S3: Stores processed articles
- DynamoDB: Stores RSS feed information and processing status
- SQS: Queues RSS feeds for processing
- Lambda: Processes RSS feeds and extracts articles
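Each service is deployed as its own CloudFormation stack named `rss-feed-processor-<suffix>` (see `src/infra/deploy_infrastructure.py`). A quick sketch to confirm the stacks reached a healthy state after `launch.py` finishes:
```
import boto3

# Print the status of each stack created by deploy_infrastructure.py.
cf = boto3.client("cloudformation")
for suffix in ("S3", "DynamoDB", "SQS", "Lambda"):
    stack_name = f"rss-feed-processor-{suffix}"
    stack = cf.describe_stacks(StackName=stack_name)["Stacks"][0]
    print(stack_name, stack["StackStatus"])
```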
## Lambda Function
The Lambda function (`src/lambda_function/src/lambda_function.py`) is triggered periodically to process RSS feeds. It:
1. Retrieves RSS feed information from DynamoDB
2. Fetches and parses the RSS feed
3. Extracts articles using the newspaper3k library
4. Stores processed articles in S3
5. Updates the feed's last processed timestamp in DynamoDB
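Processed articles land in S3 as JSON documents under keys of the form `articles/<unixTime>/<slug>.json`, each containing `link`, `rss`, `title`, `content`, and `unixTime` fields (see `data_storage.py` and `feed_processor.py`). A sketch for browsing recent output, assuming the bucket name is available as `S3_BUCKET_NAME` in your environment (as in `template.env`):
```
import json
import os

import boto3

# List a few stored articles and print their publication time and title.
s3 = boto3.client("s3")
bucket = os.environ["S3_BUCKET_NAME"]

listing = s3.list_objects_v2(Bucket=bucket, Prefix="articles/", MaxKeys=5)
for obj in listing.get("Contents", []):
    body = s3.get_object(Bucket=bucket, Key=obj["Key"])["Body"].read()
    article = json.loads(body)
    print(article["unixTime"], article["title"])
```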
## Customization
- To modify the CloudFormation templates, edit the YAML files in `src/infra/cloudformation/`.
- To change the Lambda function's behavior, modify the Python files in `src/lambda_function/src/`.
- To add or remove RSS feeds, update the `rss_feeds.json` file.
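Runtime tuning values such as `MAX_ARTICLES_PER_FEED`, `FEED_PROCESSING_TIMEOUT`, and `LOG_LEVEL` are read from environment variables in `src/lambda_function/src/config.py`, so they can be changed without redeploying code. A sketch that merges an override into the function's existing environment (note that `update_function_configuration` replaces the whole variable map, so fetch and merge first; the function name assumes the default `RSSFeedProcessor`):
```
import boto3

# Fetch the current environment, merge in the override, and push it back.
lambda_client = boto3.client("lambda")
function_name = "RSSFeedProcessor"

current = lambda_client.get_function_configuration(FunctionName=function_name)
env_vars = current.get("Environment", {}).get("Variables", {})
env_vars["MAX_ARTICLES_PER_FEED"] = "25"

lambda_client.update_function_configuration(
    FunctionName=function_name,
    Environment={"Variables": env_vars},
)
```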
## Testing
To run the tests for the Lambda function:
```
python -m pytest src/lambda_function/tests/
```
## Monitoring
The Lambda function logs its activities to CloudWatch Logs. You can monitor the function's performance and any errors through the AWS CloudWatch console.
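The function also publishes custom metrics (`ProcessedArticles`, `ProcessingTime`, `ExtractionErrors`) to the `RSS/FeedProcessor` CloudWatch namespace via `metrics.py`. A sketch for pulling recent log lines and the article count from Python, assuming the standard `/aws/lambda/<function name>` log group convention:
```
from datetime import datetime, timedelta, timezone

import boto3

# Fetch the most recent log events for the function.
logs = boto3.client("logs")
events = logs.filter_log_events(
    logGroupName="/aws/lambda/RSSFeedProcessor",
    limit=20,
)
for event in events["events"]:
    print(event["message"].rstrip())

# Sum the ProcessedArticles metric over the last 24 hours.
cloudwatch = boto3.client("cloudwatch")
stats = cloudwatch.get_metric_statistics(
    Namespace="RSS/FeedProcessor",
    MetricName="ProcessedArticles",
    StartTime=datetime.now(timezone.utc) - timedelta(days=1),
    EndTime=datetime.now(timezone.utc),
    Period=3600,
    Statistics=["Sum"],
)
print(sum(dp["Sum"] for dp in stats["Datapoints"]))
```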
## Contributing
Contributions are welcome! Please feel free to submit a Pull Request.
## License
This project is licensed under the MIT License.

66
launch.py Normal file
View File

@@ -0,0 +1,66 @@
import os
import sys
import json
import boto3
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Set AWS credentials from environment variables
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = os.getenv('AWS_SECRET_ACCESS_KEY')
os.environ['AWS_DEFAULT_REGION'] = os.getenv('AWS_REGION')
TABLE_NAME = os.getenv('DYNAMODB_TABLE_NAME')
ACCOUNT_NUM = os.getenv("AWS_ACCOUNT_ID")
SQS_QUEUE_NAME = os.getenv("SQS_QUEUE_NAME")
REGION = os.getenv("AWS_REGION")
os.environ["SQS_QUEUE_URL"] = f"https://sqs.{REGION}.amazonaws.com/{ACCOUNT_NUM}/{SQS_QUEUE_NAME}"
lambda_client = boto3.client("lambda")
LAMBDA_FUNCTION_NAME = os.getenv("LAMBDA_FUNCTION_NAME")
# Add the src directory to the Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
from src.infra.deploy_infrastructure import deploy_infrastructure
from src.utils.create_lambda_layer import create_lambda_layer
from src.lambda_function.deploy_lambda import deploy_lambda
from src.lambda_function.update_lambda_env_vars import update_env_vars
from src.utils.upload_rss_feeds import upload_rss_feeds
def main():
# Deploy infrastructure
deploy_infrastructure()
# Create Lambda layer
create_lambda_layer()
print("Finished with Lambda Layer")
# Deploy Lambda function
deploy_lambda()
print("Finished Deploying Lambda")
# Update Lambda environment variables
update_env_vars(LAMBDA_FUNCTION_NAME)
print("Finished Environment Variable Updates")
# Upload RSS feeds
rss_feeds_file = os.path.join(current_dir, "rss_feeds.json")
if os.path.exists(rss_feeds_file):
with open(rss_feeds_file, 'r') as f:
rss_feeds = json.load(f)
upload_rss_feeds(rss_feeds, TABLE_NAME)
else:
print(f"WARNING: {rss_feeds_file} not found. Skipping RSS feed upload.")
print("RSS Feed Processor launched successfully!")
if __name__ == "__main__":
main()

0
requirements.txt Normal file
View File

42
rss_feeds.json Normal file
View File

@@ -0,0 +1,42 @@
[
{
"u": "http://rss.cnn.com/rss/cnn_topstories.rss",
"dt": 0
},
{
"u": "https://feeds.a.dj.com/rss/RSSWorldNews.xml",
"dt": 0
},
{
"u": "http://feeds.bbci.co.uk/news/world/rss.xml",
"dt": 0
},
{
"u": "https://feeds.npr.org/1001/rss.xml",
"dt": 0
},
{
"u": "https://www.reddit.com/r/news/.rss",
"dt": 0
},
{
"u": "https://news.ycombinator.com/rss",
"dt": 0
},
{
"u": "https://techcrunch.com/feed/",
"dt": 0
},
{
"u": "https://www.wired.com/feed/rss",
"dt": 0
},
{
"u": "https://www.sciencedaily.com/rss/all.xml",
"dt": 0
},
{
"u": "https://www.nasa.gov/rss/dyn/breaking_news.rss",
"dt": 0
}
]

View File

@@ -0,0 +1,27 @@
AWSTemplateFormatVersion: '2010-09-09'
Description: 'CloudFormation template for RSS Feed Processor DynamoDB Table'
Parameters:
DynamoDBName:
Type: String
Description: "Name of the DynamoDB table for RSS feeds"
Resources:
RSSFeedsTable:
Type: AWS::DynamoDB::Table
Properties:
TableName: !Ref 'DynamoDBName'
AttributeDefinitions:
- AttributeName: url
AttributeType: S
KeySchema:
- AttributeName: url
KeyType: HASH
BillingMode: PAY_PER_REQUEST
Outputs:
TableName:
Description: 'Name of the DynamoDB table for RSS feeds'
Value: !Ref RSSFeedsTable
Export:
Name: !Sub '${AWS::StackName}-RSSFeedsTableName'

View File

@@ -0,0 +1,48 @@
AWSTemplateFormatVersion: '2010-09-09'
Description: 'IAM Role for RSS Feed Processor Lambda Function with Broad Permissions'
Parameters:
LambdaExecutionRoleName:
Type: String
Description: "Name of the Lambda Execution Role"
Resources:
LambdaExecutionRole:
Type: 'AWS::IAM::Role'
Properties:
RoleName: !Ref LambdaExecutionRoleName
AssumeRolePolicyDocument:
Version: '2012-10-17'
Statement:
- Effect: Allow
Principal:
Service:
- lambda.amazonaws.com
Action:
- 'sts:AssumeRole'
ManagedPolicyArns:
- 'arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole'
Policies:
- PolicyName: 'RSSFeedProcessorLambdaBroadPolicy'
PolicyDocument:
Version: '2012-10-17'
Statement:
- Effect: Allow
Action:
- 'sqs:*'
- 'dynamodb:*'
- 's3:*'
- 'lambda:*'
- 'logs:*'
- 'xray:*'
- 'cloudwatch:*'
- 'events:*'
- 'kms:Decrypt'
Resource: '*'
Outputs:
LambdaRoleArn:
Description: 'ARN of the Lambda Execution Role'
Value: !GetAtt LambdaExecutionRole.Arn
Export:
Name: !Sub '${AWS::StackName}-LambdaRoleArn'

View File

@@ -0,0 +1,26 @@
AWSTemplateFormatVersion: '2010-09-09'
Description: 'CloudFormation template for RSS Feed Processor S3 Bucket'
Parameters:
BucketName:
Type: String
Description: "Name of the S3 bucket for article content"
Resources:
ArticleContentBucket:
Type: AWS::S3::Bucket
Properties:
BucketName: !Ref BucketName
VersioningConfiguration:
Status: Enabled
BucketEncryption:
ServerSideEncryptionConfiguration:
- ServerSideEncryptionByDefault:
SSEAlgorithm: AES256
Outputs:
BucketName:
Description: 'Name of the S3 bucket for article content'
Value: !Ref ArticleContentBucket
Export:
Name: !Sub '${AWS::StackName}-ArticleContentBucketName'

View File

@@ -0,0 +1,35 @@
AWSTemplateFormatVersion: '2010-09-09'
Description: 'CloudFormation template for RSS Feed Processor SQS Queue'
Parameters:
SQSQueueName:
Type: String
Description: "Name of the SQS queue for RSS feeds"
Resources:
RSSFeedQueue:
Type: AWS::SQS::Queue
Properties:
QueueName: !Ref SQSQueueName
VisibilityTimeout: 300
RedrivePolicy:
deadLetterTargetArn: !GetAtt RSSFeedDLQ.Arn
maxReceiveCount: 3
RSSFeedDLQ:
Type: AWS::SQS::Queue
Properties:
QueueName: !Sub '${AWS::StackName}-rss-feed-dlq'
Outputs:
QueueURL:
Description: 'URL of the SQS queue for RSS feeds'
Value: !Ref RSSFeedQueue
Export:
Name: !Sub '${AWS::StackName}-RSSFeedQueueURL'
DLQueueURL:
Description: 'URL of the Dead Letter Queue for RSS feeds'
Value: !Ref RSSFeedDLQ
Export:
Name: !Sub '${AWS::StackName}-RSSFeedDLQueueURL'

View File

@@ -0,0 +1,95 @@
import boto3
import os
from botocore.exceptions import ClientError
def deploy_cloudformation(template_file, stack_suffix, force_recreate=False, parameters=[]):
cf_client = boto3.client('cloudformation')
stack_name = f"rss-feed-processor-{stack_suffix}"
with open(f'src/infra/cloudformation/{template_file}', 'r') as file:
template_body = file.read()
print(f"Template contents:\n{template_body}")
capabilities = ['CAPABILITY_NAMED_IAM']
try:
if force_recreate:
try:
print(f"Deleting stack {stack_name} for recreation...")
cf_client.delete_stack(StackName=stack_name)
waiter = cf_client.get_waiter('stack_delete_complete')
waiter.wait(StackName=stack_name)
print(f"Stack {stack_name} deleted successfully.")
except ClientError:
print(f"Stack {stack_name} does not exist or is already deleted.")
try:
stack = cf_client.describe_stacks(StackName=stack_name)['Stacks'][0]
print(f"Updating stack {stack_name}...")
cf_client.update_stack(
StackName=stack_name,
TemplateBody=template_body,
Capabilities=capabilities,
Parameters=parameters # Add parameters here
)
waiter = cf_client.get_waiter('stack_update_complete')
waiter.wait(StackName=stack_name)
print(f"Stack {stack_name} updated successfully.")
except ClientError as e:
if 'does not exist' in str(e):
print(f"Creating stack {stack_name}...")
cf_client.create_stack(
StackName=stack_name,
TemplateBody=template_body,
Capabilities=capabilities,
Parameters=parameters # Add parameters here
)
waiter = cf_client.get_waiter('stack_create_complete')
waiter.wait(StackName=stack_name)
print(f"Stack {stack_name} created successfully.")
elif 'No updates are to be performed' in str(e):
print(f"No updates needed for stack {stack_name}.")
else:
raise
except ClientError as e:
print(f"Error handling stack {stack_name}: {str(e)}")
raise
def deploy_infrastructure():
deploy_cloudformation('s3.yaml', 'S3',
parameters=[
{
'ParameterKey': 'BucketName',
'ParameterValue': os.environ.get('S3_BUCKET_NAME', 'default-role-name')
}
])
deploy_cloudformation('dynamo.yaml', 'DynamoDB',
parameters=[
{
'ParameterKey': 'DynamoDBName',
'ParameterValue': os.environ.get('DYNAMODB_TABLE_NAME', 'default-role-name')
}
])
deploy_cloudformation('sqs.yaml', 'SQS',
parameters=[
{
'ParameterKey': 'SQSQueueName',
'ParameterValue': os.environ.get('SQS_QUEUE_NAME', 'default-role-name')
}
])
deploy_cloudformation('lambda_role.yaml', 'Lambda', force_recreate=True,
parameters=[
{
'ParameterKey': 'LambdaExecutionRoleName',
'ParameterValue': os.environ.get('LAMBDA_EXECUTION_ROLE_NAME', 'default-role-name')
}
])
if __name__ == "__main__":
deploy_infrastructure()

View File

@@ -0,0 +1,90 @@
import boto3
import os
import zipfile
import io
from botocore.exceptions import ClientError
from src.utils.retry_logic import retry_with_backoff
# Set variables
LAMBDA_NAME = "RSSFeedProcessor"
LAMBDA_HANDLER = "lambda_function.lambda_handler"
ACCOUNT_NUM = os.getenv('AWS_ACCOUNT_ID')
LAMBDA_ROLE_NAME = os.getenv('LAMBDA_EXECUTION_ROLE_NAME')
LAMBDA_ROLE_ARN = f"arn:aws:iam::{ACCOUNT_NUM}:role/{LAMBDA_ROLE_NAME}"
LAMBDA_TIMEOUT = 300
LAMBDA_MEMORY = 256
LAMBDA_RUNTIME = "python3.10"
def zip_directory(path):
print(f"Creating deployment package from {path}...")
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, 'a', zipfile.ZIP_DEFLATED, False) as zip_file:
for root, _, files in os.walk(path):
for file in files:
file_path = os.path.join(root, file)
arcname = os.path.relpath(file_path, path)
zip_file.write(file_path, arcname)
return zip_buffer.getvalue()
@retry_with_backoff()
def update_function_code(lambda_client, function_name, zip_file):
return lambda_client.update_function_code(
FunctionName=function_name,
ZipFile=zip_file
)
@retry_with_backoff()
def update_function_configuration(lambda_client, function_name, handler, role, timeout, memory):
return lambda_client.update_function_configuration(
FunctionName=function_name,
Handler=handler,
Role=role,
Timeout=timeout,
MemorySize=memory
)
@retry_with_backoff()
def create_function(lambda_client, function_name, runtime, role, handler, zip_file, timeout, memory):
return lambda_client.create_function(
FunctionName=function_name,
Runtime=runtime,
Role=role,
Handler=handler,
Code={'ZipFile': zip_file},
Timeout=timeout,
MemorySize=memory
)
def deploy_lambda():
lambda_client = boto3.client('lambda')
print(f"Starting deployment of Lambda function: {LAMBDA_NAME}")
deployment_package = zip_directory('src/lambda_function/src')
try:
# Check if the function exists
try:
lambda_client.get_function(FunctionName=LAMBDA_NAME)
function_exists = True
except ClientError as e:
if e.response['Error']['Code'] == 'ResourceNotFoundException':
function_exists = False
else:
raise e
if function_exists:
print("Updating existing Lambda function...")
update_function_code(lambda_client, LAMBDA_NAME, deployment_package)
update_function_configuration(lambda_client, LAMBDA_NAME, LAMBDA_HANDLER, LAMBDA_ROLE_ARN, LAMBDA_TIMEOUT, LAMBDA_MEMORY)
else:
print(f"Lambda function '{LAMBDA_NAME}' not found. Creating new function...")
create_function(lambda_client, LAMBDA_NAME, LAMBDA_RUNTIME, LAMBDA_ROLE_ARN, LAMBDA_HANDLER, deployment_package, LAMBDA_TIMEOUT, LAMBDA_MEMORY)
print("Lambda deployment completed successfully!")
except Exception as e:
print(f"Error during Lambda deployment: {str(e)}")
raise
if __name__ == "__main__":
deploy_lambda()

View File

@@ -0,0 +1,5 @@
requests
newspaper3k
feedparser
python-dateutil
pandas

View File

@@ -0,0 +1,28 @@
import newspaper
import logging
logger = logging.getLogger()
def extract_article(url):
"""
Extracts the title and text of an article from the given URL.
Args:
url (str): The URL of the article.
Returns:
A tuple containing the title and text of the article, respectively.
"""
logger.debug(f"Starting Newspaper Article Extraction {url}")
config = newspaper.Config()
config.request_timeout = 60
article = newspaper.Article(url)
try:
article.download()
logger.debug(f"Downloaded Article {url}")
article.parse()
logger.debug(f"Parsed Article {url}")
return article.title, article.text
except Exception as e:
logger.error(f"Failed to extract article {url}: {str(e)}")
return None, None

View File

@@ -0,0 +1,20 @@
import os
# SQS Configuration
SQS_QUEUE_URL = os.environ['SQS_QUEUE_URL']
# S3 Configuration
CONTENT_BUCKET = os.environ['CONTENT_BUCKET']
# DynamoDB Configuration
DYNAMODB_TABLE = os.environ['DYNAMODB_TABLE']
# Logging Configuration
LOG_LEVEL = os.environ.get('LOG_LEVEL', 'INFO')
# RSS Feed Processing Configuration
MAX_ARTICLES_PER_FEED = int(os.environ.get('MAX_ARTICLES_PER_FEED', '10'))
FEED_PROCESSING_TIMEOUT = int(os.environ.get('FEED_PROCESSING_TIMEOUT', '90'))
# Article Extraction Configuration
ARTICLE_EXTRACTION_TIMEOUT = int(os.environ.get('ARTICLE_EXTRACTION_TIMEOUT', '30'))

View File

@@ -0,0 +1,42 @@
import boto3
import json
import os
import logging
logger = logging.getLogger()
s3 = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')
CONTENT_BUCKET = os.environ['CONTENT_BUCKET']
DYNAMODB_TABLE = os.environ['DYNAMODB_TABLE']
def save_article(article):
try:
# Save to S3
key = f"articles/{article['unixTime']}/{article['link'].split('/')[-1]}.json"
s3.put_object(
Bucket=CONTENT_BUCKET,
Key=key,
Body=json.dumps(article)
)
logger.info(f"Saved article to S3: {key}")
# Save to DynamoDB
table = dynamodb.Table(DYNAMODB_TABLE)
table.put_item(Item=article)
logger.info(f"Saved article to DynamoDB: {article['link']}")
except Exception as e:
logger.error(f"Failed to save article: {str(e)}")
def update_rss_feed(feed):
try:
table = dynamodb.Table(DYNAMODB_TABLE)
table.update_item(
Key={'u': feed['u']},
UpdateExpression='SET dt = :val',
ExpressionAttributeValues={':val': feed['dt']}
)
logger.info(f"Updated RSS feed in DynamoDB: {feed['u']}")
except Exception as e:
logger.error(f"Failed to update RSS feed: {str(e)}")

View File

@@ -0,0 +1,11 @@
class RSSProcessingError(Exception):
"""Exception raised for errors in the RSS processing."""
pass
class ArticleExtractionError(Exception):
"""Exception raised for errors in the article extraction."""
pass
class DataStorageError(Exception):
"""Exception raised for errors in data storage operations."""
pass

View File

@@ -0,0 +1,76 @@
import feedparser
from datetime import datetime
from dateutil import parser
import queue
import threading
import logging
from article_extractor import extract_article
logger = logging.getLogger()
def process_feed(feed: dict):
output_queue = queue.Queue()
stop_thread = threading.Event()
thread = threading.Thread(target=extract_feed, args=(feed, output_queue, stop_thread))
thread.daemon = True
thread.start()
logger.debug(f"Thread Started: {feed['u']}")
thread.join(timeout=90)
if thread.is_alive():
stop_thread.set()
logger.debug(f"Killing Thread: {feed['u']}")
return None
else:
try:
output = output_queue.get_nowait()
logger.info(f"Thread Succeeded: {feed['u']}")
return output
except queue.Empty:
logger.info(f"Thread Failed: {feed['u']}")
return None
def extract_feed(rss: dict, output_queue, stop_thread):
articles = []
feed_url = rss['u']
last_date = rss['dt']
max_date = last_date
try:
feed = feedparser.parse(feed_url)
for entry in feed['entries']:
if stop_thread.is_set():
break
pub_date = parse_pub_date(entry['published'])
if pub_date > last_date:
title, text = extract_article(entry.link)
article = {
'link': entry.link,
'rss': feed_url,
'title': title,
'content': text,
'unixTime': pub_date
}
articles.append(article)
max_date = max(max_date, pub_date)
output = {
'articles': articles,
'max_date': max_date,
'feed': rss
}
output_queue.put(output)
except Exception as e:
logger.error(f"Feed failed due to error: {e}")
def parse_pub_date(date_string):
try:
return int(datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %z").timestamp())
except ValueError:
try:
return int(datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ").timestamp())
except ValueError:
return int(parser.parse(date_string).timestamp())

View File

@@ -0,0 +1,78 @@
import json
import time
from feed_processor import process_feed
from data_storage import save_article, update_rss_feed
from utils import setup_logging
from config import SQS_QUEUE_URL
from exceptions import RSSProcessingError, ArticleExtractionError, DataStorageError
from metrics import record_processed_articles, record_processing_time, record_extraction_errors
import boto3
# Set up logging
logger = setup_logging()
# Initialize AWS clients
sqs = boto3.client('sqs')
def lambda_handler(event, context):
logger.info("Starting RSS feed processing")
start_time = time.time()
try:
# Receive message from SQS
response = sqs.receive_message(
QueueUrl=SQS_QUEUE_URL,
MaxNumberOfMessages=1,
WaitTimeSeconds=0
)
if 'Messages' not in response:
logger.info("No messages in queue")
return {'statusCode': 200, 'body': json.dumps('No RSS feeds to process')}
message = response['Messages'][0]
receipt_handle = message['ReceiptHandle']
feed = json.loads(message['Body'])
# Process the feed
result = process_feed(feed)
if result:
# Save articles and update feed
for article in result['articles']:
try:
save_article(article)
except DataStorageError as e:
logger.error(f"Failed to save article: {str(e)}")
record_extraction_errors(1)
update_rss_feed(result['feed'])
# Delete the message from the queue
sqs.delete_message(QueueUrl=SQS_QUEUE_URL, ReceiptHandle=receipt_handle)
logger.info(f"Processed feed: {feed['u']}")
# Record metrics
record_processed_articles(len(result['articles']))
else:
logger.warning(f"Failed to process feed: {feed['u']}")
record_extraction_errors(1)
except RSSProcessingError as e:
logger.error(f"RSS Processing Error: {str(e)}")
return {'statusCode': 500, 'body': json.dumps('RSS processing failed')}
except Exception as e:
logger.error(f"Unexpected error: {str(e)}")
return {'statusCode': 500, 'body': json.dumps('An unexpected error occurred')}
finally:
end_time = time.time()
processing_time = end_time - start_time
record_processing_time(processing_time)
logger.info(f"Lambda execution time: {processing_time:.2f} seconds")
return {
'statusCode': 200,
'body': json.dumps('RSS feed processed successfully')
}

View File

@@ -0,0 +1,26 @@
import boto3
import time
cloudwatch = boto3.client('cloudwatch')
def put_metric_data(metric_name, value, unit='Count'):
cloudwatch.put_metric_data(
Namespace='RSS/FeedProcessor',
MetricData=[
{
'MetricName': metric_name,
'Value': value,
'Unit': unit,
'Timestamp': time.time()
},
]
)
def record_processed_articles(count):
put_metric_data('ProcessedArticles', count)
def record_processing_time(duration):
put_metric_data('ProcessingTime', duration, 'Seconds')
def record_extraction_errors(count):
put_metric_data('ExtractionErrors', count)

View File

@@ -0,0 +1,8 @@
import logging
import os
def setup_logging():
logger = logging.getLogger()
log_level = os.environ.get('LOG_LEVEL', 'INFO')
logger.setLevel(logging.getLevelName(log_level))
return logger

View File

@@ -0,0 +1,22 @@
import boto3
import os
from src.utils.retry_logic import retry_with_backoff
# Set variables
LAMBDA_NAME = "RSSFeedProcessor"
@retry_with_backoff()
def update_env_vars(function_name):
lambda_client = boto3.client('lambda')
env_vars = {
'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
'CONTENT_BUCKET': os.environ.get('S3_BUCKET_NAME'),
'DYNAMODB_TABLE': os.environ.get('DYNAMODB_TABLE_NAME'),
'LOG_LEVEL': os.environ.get('LOG_LEVEL', 'INFO')
}
return lambda_client.update_function_configuration(
FunctionName=LAMBDA_NAME,
Environment={'Variables': env_vars}
)

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,87 @@
import boto3
import subprocess
import os
import shutil
from botocore.exceptions import ClientError
# Set variables
LAYER_NAME = "RSSFeedProcessorDependencies"
BUCKET_NAME = os.getenv("S3_LAYER_BUCKET_NAME")
REQUIREMENTS_FILE = "src/lambda_function/layers/requirements.txt"
ZIP_FILE = f"{LAYER_NAME}.zip"
def create_s3_bucket_if_not_exists(bucket_name, region=None):
s3_client = boto3.client('s3', region_name=region)
try:
# Check if the bucket exists
s3_client.head_bucket(Bucket=bucket_name)
print(f"Bucket '{bucket_name}' already exists.")
except ClientError as e:
error_code = e.response['Error']['Code']
if error_code == '404':
# Create the bucket
if region == 'us-east-1' or region is None:
# us-east-1 does not require LocationConstraint
s3_client.create_bucket(Bucket=bucket_name)
else:
# Other regions require LocationConstraint
s3_client.create_bucket(
Bucket=bucket_name,
CreateBucketConfiguration={
'LocationConstraint': region
}
)
print(f"Bucket '{bucket_name}' created.")
else:
# For any other errors, re-raise the exception
raise e
def create_lambda_layer():
# Create a temporary directory for the layer
os.makedirs("layer/python", exist_ok=True)
# Install dependencies
subprocess.check_call([
"pip", "install",
"-r", REQUIREMENTS_FILE,
"-t", "layer/python"
])
print("Finished Installing Packages")
# Create ZIP file
shutil.make_archive(LAYER_NAME, 'zip', "layer")
print("Finished Zipping Package")
# Create or update Lambda layer
lambda_client = boto3.client('lambda', region_name='us-east-1')
# Make sure the S3 bucket exists
create_s3_bucket_if_not_exists(BUCKET_NAME)
# Upload the zip file to S3
s3_client = boto3.client('s3')
s3_client.upload_file(ZIP_FILE, BUCKET_NAME, ZIP_FILE)
print(f"Uploaded {ZIP_FILE} to S3 bucket '{BUCKET_NAME}'.")
# Publish the layer using the S3 object
response = lambda_client.publish_layer_version(
LayerName=LAYER_NAME,
Description="Dependencies for RSS Feed Processor",
Content={
'S3Bucket': BUCKET_NAME,
'S3Key': ZIP_FILE
},
CompatibleRuntimes=['python3.10', 'python3.11']
)
print(f"Created Lambda layer version: {response['Version']}")
# Clean up
shutil.rmtree("layer")
os.remove(ZIP_FILE)
print("Lambda layer creation complete!")
if __name__ == "__main__":
create_lambda_layer()

View File

@@ -0,0 +1,34 @@
import boto3
from botocore.exceptions import ClientError
def create_s3_bucket_if_not_exists(bucket_name, region=None):
s3_client = boto3.client('s3', region_name=region)
try:
# Check if the bucket exists
s3_client.head_bucket(Bucket=bucket_name)
print(f"Bucket '{bucket_name}' already exists.")
except ClientError as e:
# If a 404 error is caught, it means the bucket does not exist
error_code = e.response['Error']['Code']
if error_code == '404':
# Create the bucket
if region is None:
s3_client.create_bucket(Bucket=bucket_name)
else:
s3_client.create_bucket(
Bucket=bucket_name,
CreateBucketConfiguration={
'LocationConstraint': region
}
)
print(f"Bucket '{bucket_name}' created.")
else:
# For any other errors, re-raise the exception
raise e
# Example usage
bucket_name = 'your-unique-bucket-name'
region = 'us-east-1' # Change this to your desired region
create_s3_bucket_if_not_exists(bucket_name, region)

27
src/utils/retry_logic.py Normal file
View File

@@ -0,0 +1,27 @@
import time
from botocore.exceptions import ClientError
def retry_with_backoff(max_retries=5, initial_backoff=1, backoff_multiplier=2):
def decorator(func):
def wrapper(*args, **kwargs):
retries = 0
backoff = initial_backoff
while retries < max_retries:
try:
return func(*args, **kwargs)
except ClientError as e:
if e.response['Error']['Code'] in ['ResourceConflictException', 'ResourceInUseException']:
if retries == max_retries - 1:
raise
wait_time = backoff * (2 ** retries)
print(f"Encountered {e.response['Error']['Code']}. Retrying in {wait_time} seconds...")
time.sleep(wait_time)
retries += 1
backoff *= backoff_multiplier
else:
raise
raise Exception(f"Function failed after {max_retries} retries.")
return wrapper
return decorator

View File

@@ -0,0 +1,57 @@
import json
import boto3
from boto3.dynamodb.conditions import Key
from botocore.exceptions import ClientError
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def upload_rss_feeds(rss_feeds, table_name):
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table(table_name)
logger.info(f"Uploading RSS feeds to table: {table_name}")
try:
# Get the table's key schema
key_schema = table.key_schema
partition_key = next(key['AttributeName'] for key in key_schema if key['KeyType'] == 'HASH')
except ClientError as e:
logger.error(f"Error getting table schema: {e.response['Error']['Message']}")
return
new_items = 0
existing_items = 0
for feed in rss_feeds:
# Check if the item already exists
try:
response = table.get_item(Key={partition_key: feed['u']})
except ClientError as e:
logger.error(f"Error checking for existing item: {e.response['Error']['Message']}")
continue
if 'Item' not in response:
# Item doesn't exist, insert new item
item = {partition_key: feed['u'], 'dt': 0}
item.update(feed)
try:
table.put_item(Item=item)
new_items += 1
except ClientError as e:
logger.error(f"Error inserting new item: {e.response['Error']['Message']}")
else:
existing_items += 1
logger.info(f"Upload complete. {new_items} new items inserted. {existing_items} items already existed.")
if __name__ == "__main__":
table_name = 'rss-feeds-table'
rss_feed_path = 'rss_feeds.json'
with open(rss_feed_path) as f:
rss_feeds = json.load(f)
logger.info(f"Loaded RSS feeds: {rss_feeds}")
upload_rss_feeds(rss_feeds, table_name)

26
template.env Normal file
View File

@@ -0,0 +1,26 @@
# AWS Configuration
AWS_REGION=us-east-$
AWS_ACCOUNT_ID=$$$$$$$$$
# Access keys (only use these for local development, NEVER in production)
AWS_ACCESS_KEY_ID=$$$$$$$$$
AWS_SECRET_ACCESS_KEY=$$$$$$$$$
# Resource Names (without ARNs or full URLs)
LAMBDA_FUNCTION_NAME=rss-feed-processor
LAMBDA_EXECUTION_ROLE_NAME=rss-feed-processor-role
S3_BUCKET_NAME=rss-feed-processor-bucket
DYNAMODB_TABLE_NAME=rss-feeds-table
SQS_QUEUE_NAME=rss-feed-queue
S3_LAYER_BUCKET_NAME=rss-feed-processor-layers
# RSS Feed Processing Configuration
MAX_ARTICLES_PER_FEED=10
FEED_PROCESSING_TIMEOUT=90
# Logging Configuration
LOG_LEVEL=INFO
# Other Application Settings
APP_NAME=RSS Feed Processor
VERSION=1.0.0

11
todo.md Normal file
View File

@@ -0,0 +1,11 @@
* [ ] Make sure the base lambda works
* [ ] Make sure the lambda syncs up well with SQS and can easily pull items from DynamoDB.
* [ ]
* [ ] Version control lambda packages
* [ ] RSS Feed Easy Insertion
* [ ] environment variable template
* [ ] Should we do some vector database stuff with this repo as well?
* [ ] We should probably make another module which makes it fairly easy to query all this data from anywhere
* [ ] Add in a scheduler for the lambda