Mirror of https://github.com/aljazceru/IngestRSS.git, synced 2025-12-17 22:14:20 +01:00
Commit: annie

.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
repo_structure.txt
.env
/layer/python*

README.md (128 lines; replaces the one-line stub "# OpenRSS")
@@ -1 +1,127 @@
# OpenRSS Feed Processor

OpenRSS is an AWS-based RSS feed processing system that automatically fetches, processes, and stores articles from the RSS feeds you specify.

## Project Structure

```
OpenRSS/
├── src/
│   ├── infra/
│   │   ├── cloudformation/
│   │   │   ├── s3.yaml
│   │   │   ├── dynamo.yaml
│   │   │   ├── sqs.yaml
│   │   │   └── lambda_role.yaml
│   │   └── deploy_infrastructure.py
│   ├── lambda_function/
│   │   ├── src/
│   │   │   ├── lambda_function.py
│   │   │   ├── feed_processor.py
│   │   │   ├── article_extractor.py
│   │   │   ├── data_storage.py
│   │   │   ├── utils.py
│   │   │   ├── config.py
│   │   │   ├── exceptions.py
│   │   │   └── metrics.py
│   │   ├── tests/
│   │   │   └── test_lambda_function.py
│   │   ├── layers/
│   │   │   └── requirements.txt
│   │   ├── deploy_lambda.py
│   │   └── update_lambda_env_vars.py
│   └── utils/
│       ├── create_lambda_layer.py
│       ├── create_s3_bucket.py
│       ├── retry_logic.py
│       └── upload_rss_feeds.py
├── launch.py
├── rss_feeds.json
├── requirements.txt
├── template.env
├── todo.md
└── README.md
```

## Prerequisites

- Python 3.8+
- AWS CLI configured with appropriate permissions
- An AWS account with the necessary services (S3, DynamoDB, SQS, Lambda) enabled

## Setup

1. Clone the repository:
```
git clone https://github.com/yourusername/OpenRSS.git
cd OpenRSS
```

2. Install the required dependencies:
```
pip install -r requirements.txt
```

3. Create a `.env` file in the root directory with the following content:
```
AWS_ACCESS_KEY_ID=your_access_key_here
AWS_SECRET_ACCESS_KEY=your_secret_key_here
AWS_REGION=us-east-1
```

4. Update the `rss_feeds.json` file with the RSS feeds you want to process, using the entry format shown below.

## Usage

To deploy the infrastructure and start the RSS feed processor:

```
python launch.py
```

This script will:
1. Deploy the necessary AWS infrastructure (S3, DynamoDB, SQS) using CloudFormation.
2. Create and upload the Lambda layer.
3. Deploy the Lambda function.
4. Upload the RSS feeds to DynamoDB.
5. Trigger an initial execution of the Lambda function.

## Infrastructure

The project uses the following AWS services:

- S3: Stores processed articles
- DynamoDB: Stores RSS feed information and processing status
- SQS: Queues RSS feeds for processing
- Lambda: Processes RSS feeds and extracts articles

## Lambda Function

The Lambda function (`src/lambda_function/src/lambda_function.py`) is triggered periodically to process RSS feeds. It:

1. Retrieves an RSS feed record from the SQS queue
2. Fetches and parses the RSS feed
3. Extracts articles using the newspaper3k library
4. Stores processed articles in S3
5. Updates the feed's last-processed timestamp in DynamoDB

## Customization

- To modify the CloudFormation templates, edit the YAML files in `src/infra/cloudformation/`.
- To change the Lambda function's behavior, modify the Python files in `src/lambda_function/src/`.
- To add or remove RSS feeds, update the `rss_feeds.json` file.

## Testing

To run the tests for the Lambda function:

```
python -m pytest src/lambda_function/tests/
```

## Monitoring

The Lambda function logs its activities to CloudWatch Logs. You can monitor the function's performance and any errors through the AWS CloudWatch console.

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

## License

This project is licensed under the MIT License.

launch.py (new file, 66 lines)
@@ -0,0 +1,66 @@
import os
import sys
import json
import boto3
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Set AWS credentials from environment variables
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = os.getenv('AWS_SECRET_ACCESS_KEY')
os.environ['AWS_DEFAULT_REGION'] = os.getenv('AWS_REGION')
TABLE_NAME = os.getenv('DYNAMODB_TABLE_NAME')
ACCOUNT_NUM = os.getenv("AWS_ACCOUNT_ID")
SQS_QUEUE_NAME = os.getenv("SQS_QUEUE_NAME")
REGION = os.getenv("AWS_REGION")
os.environ["SQS_QUEUE_URL"] = f"https://sqs.{REGION}.amazonaws.com/{ACCOUNT_NUM}/{SQS_QUEUE_NAME}"

lambda_client = boto3.client("lambda")
LAMBDA_FUNCTION_NAME = os.getenv("LAMBDA_FUNCTION_NAME")

# Add the src directory to the Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)

from src.infra.deploy_infrastructure import deploy_infrastructure
from src.utils.create_lambda_layer import create_lambda_layer
from src.lambda_function.deploy_lambda import deploy_lambda
from src.lambda_function.update_lambda_env_vars import update_env_vars
from src.utils.upload_rss_feeds import upload_rss_feeds

def main():
    # Deploy infrastructure
    deploy_infrastructure()

    # Create Lambda layer
    create_lambda_layer()
    print("Finished with Lambda Layer")

    # Deploy Lambda function
    deploy_lambda()
    print("Finished Deploying Lambda")

    # Update Lambda environment variables
    update_env_vars(LAMBDA_FUNCTION_NAME)
    print("Finished Environment Variable Updates")

    # Upload RSS feeds
    rss_feeds_file = os.path.join(current_dir, "rss_feeds.json")
    if os.path.exists(rss_feeds_file):
        with open(rss_feeds_file, 'r') as f:
            rss_feeds = json.load(f)
        upload_rss_feeds(rss_feeds, TABLE_NAME)
    else:
        print(f"WARNING: {rss_feeds_file} not found. Skipping RSS feed upload.")

    print("RSS Feed Processor launched successfully!")

if __name__ == "__main__":
    main()

requirements.txt (new file, empty)

rss_feeds.json (new file, 42 lines)
@@ -0,0 +1,42 @@
[
    {
        "u": "http://rss.cnn.com/rss/cnn_topstories.rss",
        "dt": 0
    },
    {
        "u": "https://feeds.a.dj.com/rss/RSSWorldNews.xml",
        "dt": 0
    },
    {
        "u": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "dt": 0
    },
    {
        "u": "https://feeds.npr.org/1001/rss.xml",
        "dt": 0
    },
    {
        "u": "https://www.reddit.com/r/news/.rss",
        "dt": 0
    },
    {
        "u": "https://news.ycombinator.com/rss",
        "dt": 0
    },
    {
        "u": "https://techcrunch.com/feed/",
        "dt": 0
    },
    {
        "u": "https://www.wired.com/feed/rss",
        "dt": 0
    },
    {
        "u": "https://www.sciencedaily.com/rss/all.xml",
        "dt": 0
    },
    {
        "u": "https://www.nasa.gov/rss/dyn/breaking_news.rss",
        "dt": 0
    }
]

BIN src/infra/__pycache__/deploy_infrastructure.cpython-310.pyc (new file; binary not shown)

src/infra/cloudformation/dynamo.yaml (new file, 27 lines)
@@ -0,0 +1,27 @@
AWSTemplateFormatVersion: '2010-09-09'
Description: 'CloudFormation template for RSS Feed Processor DynamoDB Table'

Parameters:
  DynamoDBName:
    Type: String
    Description: "Name of the DynamoDB table for RSS feeds"

Resources:
  RSSFeedsTable:
    Type: AWS::DynamoDB::Table
    Properties:
      TableName: !Ref 'DynamoDBName'
      AttributeDefinitions:
        - AttributeName: url
          AttributeType: S
      KeySchema:
        - AttributeName: url
          KeyType: HASH
      BillingMode: PAY_PER_REQUEST

Outputs:
  TableName:
    Description: 'Name of the DynamoDB table for RSS feeds'
    Value: !Ref RSSFeedsTable
    Export:
      Name: !Sub '${AWS::StackName}-RSSFeedsTableName'

src/infra/cloudformation/lambda_role.yaml (new file, 48 lines)
@@ -0,0 +1,48 @@
AWSTemplateFormatVersion: '2010-09-09'
Description: 'IAM Role for RSS Feed Processor Lambda Function with Broad Permissions'

Parameters:
  LambdaExecutionRoleName:
    Type: String
    Description: "Name of the Lambda Execution Role"

Resources:
  LambdaExecutionRole:
    Type: 'AWS::IAM::Role'
    Properties:
      RoleName: !Ref LambdaExecutionRoleName
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - lambda.amazonaws.com
            Action:
              - 'sts:AssumeRole'
      ManagedPolicyArns:
        - 'arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole'
      Policies:
        - PolicyName: 'RSSFeedProcessorLambdaBroadPolicy'
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action:
                  - 'sqs:*'
                  - 'dynamodb:*'
                  - 's3:*'
                  - 'lambda:*'
                  - 'logs:*'
                  - 'xray:*'
                  - 'cloudwatch:*'
                  - 'events:*'
                  - 'kms:Decrypt'
                Resource: '*'

Outputs:
  LambdaRoleArn:
    Description: 'ARN of the Lambda Execution Role'
    Value: !GetAtt LambdaExecutionRole.Arn
    Export:
      Name: !Sub '${AWS::StackName}-LambdaRoleArn'

src/infra/cloudformation/s3.yaml (new file, 26 lines)
@@ -0,0 +1,26 @@
AWSTemplateFormatVersion: '2010-09-09'
Description: 'CloudFormation template for RSS Feed Processor S3 Bucket'

Parameters:
  BucketName:
    Type: String
    Description: "Name of the S3 bucket for article content"

Resources:
  ArticleContentBucket:
    Type: AWS::S3::Bucket
    Properties:
      BucketName: !Ref BucketName
      VersioningConfiguration:
        Status: Enabled
      BucketEncryption:
        ServerSideEncryptionConfiguration:
          - ServerSideEncryptionByDefault:
              SSEAlgorithm: AES256

Outputs:
  BucketName:
    Description: 'Name of the S3 bucket for article content'
    Value: !Ref ArticleContentBucket
    Export:
      Name: !Sub '${AWS::StackName}-ArticleContentBucketName'

src/infra/cloudformation/sqs.yaml (new file, 35 lines)
@@ -0,0 +1,35 @@
AWSTemplateFormatVersion: '2010-09-09'
Description: 'CloudFormation template for RSS Feed Processor SQS Queue'

Parameters:
  SQSQueueName:
    Type: String
    Description: "Name of the SQS queue for RSS feeds"

Resources:
  RSSFeedQueue:
    Type: AWS::SQS::Queue
    Properties:
      QueueName: !Ref SQSQueueName
      VisibilityTimeout: 300
      RedrivePolicy:
        deadLetterTargetArn: !GetAtt RSSFeedDLQ.Arn
        maxReceiveCount: 3

  RSSFeedDLQ:
    Type: AWS::SQS::Queue
    Properties:
      QueueName: !Sub '${AWS::StackName}-rss-feed-dlq'

Outputs:
  QueueURL:
    Description: 'URL of the SQS queue for RSS feeds'
    Value: !Ref RSSFeedQueue
    Export:
      Name: !Sub '${AWS::StackName}-RSSFeedQueueURL'

  DLQueueURL:
    Description: 'URL of the Dead Letter Queue for RSS feeds'
    Value: !Ref RSSFeedDLQ
    Export:
      Name: !Sub '${AWS::StackName}-RSSFeedDLQueueURL'

src/infra/deploy_infrastructure.py (new file, 95 lines)
@@ -0,0 +1,95 @@
import boto3
import os
from botocore.exceptions import ClientError

def deploy_cloudformation(template_file, stack_suffix, force_recreate=False, parameters=None):
    # Avoid a mutable default argument.
    parameters = parameters or []
    cf_client = boto3.client('cloudformation')
    stack_name = f"rss-feed-processor-{stack_suffix}"

    with open(f'src/infra/cloudformation/{template_file}', 'r') as file:
        template_body = file.read()

    print(f"Template contents:\n{template_body}")

    capabilities = ['CAPABILITY_NAMED_IAM']

    try:
        if force_recreate:
            try:
                print(f"Deleting stack {stack_name} for recreation...")
                cf_client.delete_stack(StackName=stack_name)
                waiter = cf_client.get_waiter('stack_delete_complete')
                waiter.wait(StackName=stack_name)
                print(f"Stack {stack_name} deleted successfully.")
            except ClientError:
                print(f"Stack {stack_name} does not exist or is already deleted.")

        try:
            # Raises if the stack does not exist yet.
            cf_client.describe_stacks(StackName=stack_name)
            print(f"Updating stack {stack_name}...")
            cf_client.update_stack(
                StackName=stack_name,
                TemplateBody=template_body,
                Capabilities=capabilities,
                Parameters=parameters
            )
            waiter = cf_client.get_waiter('stack_update_complete')
            waiter.wait(StackName=stack_name)
            print(f"Stack {stack_name} updated successfully.")
        except ClientError as e:
            if 'does not exist' in str(e):
                print(f"Creating stack {stack_name}...")
                cf_client.create_stack(
                    StackName=stack_name,
                    TemplateBody=template_body,
                    Capabilities=capabilities,
                    Parameters=parameters
                )
                waiter = cf_client.get_waiter('stack_create_complete')
                waiter.wait(StackName=stack_name)
                print(f"Stack {stack_name} created successfully.")
            elif 'No updates are to be performed' in str(e):
                print(f"No updates needed for stack {stack_name}.")
            else:
                raise

    except ClientError as e:
        print(f"Error handling stack {stack_name}: {str(e)}")
        raise

def deploy_infrastructure():
    deploy_cloudformation('s3.yaml', 'S3',
                          parameters=[
                              {
                                  'ParameterKey': 'BucketName',
                                  'ParameterValue': os.environ.get('S3_BUCKET_NAME', 'default-bucket-name')
                              }
                          ])
    deploy_cloudformation('dynamo.yaml', 'DynamoDB',
                          parameters=[
                              {
                                  'ParameterKey': 'DynamoDBName',
                                  'ParameterValue': os.environ.get('DYNAMODB_TABLE_NAME', 'default-table-name')
                              }
                          ])

    deploy_cloudformation('sqs.yaml', 'SQS',
                          parameters=[
                              {
                                  'ParameterKey': 'SQSQueueName',
                                  'ParameterValue': os.environ.get('SQS_QUEUE_NAME', 'default-queue-name')
                              }
                          ])

    # Force recreation of the Lambda role so permission changes always apply.
    deploy_cloudformation('lambda_role.yaml', 'Lambda', force_recreate=True,
                          parameters=[
                              {
                                  'ParameterKey': 'LambdaExecutionRoleName',
                                  'ParameterValue': os.environ.get('LAMBDA_EXECUTION_ROLE_NAME', 'default-role-name')
                              }
                          ])

if __name__ == "__main__":
    deploy_infrastructure()

BIN src/lambda_function/__pycache__/deploy_lambda.cpython-310.pyc (new file; binary not shown)
BIN src/lambda_function/__pycache__/update_env_vars.cpython-310.pyc (new file; binary not shown)

src/lambda_function/deploy_lambda.py (new file, 90 lines)
@@ -0,0 +1,90 @@
import boto3
import os
import zipfile
import io
from botocore.exceptions import ClientError
from src.utils.retry_logic import retry_with_backoff

# Set variables
LAMBDA_NAME = "RSSFeedProcessor"
LAMBDA_HANDLER = "lambda_function.lambda_handler"
ACCOUNT_NUM = os.getenv('AWS_ACCOUNT_ID')
LAMBDA_ROLE_NAME = os.getenv('LAMBDA_EXECUTION_ROLE_NAME')
LAMBDA_ROLE_ARN = f"arn:aws:iam::{ACCOUNT_NUM}:role/{LAMBDA_ROLE_NAME}"
LAMBDA_TIMEOUT = 300
LAMBDA_MEMORY = 256
LAMBDA_RUNTIME = "python3.10"

def zip_directory(path):
    """Zip the contents of `path` into an in-memory deployment package."""
    print(f"Creating deployment package from {path}...")
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'a', zipfile.ZIP_DEFLATED, False) as zip_file:
        for root, _, files in os.walk(path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, path)
                zip_file.write(file_path, arcname)
    return zip_buffer.getvalue()

@retry_with_backoff()
def update_function_code(lambda_client, function_name, zip_file):
    return lambda_client.update_function_code(
        FunctionName=function_name,
        ZipFile=zip_file
    )

@retry_with_backoff()
def update_function_configuration(lambda_client, function_name, handler, role, timeout, memory):
    return lambda_client.update_function_configuration(
        FunctionName=function_name,
        Handler=handler,
        Role=role,
        Timeout=timeout,
        MemorySize=memory
    )

@retry_with_backoff()
def create_function(lambda_client, function_name, runtime, role, handler, zip_file, timeout, memory):
    return lambda_client.create_function(
        FunctionName=function_name,
        Runtime=runtime,
        Role=role,
        Handler=handler,
        Code={'ZipFile': zip_file},
        Timeout=timeout,
        MemorySize=memory
    )

def deploy_lambda():
    lambda_client = boto3.client('lambda')

    print(f"Starting deployment of Lambda function: {LAMBDA_NAME}")
    deployment_package = zip_directory('src/lambda_function/src')

    try:
        # Check if the function exists
        try:
            lambda_client.get_function(FunctionName=LAMBDA_NAME)
            function_exists = True
        except ClientError as e:
            if e.response['Error']['Code'] == 'ResourceNotFoundException':
                function_exists = False
            else:
                raise

        if function_exists:
            print("Updating existing Lambda function...")
            update_function_code(lambda_client, LAMBDA_NAME, deployment_package)
            update_function_configuration(lambda_client, LAMBDA_NAME, LAMBDA_HANDLER, LAMBDA_ROLE_ARN, LAMBDA_TIMEOUT, LAMBDA_MEMORY)
        else:
            print(f"Lambda function '{LAMBDA_NAME}' not found. Creating new function...")
            create_function(lambda_client, LAMBDA_NAME, LAMBDA_RUNTIME, LAMBDA_ROLE_ARN, LAMBDA_HANDLER, deployment_package, LAMBDA_TIMEOUT, LAMBDA_MEMORY)

        print("Lambda deployment completed successfully!")

    except Exception as e:
        print(f"Error during Lambda deployment: {str(e)}")
        raise

if __name__ == "__main__":
    deploy_lambda()

src/lambda_function/layers/requirements.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
requests
newspaper3k
feedparser
python-dateutil
pandas

BIN src/lambda_function/src/__pycache__/utils.cpython-310.pyc (new file; binary not shown)

src/lambda_function/src/article_extractor.py (new file, 28 lines)
@@ -0,0 +1,28 @@
import newspaper
import logging

logger = logging.getLogger()

def extract_article(url):
    """
    Extracts the title and text of an article from the given URL.

    Args:
        url (str): The URL of the article.
    Returns:
        A tuple containing the title and text of the article, respectively.
    """
    logger.debug(f"Starting Newspaper Article Extraction {url}")
    config = newspaper.Config()
    config.request_timeout = 60
    # Pass the config through; otherwise the 60-second timeout is never applied.
    article = newspaper.Article(url, config=config)

    try:
        article.download()
        logger.debug(f"Downloaded Article {url}")
        article.parse()
        logger.debug(f"Parsed Article {url}")
        return article.title, article.text
    except Exception as e:
        logger.error(f"Failed to extract article {url}: {str(e)}")
        return None, None

src/lambda_function/src/config.py (new file, 20 lines)
@@ -0,0 +1,20 @@
import os

# SQS Configuration
SQS_QUEUE_URL = os.environ['SQS_QUEUE_URL']

# S3 Configuration
CONTENT_BUCKET = os.environ['CONTENT_BUCKET']

# DynamoDB Configuration
DYNAMODB_TABLE = os.environ['DYNAMODB_TABLE']

# Logging Configuration
LOG_LEVEL = os.environ.get('LOG_LEVEL', 'INFO')

# RSS Feed Processing Configuration
MAX_ARTICLES_PER_FEED = int(os.environ.get('MAX_ARTICLES_PER_FEED', '10'))
FEED_PROCESSING_TIMEOUT = int(os.environ.get('FEED_PROCESSING_TIMEOUT', '90'))

# Article Extraction Configuration
ARTICLE_EXTRACTION_TIMEOUT = int(os.environ.get('ARTICLE_EXTRACTION_TIMEOUT', '30'))

src/lambda_function/src/data_storage.py (new file, 42 lines)
@@ -0,0 +1,42 @@
import boto3
import json
import os
import logging

logger = logging.getLogger()

s3 = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')

CONTENT_BUCKET = os.environ['CONTENT_BUCKET']
DYNAMODB_TABLE = os.environ['DYNAMODB_TABLE']

def save_article(article):
    try:
        # Save to S3 under articles/<unix timestamp>/<last URL segment>.json
        key = f"articles/{article['unixTime']}/{article['link'].split('/')[-1]}.json"
        s3.put_object(
            Bucket=CONTENT_BUCKET,
            Key=key,
            Body=json.dumps(article)
        )
        logger.info(f"Saved article to S3: {key}")

        # Save to DynamoDB
        table = dynamodb.Table(DYNAMODB_TABLE)
        table.put_item(Item=article)
        logger.info(f"Saved article to DynamoDB: {article['link']}")
    except Exception as e:
        logger.error(f"Failed to save article: {str(e)}")

def update_rss_feed(feed):
    try:
        # Feed records are addressed by their 'u' (feed URL) attribute.
        table = dynamodb.Table(DYNAMODB_TABLE)
        table.update_item(
            Key={'u': feed['u']},
            UpdateExpression='SET dt = :val',
            ExpressionAttributeValues={':val': feed['dt']}
        )
        logger.info(f"Updated RSS feed in DynamoDB: {feed['u']}")
    except Exception as e:
        logger.error(f"Failed to update RSS feed: {str(e)}")

src/lambda_function/src/exceptions.py (new file, 11 lines)
@@ -0,0 +1,11 @@
class RSSProcessingError(Exception):
    """Exception raised for errors in the RSS processing."""
    pass

class ArticleExtractionError(Exception):
    """Exception raised for errors in the article extraction."""
    pass

class DataStorageError(Exception):
    """Exception raised for errors in data storage operations."""
    pass

src/lambda_function/src/feed_processor.py (new file, 76 lines)
@@ -0,0 +1,76 @@
import feedparser
from datetime import datetime, timezone
from dateutil import parser
import queue
import threading
import logging
from article_extractor import extract_article

logger = logging.getLogger()

def process_feed(feed: dict):
    output_queue = queue.Queue()
    stop_thread = threading.Event()
    thread = threading.Thread(target=extract_feed, args=(feed, output_queue, stop_thread))
    thread.daemon = True
    thread.start()

    logger.debug(f"Thread Started: {feed['u']}")
    thread.join(timeout=90)

    if thread.is_alive():
        stop_thread.set()
        logger.debug(f"Killing Thread: {feed['u']}")
        return None
    else:
        try:
            output = output_queue.get_nowait()
            logger.info(f"Thread Succeeded: {feed['u']}")
            return output
        except queue.Empty:
            logger.info(f"Thread Failed: {feed['u']}")
            return None

def extract_feed(rss: dict, output_queue, stop_thread):
    articles = []
    feed_url = rss['u']
    last_date = rss['dt']
    max_date = last_date

    try:
        feed = feedparser.parse(feed_url)
        for entry in feed['entries']:
            if stop_thread.is_set():
                break

            pub_date = parse_pub_date(entry['published'])

            if pub_date > last_date:
                title, text = extract_article(entry.link)
                article = {
                    'link': entry.link,
                    'rss': feed_url,
                    'title': title,
                    'content': text,
                    'unixTime': pub_date
                }
                articles.append(article)
                max_date = max(max_date, pub_date)

        output = {
            'articles': articles,
            'max_date': max_date,
            'feed': rss
        }
        output_queue.put(output)
    except Exception as e:
        logger.error(f"Feed failed due to error: {e}")

def parse_pub_date(date_string):
    try:
        return int(datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %z").timestamp())
    except ValueError:
        try:
            # strptime yields a naive datetime here; pin it to UTC so the
            # resulting timestamp does not depend on the local timezone.
            dt = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
            return int(dt.replace(tzinfo=timezone.utc).timestamp())
        except ValueError:
            return int(parser.parse(date_string).timestamp())

src/lambda_function/src/lambda_function.py (new file, 78 lines)
@@ -0,0 +1,78 @@
import json
import time
from feed_processor import process_feed
from data_storage import save_article, update_rss_feed
from utils import setup_logging
from config import SQS_QUEUE_URL
from exceptions import RSSProcessingError, ArticleExtractionError, DataStorageError
from metrics import record_processed_articles, record_processing_time, record_extraction_errors
import boto3

# Set up logging
logger = setup_logging()

# Initialize AWS clients
sqs = boto3.client('sqs')

def lambda_handler(event, context):
    logger.info("Starting RSS feed processing")
    start_time = time.time()

    try:
        # Receive message from SQS
        response = sqs.receive_message(
            QueueUrl=SQS_QUEUE_URL,
            MaxNumberOfMessages=1,
            WaitTimeSeconds=0
        )

        if 'Messages' not in response:
            logger.info("No messages in queue")
            return {'statusCode': 200, 'body': json.dumps('No RSS feeds to process')}

        message = response['Messages'][0]
        receipt_handle = message['ReceiptHandle']
        feed = json.loads(message['Body'])

        # Process the feed
        result = process_feed(feed)

        if result:
            # Save articles and update feed
            for article in result['articles']:
                try:
                    save_article(article)
                except DataStorageError as e:
                    logger.error(f"Failed to save article: {str(e)}")
                    record_extraction_errors(1)

            update_rss_feed(result['feed'])

            # Delete the message from the queue
            sqs.delete_message(QueueUrl=SQS_QUEUE_URL, ReceiptHandle=receipt_handle)
            logger.info(f"Processed feed: {feed['u']}")

            # Record metrics
            record_processed_articles(len(result['articles']))
        else:
            logger.warning(f"Failed to process feed: {feed['u']}")
            record_extraction_errors(1)

    except RSSProcessingError as e:
        logger.error(f"RSS Processing Error: {str(e)}")
        return {'statusCode': 500, 'body': json.dumps('RSS processing failed')}

    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}")
        return {'statusCode': 500, 'body': json.dumps('An unexpected error occurred')}

    finally:
        end_time = time.time()
        processing_time = end_time - start_time
        record_processing_time(processing_time)
        logger.info(f"Lambda execution time: {processing_time:.2f} seconds")

    return {
        'statusCode': 200,
        'body': json.dumps('RSS feed processed successfully')
    }

src/lambda_function/src/metrics.py (new file, 26 lines)
@@ -0,0 +1,26 @@
import boto3
import time

cloudwatch = boto3.client('cloudwatch')

def put_metric_data(metric_name, value, unit='Count'):
    cloudwatch.put_metric_data(
        Namespace='RSS/FeedProcessor',
        MetricData=[
            {
                'MetricName': metric_name,
                'Value': value,
                'Unit': unit,
                'Timestamp': time.time()
            },
        ]
    )

def record_processed_articles(count):
    put_metric_data('ProcessedArticles', count)

def record_processing_time(duration):
    put_metric_data('ProcessingTime', duration, 'Seconds')

def record_extraction_errors(count):
    put_metric_data('ExtractionErrors', count)

src/lambda_function/src/utils.py (new file, 8 lines)
@@ -0,0 +1,8 @@
import logging
import os

def setup_logging():
    logger = logging.getLogger()
    log_level = os.environ.get('LOG_LEVEL', 'INFO')
    logger.setLevel(logging.getLevelName(log_level))
    return logger

src/lambda_function/tests/test_lambda_function.py (new file, empty)

src/lambda_function/update_lambda_env_vars.py (new file, 22 lines)
@@ -0,0 +1,22 @@
import boto3
import os
from src.utils.retry_logic import retry_with_backoff

# Set variables
LAMBDA_NAME = "RSSFeedProcessor"

@retry_with_backoff()
def update_env_vars(function_name):
    # NOTE: the function_name argument is currently ignored; the hard-coded
    # LAMBDA_NAME (matching deploy_lambda.py) is what actually gets updated.
    lambda_client = boto3.client('lambda')

    env_vars = {
        'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
        'CONTENT_BUCKET': os.environ.get('S3_BUCKET_NAME'),
        'DYNAMODB_TABLE': os.environ.get('DYNAMODB_TABLE_NAME'),
        'LOG_LEVEL': os.environ.get('LOG_LEVEL', 'INFO')
    }

    return lambda_client.update_function_configuration(
        FunctionName=LAMBDA_NAME,
        Environment={'Variables': env_vars}
    )

BIN src/utils/__pycache__/create_lambda_layer.cpython-310.pyc (new file; binary not shown)
BIN src/utils/__pycache__/create_s3_bucket.cpython-310.pyc (new file; binary not shown)
BIN src/utils/__pycache__/retry_logic.cpython-310.pyc (new file; binary not shown)
BIN src/utils/__pycache__/upload_rss_feeds.cpython-310.pyc (new file; binary not shown)

src/utils/create_lambda_layer.py (new file, 87 lines)
@@ -0,0 +1,87 @@
import boto3
import subprocess
import os
import shutil
from botocore.exceptions import ClientError

# Set variables
LAYER_NAME = "RSSFeedProcessorDependencies"
BUCKET_NAME = os.getenv("S3_LAYER_BUCKET_NAME")
REQUIREMENTS_FILE = "src/lambda_function/layers/requirements.txt"
ZIP_FILE = f"{LAYER_NAME}.zip"

def create_s3_bucket_if_not_exists(bucket_name, region=None):
    s3_client = boto3.client('s3', region_name=region)

    try:
        # Check if the bucket exists
        s3_client.head_bucket(Bucket=bucket_name)
        print(f"Bucket '{bucket_name}' already exists.")
    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == '404':
            # Create the bucket
            if region == 'us-east-1' or region is None:
                # us-east-1 does not require LocationConstraint
                s3_client.create_bucket(Bucket=bucket_name)
            else:
                # Other regions require LocationConstraint
                s3_client.create_bucket(
                    Bucket=bucket_name,
                    CreateBucketConfiguration={
                        'LocationConstraint': region
                    }
                )
            print(f"Bucket '{bucket_name}' created.")
        else:
            # For any other errors, re-raise the exception
            raise

def create_lambda_layer():
    # Create a temporary directory for the layer
    os.makedirs("layer/python", exist_ok=True)

    # Install dependencies
    subprocess.check_call([
        "pip", "install",
        "-r", REQUIREMENTS_FILE,
        "-t", "layer/python"
    ])
    print("Finished Installing Packages")

    # Create ZIP file
    shutil.make_archive(LAYER_NAME, 'zip', "layer")
    print("Finished Zipping Package")

    # Create or update Lambda layer
    lambda_client = boto3.client('lambda', region_name='us-east-1')

    # Make sure the S3 bucket exists
    create_s3_bucket_if_not_exists(BUCKET_NAME)

    # Upload the zip file to S3
    s3_client = boto3.client('s3')
    s3_client.upload_file(ZIP_FILE, BUCKET_NAME, ZIP_FILE)
    print(f"Uploaded {ZIP_FILE} to S3 bucket '{BUCKET_NAME}'.")

    # Publish the layer using the S3 object
    response = lambda_client.publish_layer_version(
        LayerName=LAYER_NAME,
        Description="Dependencies for RSS Feed Processor",
        Content={
            'S3Bucket': BUCKET_NAME,
            'S3Key': ZIP_FILE
        },
        CompatibleRuntimes=['python3.10', 'python3.11']
    )

    print(f"Created Lambda layer version: {response['Version']}")

    # Clean up
    shutil.rmtree("layer")
    os.remove(ZIP_FILE)

    print("Lambda layer creation complete!")

if __name__ == "__main__":
    create_lambda_layer()
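
Once published, the layer can be confirmed from the terminal; for example, with the AWS CLI (using the layer name above):

```
aws lambda list-layer-versions --layer-name RSSFeedProcessorDependencies
```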

src/utils/create_s3_bucket.py (new file, 34 lines)
@@ -0,0 +1,34 @@
import boto3
from botocore.exceptions import ClientError

def create_s3_bucket_if_not_exists(bucket_name, region=None):
    s3_client = boto3.client('s3', region_name=region)

    try:
        # Check if the bucket exists
        s3_client.head_bucket(Bucket=bucket_name)
        print(f"Bucket '{bucket_name}' already exists.")
    except ClientError as e:
        # If a 404 error is caught, it means the bucket does not exist
        error_code = e.response['Error']['Code']
        if error_code == '404':
            # Create the bucket. us-east-1 (which the example below uses)
            # must NOT be passed as a LocationConstraint.
            if region is None or region == 'us-east-1':
                s3_client.create_bucket(Bucket=bucket_name)
            else:
                s3_client.create_bucket(
                    Bucket=bucket_name,
                    CreateBucketConfiguration={
                        'LocationConstraint': region
                    }
                )
            print(f"Bucket '{bucket_name}' created.")
        else:
            # For any other errors, re-raise the exception
            raise

if __name__ == "__main__":
    # Example usage
    bucket_name = 'your-unique-bucket-name'
    region = 'us-east-1'  # Change this to your desired region
    create_s3_bucket_if_not_exists(bucket_name, region)

src/utils/retry_logic.py (new file, 27 lines)
@@ -0,0 +1,27 @@
import time
from botocore.exceptions import ClientError

def retry_with_backoff(max_retries=5, initial_backoff=1, backoff_multiplier=2):
    def decorator(func):
        def wrapper(*args, **kwargs):
            retries = 0
            backoff = initial_backoff

            while retries < max_retries:
                try:
                    return func(*args, **kwargs)
                except ClientError as e:
                    if e.response['Error']['Code'] in ['ResourceConflictException', 'ResourceInUseException']:
                        if retries == max_retries - 1:
                            raise
                        # Exponential backoff: initial_backoff * multiplier^retries.
                        wait_time = backoff
                        print(f"Encountered {e.response['Error']['Code']}. Retrying in {wait_time} seconds...")
                        time.sleep(wait_time)
                        retries += 1
                        backoff *= backoff_multiplier
                    else:
                        raise
            raise Exception(f"Function failed after {max_retries} retries.")

        return wrapper
    return decorator

src/utils/upload_rss_feeds.py (new file, 57 lines)
@@ -0,0 +1,57 @@
import json
import boto3
from botocore.exceptions import ClientError
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def upload_rss_feeds(rss_feeds, table_name):
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table(table_name)

    logger.info(f"Uploading RSS feeds to table: {table_name}")

    try:
        # Get the table's key schema
        key_schema = table.key_schema
        partition_key = next(key['AttributeName'] for key in key_schema if key['KeyType'] == 'HASH')
    except ClientError as e:
        logger.error(f"Error getting table schema: {e.response['Error']['Message']}")
        return

    new_items = 0
    existing_items = 0

    for feed in rss_feeds:
        # Check if the item already exists
        try:
            response = table.get_item(Key={partition_key: feed['u']})
        except ClientError as e:
            logger.error(f"Error checking for existing item: {e.response['Error']['Message']}")
            continue

        if 'Item' not in response:
            # Item doesn't exist, insert new item
            item = {partition_key: feed['u'], 'dt': 0}
            item.update(feed)

            try:
                table.put_item(Item=item)
                new_items += 1
            except ClientError as e:
                logger.error(f"Error inserting new item: {e.response['Error']['Message']}")
        else:
            existing_items += 1

    logger.info(f"Upload complete. {new_items} new items inserted. {existing_items} items already existed.")

if __name__ == "__main__":
    table_name = 'rss-feeds-table'
    rss_feed_path = 'rss_feeds.json'
    with open(rss_feed_path) as f:
        rss_feeds = json.load(f)
    logger.info(f"Loaded RSS feeds: {rss_feeds}")
    upload_rss_feeds(rss_feeds, table_name)
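
To spot-check the upload afterwards, something like this works with the AWS CLI (table name as in the script's `__main__` block):

```
aws dynamodb scan --table-name rss-feeds-table --max-items 5
```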

template.env (new file, 26 lines)
@@ -0,0 +1,26 @@
# AWS Configuration
AWS_REGION=us-east-$
AWS_ACCOUNT_ID=$$$$$$$$$

# Access keys (only use these for local development, NEVER in production)
AWS_ACCESS_KEY_ID=$$$$$$$$$
AWS_SECRET_ACCESS_KEY=$$$$$$$$$

# Resource Names (without ARNs or full URLs)
LAMBDA_FUNCTION_NAME=rss-feed-processor
LAMBDA_EXECUTION_ROLE_NAME=rss-feed-processor-role
S3_BUCKET_NAME=rss-feed-processor-bucket
DYNAMODB_TABLE_NAME=rss-feeds-table
SQS_QUEUE_NAME=rss-feed-queue
S3_LAYER_BUCKET_NAME=rss-feed-processor-layers

# RSS Feed Processing Configuration
MAX_ARTICLES_PER_FEED=10
FEED_PROCESSING_TIMEOUT=90

# Logging Configuration
LOG_LEVEL=INFO

# Other Application Settings
APP_NAME=RSS Feed Processor
VERSION=1.0.0

todo.md (new file, 11 lines)
@@ -0,0 +1,11 @@
* [ ] Make sure the base Lambda works
* [ ] Make sure the Lambda syncs up well with SQS and can easily pull items from DynamoDB.
* [ ]

* [ ] Version control Lambda packages
* [ ] RSS feed easy insertion
* [ ] Environment variable template
* [ ] Should we do some vector database stuff with this repo as well?
* [ ] We should probably make another module which makes it fairly easy to query all this data from anywhere
* [ ] Add in a scheduler for the Lambda