# RSS Article Batch Download Examples

This notebook demonstrates how to batch-download RSS articles stored in S3 into CSV files, then load, analyze, filter, and clean up the results with pandas.

In [None]:
import os
import pandas as pd
from datetime import datetime, timedelta
from src.search import S3BatchDownloader

# Configure logging: install the default root-logger configuration with an
# INFO threshold, so records at INFO level and above are shown while the
# notebook runs.
# NOTE(review): presumably this is here to surface progress messages from
# S3BatchDownloader — confirm that it reports through the `logging` module.
import logging
logging.basicConfig(level=logging.INFO)

## Initialize the Downloader

In [None]:
# Optional: publish the connection settings as process environment variables.
# Replace the placeholder bucket name and prefix with real values before running.
os.environ.update({
    'AWS_REGION': 'eu-west-3',
    'RSS_BUCKET_NAME': 'your-bucket',
    'RSS_PREFIX': 'articles/',
})

# Build the downloader without explicit arguments.
# NOTE(review): presumably it reads the variables set above from the
# environment — confirm against src.search.S3BatchDownloader.
downloader = S3BatchDownloader()

## 1. Check Storage Statistics

In [None]:
# Ask the downloader for its storage summary and print one line per metric.
stats = downloader.get_storage_stats()

# (label, key in `stats`, format spec, unit suffix) for each reported figure.
report_rows = (
    ("Total objects", 'total_objects', ",", ""),
    ("Total size", 'total_size_mb', ".2f", " MB"),
    ("Average object size", 'average_size_kb', ".2f", " KB"),
)
for label, key, spec, unit in report_rows:
    print(f"{label}: {stats[key]:{spec}}{unit}")

## 2. Download Examples

In [None]:
# Download the articles from the last 7 days.
# The cutoff is taken from the local clock (datetime.now() is naive) and
# passed to the downloader as a 'YYYY-MM-DD' string.
lookback = timedelta(days=7)
start_date = (datetime.now() - lookback).strftime('%Y-%m-%d')

recent_articles_path = downloader.download_to_csv(output_path='recent_articles.csv', start_date=start_date)

# Read the CSV back from the path the downloader returned and preview it.
recent_df = pd.read_csv(recent_articles_path)
recent_count = len(recent_df)
print(f"\nDownloaded {recent_count} recent articles")
recent_df.head()

In [None]:
# Download only the objects stored under a specific key prefix.
tech_request = {
    'output_path': 'tech_articles.csv',
    'prefix': 'articles/tech/',
}
tech_articles_path = downloader.download_to_csv(**tech_request)

# Read the CSV back from the path the downloader returned and preview it.
tech_df = pd.read_csv(tech_articles_path)
tech_count = len(tech_df)
print(f"\nDownloaded {tech_count} tech articles")
tech_df.head()

## 3. Analysis Examples

In [None]:
# Count articles per value of the 'source' column.
# The whole step is skipped when the CSV has no such column.
has_source_column = 'source' in tech_df.columns
if has_source_column:
    source_counts = tech_df['source'].value_counts()

    # Heading and table are emitted together, one per line.
    print("\nArticles per Source:", source_counts, sep="\n")

In [None]:
# Time-based analysis: count articles per calendar day and draw a bar chart.
# Skipped entirely when the CSV has no 'published_date' column.
if 'published_date' in tech_df.columns:
    import matplotlib.pyplot as plt

    # Parse into a separate Series instead of overwriting the column, so
    # tech_df is left untouched and the cell can be re-run safely.
    # errors='coerce' turns unparseable values into NaT (dropped below) rather
    # than aborting the cell; utc=True converts every timestamp to UTC so feeds
    # with mixed offsets still yield a single datetime dtype that can be
    # resampled. Note that days are therefore bucketed in UTC.
    published = pd.to_datetime(
        tech_df['published_date'], errors='coerce', utc=True
    ).dropna()

    if published.empty:
        # Nothing to aggregate; plotting an empty Series would raise.
        daily_counts = pd.Series(dtype='int64')
        print("No parseable 'published_date' values; skipping plot")
    else:
        # One row per calendar day between the first and last article;
        # days without articles appear with a count of 0.
        daily_counts = published.to_frame().resample('D', on='published_date').size()

        # A bar plot treats the index as categories and would print full
        # timestamps on the axis, so plot a copy labelled with plain dates.
        labelled_counts = daily_counts.copy()
        labelled_counts.index = daily_counts.index.strftime('%Y-%m-%d')

        fig, ax = plt.subplots(figsize=(15, 6))
        labelled_counts.plot(kind='bar', ax=ax, rot=45)
        ax.set_title('Articles per Day')
        ax.set_xlabel('Date')
        ax.set_ylabel('Number of Articles')
        fig.tight_layout()
        plt.show()

## 4. Export Filtered Data

In [None]:
# Filter the articles whose title or content mentions AI and export them.
# Skipped when either the 'title' or the 'content' column is missing.
if 'title' in tech_df.columns and 'content' in tech_df.columns:
    # The match is case-insensitive, so a bare "AI" would also hit the letters
    # "ai" inside ordinary words ("said", "email", "training") and select
    # almost every article. \b anchors both alternatives to word boundaries;
    # the group is non-capturing so pandas does not warn about match groups.
    ai_pattern = r'\b(?:AI|artificial intelligence)\b'

    # Normalise both columns to plain strings first: a column that is entirely
    # empty is read back from CSV as float NaN, on which `.str` would raise.
    title_text = tech_df['title'].fillna('').astype(str)
    content_text = tech_df['content'].fillna('').astype(str)

    ai_mask = (
        title_text.str.contains(ai_pattern, case=False, regex=True)
        | content_text.str.contains(ai_pattern, case=False, regex=True)
    )
    ai_articles = tech_df[ai_mask]

    # Export filtered data (original column values, without the row index).
    ai_articles.to_csv('ai_articles.csv', index=False)
    print(f"\nExported {len(ai_articles)} AI-related articles to ai_articles.csv")

## 5. Cleanup

In [None]:
# Optional housekeeping: delete the CSV files named in the earlier cells.
# `os` is already imported in the first code cell; repeating it is harmless.
import os

generated_csvs = ('recent_articles.csv', 'tech_articles.csv', 'ai_articles.csv')
for csv_name in generated_csvs:
    # Files that were never created (or were already removed) are skipped.
    if not os.path.exists(csv_name):
        continue
    os.remove(csv_name)
    print(f"Removed {csv_name}")