# RSS Article Query Examples

This notebook demonstrates how to query RSS articles using the ArticleQuerier class.

In [None]:
# Standard library
import logging
import os
from datetime import datetime, timedelta

# Third-party
import pandas as pd

# Project-local
from src.search import ArticleQuerier

# Emit INFO-level (and above) log records from the root logger
logging.basicConfig(level=logging.INFO)

## Initialize the Querier

You can either set environment variables or use default values:

In [None]:
# Optional fallback configuration for this demo.
# setdefault() only fills in variables that are NOT already set, so values
# exported in the real environment (in particular a real bucket name) are
# not overwritten by the placeholders below.
# NOTE(review): presumably ArticleQuerier reads these variables at
# construction time -- confirm against src.search.
DEFAULT_ENV = {
    'AWS_REGION': 'eu-west-3',
    'RSS_DATABASE_NAME': 'rss_articles',
    'RSS_TABLE_NAME': 'articles',
    'RSS_BUCKET_NAME': 'your-bucket',  # placeholder: replace with your own bucket
}
for env_name, env_value in DEFAULT_ENV.items():
    os.environ.setdefault(env_name, env_value)

# Initialize querier
querier = ArticleQuerier()

## 1. Basic Source Analysis

In [None]:
# Fetch the per-source summary from the querier
sources = querier.get_sources()

# Print a caption, then let the notebook render the first ten rows
print("Top Sources by Article Count:")
sources.head(n=10)

## 2. Search Examples

In [None]:
# Title search for 'python', called with limit=5
python_articles = querier.search(limit=5, title="python")

print("\nArticles about Python:")
# Show only the columns that matter for a quick overview
summary_columns = ['title', 'source', 'published_date']
python_articles[summary_columns]

In [None]:
# Combine several filters in one search: content keyword, source, and a
# lower date bound 30 days before now (formatted as YYYY-MM-DD).
thirty_days_ago = datetime.now() - timedelta(days=30)

filtered_articles = querier.search(
    content="aws",
    source="techcrunch",
    date_from=thirty_days_ago.strftime('%Y-%m-%d'),
    limit=5,
)

print("\nRecent AWS articles from TechCrunch:")
filtered_articles[['title', 'published_date', 'url']]

## 3. Custom SQL Queries

In [None]:
# Monthly article counts per source over the last six months,
# newest month first and, within a month, largest count first.
monthly_sql = """
 SELECT 
 source,
 DATE_TRUNC('month', published_date) as month,
 COUNT(*) as article_count
 FROM articles
 WHERE published_date >= CURRENT_DATE - INTERVAL '6' MONTH
 GROUP BY 1, 2
 ORDER BY 2 DESC, 3 DESC
"""
monthly_stats = querier.query(monthly_sql)

print("\nMonthly Article Counts:")
monthly_stats.head(n=10)

In [None]:
# Topic analysis: tag each title from the last 30 days with a topic keyword
# and count articles per topic.
#
# - The CASE expression lives in a subquery so the outer query can filter on
#   `topic`. Filtering on a SELECT alias in HAVING is not standard SQL and is
#   rejected by engines such as Presto/Trino and PostgreSQL.
# - 'js' and 'ai' are matched as whole tokens (not preceded or followed by a
#   letter or digit). A plain LIKE '%ai%' also matches "said", "email",
#   "available", ... and badly inflates the AI count.
# - CASE returns the first matching branch, so a title that mentions several
#   topics is counted only under the earliest one listed.
# NOTE(review): regexp_like is Presto/Trino (Athena) syntax -- confirm that
# this matches the engine behind querier.query().
topic_analysis = querier.query("""
    SELECT
        topic,
        COUNT(*) AS article_count
    FROM (
        SELECT
            CASE
                WHEN LOWER(title) LIKE '%python%' THEN 'Python'
                WHEN LOWER(title) LIKE '%javascript%'
                     OR regexp_like(LOWER(title), '(^|[^a-z0-9])js([^a-z0-9]|$)') THEN 'JavaScript'
                WHEN LOWER(title) LIKE '%aws%' THEN 'AWS'
                WHEN regexp_like(LOWER(title), '(^|[^a-z0-9])ai([^a-z0-9]|$)')
                     OR LOWER(title) LIKE '%artificial intelligence%' THEN 'AI'
            END AS topic
        FROM articles
        WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY
    ) AS tagged
    WHERE topic IS NOT NULL
    GROUP BY 1
    ORDER BY 2 DESC
""")

print("\nTopic Analysis (Last 30 Days):")
topic_analysis

## 4. Visualization Example

In [None]:
import matplotlib.pyplot as plt

# Article count per calendar day for the last 30 days, oldest day first.
# Days without any article produce no row in the result.
daily_sql = """
 SELECT 
 DATE_TRUNC('day', published_date) as date,
 COUNT(*) as article_count
 FROM articles
 WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY
 GROUP BY 1
 ORDER BY 1
"""
daily_counts = querier.query(daily_sql)

# Draw the series on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(daily_counts['date'], daily_counts['article_count'])
ax.set_title('Daily Article Counts (Last 30 Days)')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Articles')
ax.grid(True)
ax.tick_params(axis='x', labelrotation=45)  # slant date labels so they do not overlap
fig.tight_layout()
plt.show()