# RSS Article Batch Download Examples

This notebook demonstrates how to batch download RSS articles from S3.

In [1]:
import os
import pandas as pd
from time import time


In [2]:
# Root imports
# The project package (`src`) is imported while the working directory is the
# parent folder, so switch there temporarily and remember where we started.
notebook_dir = os.getcwd()
os.chdir('..')
try:
    print(os.getcwd())

    from src.search.batch import S3BatchDownloader
    from dotenv import load_dotenv

    # override=True lets values from the .env file replace variables that are
    # already set in the process environment.
    load_dotenv(override=True)

    downloader = S3BatchDownloader()
finally:
    # Always return to the starting directory, even when the import or the
    # constructor raises; otherwise a re-run of this cell would climb one
    # more level up the tree.
    os.chdir(notebook_dir)


/home/ceg98/IngestRSS


## Verify S3 List Permission

In [3]:
import boto3
from botocore.exceptions import ClientError

s3 = boto3.client('s3')

# Test ListObjects on the bucket named by S3_BUCKET_NAME.
# MaxKeys=1 keeps the probe cheap: only the permission matters, not the listing.
try:
    s3.list_objects_v2(Bucket=os.getenv("S3_BUCKET_NAME"), MaxKeys=1)
    print("List permission: Allowed")
except ClientError as e:
    # botocore puts the service error code under response["Error"]["Code"].
    error_code = e.response.get("Error", {}).get("Code", "Unknown")
    if error_code in ("AccessDenied", "403"):
        print("List permission: Denied")
    else:
        # Anything else (e.g. a missing bucket or invalid credentials) is not
        # a permission verdict, so report it as such instead of "Denied".
        print(f"List check failed ({error_code}): {e}")


List permission: Allowed


In [4]:
# Export settings: destination file and the date window to fetch.
output_path = "consolidated_data.csv"  # or "consolidated_data.json"
start_date = "2024-11-17"
end_date = "2024-11-20"

# Time the whole export so the cost of re-running this cell is visible.
start = time()
downloader.download_to_file(
    output_path=output_path,
    file_format="csv",  # or "json"
    start_date=start_date,
    end_date=end_date,
)
elapsed = time() - start

print(f"Downloaded data to {output_path} in {elapsed:.2f} seconds")

Error downloading 2024/11/17/d16bca904a.json: Read timeout on endpoint URL: "None"


Downloaded data to consolidated_data.csv in 303.85 seconds


In [5]:
# Load the file written by the download step (output_path) into a DataFrame.
df = pd.read_csv(output_path)

In [6]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,link,rss,title,content,unixTime,rss_id,article_id,llm_summary,embedding,unixtime
0,https://www.getresponse.com/blog/thanksgiving-...,https://www.getresponse.com/blog/feed,How to write a heartfelt Thanksgiving email (w...,It’s nearly time for Thanksgiving – which mean...,1731883653,90dc77288c,00c36e16ef,,,1731883653
1,https://www.npr.org/2023/07/11/1187019058/summ...,https://www.npr.org/rss/podcast.php?id=510289,Starting a business: identify pain points and ...,Summer School 1: Planet Money goes to business...,1731884630,8181d7a585,0174d34236,,,1731884630
2,https://www.vg247.com/fallout-season-two-repor...,https://www.vg247.com/feed/,Fallout season two will reportedly star Macaul...,Warning: Spoilers for the first season of the ...,1731883716,374d98cbb9,006fa48e38,,,1731883716
3,https://insideclimatenews.org/news/15112024/to...,https://insideclimatenews.org/rss/,"As New York Congestion Pricing Turns a Corner,...","On Thursday, New York Gov. Kathy Hochul announ...",1731883625,f1fc2a503e,00c2ede5ae,,,1731883625
4,https://transom.org/2018/enticing-listeners-pr...,https://transom.org/feed/podcast/,Enticing Listeners To Press Play Again,http://www.podtrac.com/pts/redirect.mp3/media....,1731883665,c00d35bf2d,01cce6dac3,,,1731883665


# Cleaning Data - EDA (Delete Later)

In [9]:
# Drop rows whose `content` value is missing.
# NOTE(review): this rebinds `df`, so the unfiltered frame is no longer
# reachable afterwards without re-running the read_csv cell.
df = df.dropna(subset=['content'])

In [7]:
# (rows, columns) of the current frame.
df.shape

(32781, 10)

In [20]:
# Spot-check up to 20 `content` values. The fixed random_state (an arbitrary
# seed) makes the sample reproducible across kernel restarts, and capping n at
# len(df) avoids the ValueError that sample() raises when asked for more rows
# than the frame holds.
df.sample(n=min(20, len(df)), random_state=42)["content"].values

array(["Lisa Francois\n\nSince the fall of Roe, Americans have consistently supported reproductive freedom at the polls. Knowing this, ACLU advocates across the country have worked to ensure the people have a say on reproductive freedom. On Election Day, voters will decide on ten ballot initiatives that will protect and increase abortion access at the state level.\n\nWhile 13 states currently have total abortion bans in effect, Missouri was the first state to enforce its ban, taking action mere minutes after the fall of Roe. Alongside local partners, the ACLU of Missouri has spent years fighting to secure access to reproductive health care. Today, Missourians will decide on Amendment 3, which would enshrine the right to reproductive freedom in their state constitution.\n\nIn Arizona, lawmakers have likewise aggressively restricted abortion access for decades, imposing a 15-week ban on abortion after Roe was overturned and trying to reinstate an 1864 total abortion ban. The ACLU of Ariz