# RSS Article Batch Download Examples

This notebook demonstrates how to batch-download RSS articles from S3 into a single CSV or JSON file using `S3BatchDownloader`.

In [6]:
import os
import sys
import pandas as pd
from time import time


In [2]:
# Root imports
# The project package is imported from the parent directory (presumably the
# repository root), so step up one level for the setup and always step back.
notebook_dir = os.getcwd()
os.chdir('..')
try:
    from src.search.batch import S3BatchDownloader
    from dotenv import load_dotenv

    # override=True lets values from the .env file replace variables that are
    # already set in the process environment.
    load_dotenv(override=True)

    downloader = S3BatchDownloader()
finally:
    # Restore the starting directory even if an import or the constructor
    # fails; otherwise a re-run of this cell would climb one level further.
    os.chdir(notebook_dir)


## Verify S3 Access and Download Articles

In [3]:
import boto3
from botocore.exceptions import ClientError

s3 = boto3.client('s3')

# Test ListObjects
try:
    # MaxKeys=1: a single key is enough to prove the call is permitted.
    response = s3.list_objects_v2(Bucket='open-rss-articles-us-east-1', MaxKeys=1)
    print("List permission: Allowed")
except ClientError as e:
    # Not every ClientError is a permission problem (missing bucket, expired
    # token, ...), so report the service error code instead of assuming one.
    error_code = e.response.get("Error", {}).get("Code", "Unknown")
    if error_code == "AccessDenied":
        print("List permission: Denied")
    else:
        print(f"List check failed: {error_code}")


List permission: Allowed


In [None]:
output_path = "consolidated_data.csv"  # or "consolidated_data.json"

# Derive the format from the file extension so the path and the format
# cannot drift apart when switching between CSV and JSON output.
file_format = os.path.splitext(output_path)[1].lstrip(".").lower()

# Define date range
start_date = "2024-11-17"
end_date = "2024-11-20"

# Start downloading
start = time()
downloader.download_to_file(
    output_path=output_path,
    file_format=file_format,
    start_date=start_date,
    end_date=end_date
)

print(f"Downloaded data to {output_path} in {time() - start:.2f} seconds")

Downloaded data to consolidated_data.csv in 59.01 seconds


In [8]:
# Read the file at `output_path` into a DataFrame for inspection.
df = pd.read_csv(filepath_or_buffer=output_path)

In [13]:
# Preview the first five rows (the cell's last expression is displayed).
df.head(n=5)

Unnamed: 0,link,rss,title,content,unixTime,rss_id,article_id,llm_summary,embedding,unixtime
0,https://www.cnet.com/home/internet/comporium-i...,https://www.cnet.com/rss/news/,"Comporium Home Internet: Pricing, Speeds and A...",Unavailable in Provider unavailable in 90001 E...,1731883654,f5e6f52c79,00089f7505,,,1731883654
1,https://kotaku.com/lady-gaga-wednesday-season-...,https://kotaku.com/rss,Lady Gaga Appearing In Wednesday Season 2 Is A...,The next stop on Lady Gaga’s years-long tour t...,1731883665,a0840ab3b4,009f880a86,,,1731883665
2,https://expeditionportal.com/classifieds-honda...,https://www.expeditionportal.com/feed/,2008 Honda Element EX ECamper :: Classifieds,Overlanding is about experiences first and for...,1731883665,ecdc66bb02,00ac77e95f,,,1731883665
3,http://www.influencerupdate.biz/news/68624/twi...,https://www.influencerupdate.biz/rss/news/,Twitch is testing mid-roll ads on creator broa...,Streaming giant Twitch is planning to test mid...,1731883669,e34caba76d,0041bc4abf,,,1731883669
4,https://thejournal.com/Articles/2024/10/29/Rep...,https://thejournal.com/rss-feeds/news.aspx,Report: 90% of Schools Depend on E-rate Fundin...,Report: 90% of Schools Depend on E-rate Fundin...,1731883616,efd9bb9654,000a3da3b6,,,1731883616


## 1. Check Storage Statistics