# RSS Article Batch Download Examples

This notebook demonstrates how to batch download RSS articles from S3 into CSV files, then aggregate and clean the downloaded data.

In [2]:
import os
import pandas as pd
from time import time


In [3]:
# Root imports

from src.search.batch import S3BatchDownloader
from dotenv import load_dotenv

# Load settings from a local .env file into the process environment.
# override=True asks python-dotenv to let .env values win over variables already set.
load_dotenv(override=True)

# Shared downloader instance used by the download cells below.
# NOTE(review): presumably S3BatchDownloader reads its configuration from the
# environment, which is why it is created after load_dotenv() — confirm in src.search.batch.
downloader = S3BatchDownloader()


## Verify S3 Access

In [4]:
import boto3
from botocore.exceptions import ClientError

# Smoke test: can the current AWS credentials list the configured bucket?
s3 = boto3.client('s3')

bucket_name = os.getenv("S3_BUCKET_NAME")

if not bucket_name:
    # Without a bucket name boto3 would fail parameter validation before any
    # permission is checked, so report the configuration problem instead.
    print("List permission: Not checked (S3_BUCKET_NAME is not set)")
else:
    try:
        # MaxKeys=1 keeps the probe cheap: one key is enough to prove access.
        response = s3.list_objects_v2(Bucket=bucket_name, MaxKeys=1)
        print("List permission: Allowed")
    except ClientError as e:
        # Only report "Denied" for real authorization failures; surface any
        # other service error (e.g. a missing bucket) under its own code.
        error_code = e.response.get("Error", {}).get("Code", "Unknown")
        if error_code in ("AccessDenied", "403"):
            print("List permission: Denied")
        else:
            print(f"List check failed: {error_code}")


List permission: Allowed


In [4]:
# First November batch: export the articles for one date range to a CSV file
# and report how long the export took.
start = time()

# Date range requested from the downloader.
# FIXME: Fix the error where data can't be collected before the date you started collecting.
start_date, end_date = "2024-11-17", "2024-11-22"

# Destination file; switch to a .json name together with file_format="json" for JSON output.
output_path = "November-1.csv"

downloader.download_to_file(
    output_path=output_path, file_format="csv", start_date=start_date, end_date=end_date,
)

print(f"Downloaded data to {output_path} in {time() - start:.2f} seconds")

Found 59592 objects to process


 53%|█████▎    | 31707/59592 [04:33<6:08:33,  1.26object/s]IOStream.flush timed out
100%|██████████| 59592/59592 [08:38<00:00, 114.83object/s]  


Downloaded data to November-1.csv in 605.81 seconds


In [5]:
# Second November batch: export the articles for one date range to a CSV file
# and report how long the export took.
start = time()

# Date range requested from the downloader.
# FIXME: Fix the error where data can't be collected before the date you started collecting.
start_date, end_date = "2024-11-23", "2024-11-27"

# Destination file; switch to a .json name together with file_format="json" for JSON output.
output_path = "November-2.csv"

downloader.download_to_file(
    output_path=output_path, file_format="csv", start_date=start_date, end_date=end_date,
)

print(f"Downloaded data to {output_path} in {time() - start:.2f} seconds")

Found 50037 objects to process


 66%|██████▌   | 33067/50037 [04:07<11:49, 23.92object/s]  IOStream.flush timed out
100%|██████████| 50037/50037 [07:11<00:00, 116.06object/s] 


Downloaded data to November-2.csv in 539.74 seconds


In [4]:
# Third November batch: export the articles for one date range to a CSV file
# and report how long the export took.
start = time()

# Date range requested from the downloader.
# FIXME: Fix the error where data can't be collected before the date you started collecting.
start_date, end_date = "2024-11-28", "2024-11-30"

# Destination file; switch to a .json name together with file_format="json" for JSON output.
output_path = "November-3.csv"

downloader.download_to_file(
    output_path=output_path, file_format="csv", start_date=start_date, end_date=end_date,
)

print(f"Downloaded data to {output_path} in {time() - start:.2f} seconds")

Found 23857 objects to process


100%|██████████| 23857/23857 [02:37<00:00, 151.22object/s]


Downloaded data to November-3.csv in 219.71 seconds


## Aggregating

In [5]:
import pandas as pd
import glob

# Combine the per-range download files into one DataFrame and persist it.
# Batch files that are missing are reported and skipped.
dataframes = []

# Loop through the files November-1.csv to November-3.csv
for i in range(1, 4):
    filename = f"November-{i}.csv"
    try:
        df = pd.read_csv(filename)
    except FileNotFoundError:
        print(f"File {filename} not found, skipping.")
    else:
        # Only reached when the read succeeded.
        dataframes.append(df)

if dataframes:
    aggregated_df = pd.concat(dataframes, ignore_index=True)
    print("Aggregated DataFrame:")
    print(aggregated_df.head())

    # Save inside this branch: `aggregated_df` only exists when at least one
    # batch file was read, so saving unconditionally raised NameError right
    # after reporting that nothing was aggregated.
    aggregated_df.to_csv("Aggregated_November.csv", index=False)
else:
    print("No files were aggregated.")

Aggregated DataFrame:
                                                link  \
0  https://www.npr.org/2022/11/04/1134434712/plan...   
1  https://www.communitysignal.com/embodying-work...   
2  https://identityweek.net/tech5-awarded-a-15-ye...   
3  https://edtechmagazine.com/k12/article/2024/11...   
4  https://www.ufc.com/news/updates-ufc-309-jones...   

                                                 rss  \
0      https://www.npr.org/rss/podcast.php?id=510289   
1              https://www.communitysignal.com/feed/   
2              https://www.planetbiometrics.com/rss/   
3  https://www.edtechmagazine.com/k12/rss.xml?tax...   
4                       https://www.ufc.com/rss/news   

                                               title  \
0  The crisis pollsters face ahead of the midterm...   
1  Embodying Work-Life Balance as a Community Pro...   
2  TECH5 awarded a 15-year contract with Virginia...   
3  How Can You Prepare Your Cloud to Safely Imple...   
4                Updates

In [5]:
# Reload the aggregated articles from disk as the starting point for cleaning.
df = pd.read_csv(filepath_or_buffer="Aggregated_November.csv")

In [6]:
# Remove every column that holds no data at all (all values missing).
df = df.dropna(axis="columns", how="all")

In [8]:
# Share of missing values per column, as a percentage
# (fraction rounded to 4 decimals, then scaled by 100).
null_percent = df.isna().mean().round(4).mul(100)

In [10]:
# Keep only the rows that have a value in the 'content' column.
df = df[df["content"].notna()]

In [None]:
# Recompute the per-column null percentage after dropping rows with missing content.
null_percent = df.isnull().mean().round(4) * 100

# End the cell with the value so the notebook renders it; a bare assignment
# produces no output.
null_percent

link          0.0
rss           0.0
title         0.2
content       0.0
unixTime      0.0
rss_id        0.0
article_id    0.0
unixtime      0.0
dtype: float64

In [14]:
# Keep only the rows that have a value in the 'title' column.
df = df.loc[df["title"].notna()]

In [15]:
# Descriptive statistics across every column; include="all" covers the
# non-numeric columns as well as the numeric ones.
summary = df.describe(include="all")

In [17]:
# Persist the cleaned frame without the index column.
# NOTE(review): this overwrites the same "Aggregated_November.csv" that the
# cleaning steps started from, so the uncleaned aggregate is replaced on disk.
df.to_csv(path_or_buf="Aggregated_November.csv", index=False)

In [18]:
# Display the summary statistics table computed earlier in the notebook.
summary

Unnamed: 0,link,rss,title,content,unixTime,rss_id,article_id,unixtime
count,130388,130388,130388,130388,130388.0,130388,130388,130388.0
unique,37817,688,37383,37658,,688,37817,
top,https://www.npr.org/2022/11/04/1134434712/plan...,https://blogs.unity3d.com/feed/,TechPipeline,"""Unity"", Unity logos, and other Unity trademar...",,b78e974545,05dfb6bb11,
freq,11,3918,330,3912,,3918,11,
mean,,,,,1732375000.0,,,1732375000.0
std,,,,,321448.4,,,321448.4
min,,,,,1731884000.0,,,1731884000.0
25%,,,,,1732085000.0,,,1732085000.0
50%,,,,,1732402000.0,,,1732402000.0
75%,,,,,1732632000.0,,,1732632000.0


In [19]:
# Preview the first five rows of the cleaned data.
df.head(n=5)

Unnamed: 0,link,rss,title,content,unixTime,rss_id,article_id,unixtime
0,https://www.npr.org/2022/11/04/1134434712/plan...,https://www.npr.org/rss/podcast.php?id=510289,The crisis pollsters face ahead of the midterm...,Planet Money tries election polling\n\nEnlarge...,1731884660,8181d7a585,05dfb6bb11,1731884660
1,https://www.communitysignal.com/embodying-work...,https://www.communitysignal.com/feed/,Embodying Work-Life Balance as a Community Pro...,Are you able to step away from your community ...,1731883672,b6cef58d91,0418489c55,1731883672
2,https://identityweek.net/tech5-awarded-a-15-ye...,https://www.planetbiometrics.com/rss/,TECH5 awarded a 15-year contract with Virginia...,TECH5 has announced a major contract win with ...,1731883632,084b136c50,0bb2ec0554,1731883632
3,https://edtechmagazine.com/k12/article/2024/11...,https://www.edtechmagazine.com/k12/rss.xml?tax...,How Can You Prepare Your Cloud to Safely Imple...,Many K–12 schools that operate in the cloud ha...,1731883653,7827152faf,07d39400e9,1731883653
4,https://www.ufc.com/news/updates-ufc-309-jones...,https://www.ufc.com/rss/news,Updates To UFC 309: Jones vs Miocic,UFC 309: JONES vs MIOCIC Updates:\n\nDue to me...,1731883662,2c774cd014,03f31af6e2,1731883662
