{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# RSS Article Batch Download Examples\n", "\n", "This notebook demonstrates how to batch download RSS articles from S3." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "from time import time\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Root imports\n", "\n", "from src.search.batch import S3BatchDownloader\n", "from dotenv import load_dotenv\n", "\n", "load_dotenv(override=True)\n", "\n", "downloader = S3BatchDownloader()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Initialize the Downloader" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "List permission: Allowed\n" ] } ], "source": [
"import boto3\n",
"from botocore.exceptions import ClientError\n",
"\n",
"s3 = boto3.client('s3')\n",
"\n",
"# Test ListObjects. MaxKeys=1 keeps the probe cheap: a single key is enough to prove access.\n",
"bucket_name = os.getenv(\"S3_BUCKET_NAME\")\n",
"if not bucket_name:\n",
"    # Without a bucket name the request cannot even be built, so say so explicitly.\n",
"    print(\"List permission: Unknown (S3_BUCKET_NAME is not set)\")\n",
"else:\n",
"    try:\n",
"        s3.list_objects_v2(Bucket=bucket_name, MaxKeys=1)\n",
"        print(\"List permission: Allowed\")\n",
"    except ClientError as e:\n",
"        # Only an access-denied code means the permission is missing;\n",
"        # other failures (e.g. NoSuchBucket) are reported under their own code.\n",
"        error_code = e.response.get(\"Error\", {}).get(\"Code\", \"Unknown\")\n",
"        if error_code == \"AccessDenied\":\n",
"            print(\"List permission: Denied\")\n",
"        else:\n",
"            print(f\"List permission: Unknown ({error_code})\")\n"
] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 59592 objects to process\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 53%|█████▎ | 31707/59592 [04:33<6:08:33, 1.26object/s]IOStream.flush timed out\n", "100%|██████████| 59592/59592 [08:38<00:00, 114.83object/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Downloaded data to November-1.csv in 605.81 seconds\n" ] } ], "source": [ "start = time()\n", "output_path = \"November-1.csv\" # or \"consolidated_data.json\"\n", "\n", "# Define date range\n", "start_date = \"2024-11-17\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n", 
"end_date = \"2024-11-22\"\n", "\n", "# Start downloading\n", "downloader.download_to_file(\n", " output_path=output_path,\n", " file_format=\"csv\", # or \"json\"\n", " start_date=start_date,\n", " end_date=end_date\n", ")\n", "\n", "print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 50037 objects to process\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 66%|██████▌ | 33067/50037 [04:07<11:49, 23.92object/s] IOStream.flush timed out\n", "100%|██████████| 50037/50037 [07:11<00:00, 116.06object/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Downloaded data to November-2.csv in 539.74 seconds\n" ] } ], "source": [ "start = time()\n", "output_path = \"November-2.csv\" # or \"consolidated_data.json\"\n", "\n", "# Define date range\n", "start_date = \"2024-11-23\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n", "end_date = \"2024-11-27\"\n", "\n", "# Start downloading\n", "downloader.download_to_file(\n", " output_path=output_path,\n", " file_format=\"csv\", # or \"json\"\n", " start_date=start_date,\n", " end_date=end_date\n", ")\n", "\n", "print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 23857 objects to process\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 23857/23857 [02:37<00:00, 151.22object/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Downloaded data to November-3.csv in 219.71 seconds\n" ] } ], "source": [ "start = time()\n", "output_path = \"November-3.csv\" # or \"consolidated_data.json\"\n", "\n", "\n", "\n", "# Define date range\n", "start_date = \"2024-11-28\" # FIXME: Fix the error 
where data can't be collected before the date you started collecting.\n", "end_date = \"2024-11-30\"\n", "\n", "# Start downloading\n", "downloader.download_to_file(\n", " output_path=output_path,\n", " file_format=\"csv\", # or \"json\"\n", " start_date=start_date,\n", " end_date=end_date\n", ")\n", "\n", "print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Aggregating" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Aggregated DataFrame:\n", " link \\\n", "0 https://www.npr.org/2022/11/04/1134434712/plan... \n", "1 https://www.communitysignal.com/embodying-work... \n", "2 https://identityweek.net/tech5-awarded-a-15-ye... \n", "3 https://edtechmagazine.com/k12/article/2024/11... \n", "4 https://www.ufc.com/news/updates-ufc-309-jones... \n", "\n", " rss \\\n", "0 https://www.npr.org/rss/podcast.php?id=510289 \n", "1 https://www.communitysignal.com/feed/ \n", "2 https://www.planetbiometrics.com/rss/ \n", "3 https://www.edtechmagazine.com/k12/rss.xml?tax... \n", "4 https://www.ufc.com/rss/news \n", "\n", " title \\\n", "0 The crisis pollsters face ahead of the midterm... \n", "1 Embodying Work-Life Balance as a Community Pro... \n", "2 TECH5 awarded a 15-year contract with Virginia... \n", "3 How Can You Prepare Your Cloud to Safely Imple... \n", "4 Updates To UFC 309: Jones vs Miocic \n", "\n", " content unixTime rss_id \\\n", "0 Planet Money tries election polling\\n\\nEnlarge... 1731884660 8181d7a585 \n", "1 Are you able to step away from your community ... 1731883672 b6cef58d91 \n", "2 TECH5 has announced a major contract win with ... 1731883632 084b136c50 \n", "3 Many K–12 schools that operate in the cloud ha... 1731883653 7827152faf \n", "4 UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me... 
1731883662 2c774cd014 \n", "\n", " article_id llm_summary embedding unixtime \n", "0 05dfb6bb11 NaN NaN 1731884660 \n", "1 0418489c55 NaN NaN 1731883672 \n", "2 0bb2ec0554 NaN NaN 1731883632 \n", "3 07d39400e9 NaN NaN 1731883653 \n", "4 03f31af6e2 NaN NaN 1731883662 \n" ] } ], "source": [
"import pandas as pd\n",
"import glob\n",
"\n",
"# List to hold dataframes\n",
"dataframes = []\n",
"\n",
"# Loop through the files November-1.csv to November-3.csv\n",
"for i in range(1, 4):\n",
"    filename = f\"November-{i}.csv\"\n",
"    try:\n",
"        # Read each CSV and append to the list\n",
"        df = pd.read_csv(filename)\n",
"        dataframes.append(df)\n",
"    except FileNotFoundError:\n",
"        print(f\"File {filename} not found, skipping.\")\n",
"\n",
"# Aggregate the dataframes\n",
"if dataframes:\n",
"    aggregated_df = pd.concat(dataframes, ignore_index=True)\n",
"    print(\"Aggregated DataFrame:\")\n",
"    print(aggregated_df.head())\n",
"    # Save inside this branch: aggregated_df is only assigned when at least one file was read,\n",
"    # so saving unconditionally raises NameError (or re-saves a stale frame) when nothing was found.\n",
"    aggregated_df.to_csv(\"Aggregated_November.csv\", index=False)\n",
"else:\n",
"    print(\"No files were aggregated.\")"
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"Aggregated_November.csv\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Drop columns which are all null values. 
\n", "df = df.dropna(axis=1, how='all')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Get null value percentage for each column\n", "null_percent = df.isnull().mean().round(4) * 100" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Drop rows where the 'content' column is null\n", "df = df.dropna(subset=['content'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "link 0.0\n", "rss 0.0\n", "title 0.2\n", "content 0.0\n", "unixTime 0.0\n", "rss_id 0.0\n", "article_id 0.0\n", "unixtime 0.0\n", "dtype: float64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Get null value percentage for each column\n",
"null_percent = df.isnull().mean().round(4) * 100\n",
"# Display the result: the remaining nulls in 'title' are what the next cell drops\n",
"null_percent"
] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "df = df.dropna(subset=['title'])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Get summary statistics for the dataframe\n", "summary = df.describe(include='all')" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# save df \n", "df.to_csv(\"Aggregated_November.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
linkrsstitlecontentunixTimerss_idarticle_idunixtime
count1303881303881303881303881.303880e+051303881303881.303880e+05
unique378176883738337658NaN68837817NaN
tophttps://www.npr.org/2022/11/04/1134434712/plan...https://blogs.unity3d.com/feed/TechPipeline\"Unity\", Unity logos, and other Unity trademar...NaNb78e97454505dfb6bb11NaN
freq1139183303912NaN391811NaN
meanNaNNaNNaNNaN1.732375e+09NaNNaN1.732375e+09
stdNaNNaNNaNNaN3.214484e+05NaNNaN3.214484e+05
minNaNNaNNaNNaN1.731884e+09NaNNaN1.731884e+09
25%NaNNaNNaNNaN1.732085e+09NaNNaN1.732085e+09
50%NaNNaNNaNNaN1.732402e+09NaNNaN1.732402e+09
75%NaNNaNNaNNaN1.732632e+09NaNNaN1.732632e+09
maxNaNNaNNaNNaN1.732922e+09NaNNaN1.732922e+09
\n", "
" ], "text/plain": [ " link \\\n", "count 130388 \n", "unique 37817 \n", "top https://www.npr.org/2022/11/04/1134434712/plan... \n", "freq 11 \n", "mean NaN \n", "std NaN \n", "min NaN \n", "25% NaN \n", "50% NaN \n", "75% NaN \n", "max NaN \n", "\n", " rss title \\\n", "count 130388 130388 \n", "unique 688 37383 \n", "top https://blogs.unity3d.com/feed/ TechPipeline \n", "freq 3918 330 \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", "25% NaN NaN \n", "50% NaN NaN \n", "75% NaN NaN \n", "max NaN NaN \n", "\n", " content unixTime \\\n", "count 130388 1.303880e+05 \n", "unique 37658 NaN \n", "top \"Unity\", Unity logos, and other Unity trademar... NaN \n", "freq 3912 NaN \n", "mean NaN 1.732375e+09 \n", "std NaN 3.214484e+05 \n", "min NaN 1.731884e+09 \n", "25% NaN 1.732085e+09 \n", "50% NaN 1.732402e+09 \n", "75% NaN 1.732632e+09 \n", "max NaN 1.732922e+09 \n", "\n", " rss_id article_id unixtime \n", "count 130388 130388 1.303880e+05 \n", "unique 688 37817 NaN \n", "top b78e974545 05dfb6bb11 NaN \n", "freq 3918 11 NaN \n", "mean NaN NaN 1.732375e+09 \n", "std NaN NaN 3.214484e+05 \n", "min NaN NaN 1.731884e+09 \n", "25% NaN NaN 1.732085e+09 \n", "50% NaN NaN 1.732402e+09 \n", "75% NaN NaN 1.732632e+09 \n", "max NaN NaN 1.732922e+09 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
linkrsstitlecontentunixTimerss_idarticle_idunixtime
0https://www.npr.org/2022/11/04/1134434712/plan...https://www.npr.org/rss/podcast.php?id=510289The crisis pollsters face ahead of the midterm...Planet Money tries election polling\\n\\nEnlarge...17318846608181d7a58505dfb6bb111731884660
1https://www.communitysignal.com/embodying-work...https://www.communitysignal.com/feed/Embodying Work-Life Balance as a Community Pro...Are you able to step away from your community ...1731883672b6cef58d910418489c551731883672
2https://identityweek.net/tech5-awarded-a-15-ye...https://www.planetbiometrics.com/rss/TECH5 awarded a 15-year contract with Virginia...TECH5 has announced a major contract win with ...1731883632084b136c500bb2ec05541731883632
3https://edtechmagazine.com/k12/article/2024/11...https://www.edtechmagazine.com/k12/rss.xml?tax...How Can You Prepare Your Cloud to Safely Imple...Many K–12 schools that operate in the cloud ha...17318836537827152faf07d39400e91731883653
4https://www.ufc.com/news/updates-ufc-309-jones...https://www.ufc.com/rss/newsUpdates To UFC 309: Jones vs MiocicUFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me...17318836622c774cd01403f31af6e21731883662
\n", "
" ], "text/plain": [ " link \\\n", "0 https://www.npr.org/2022/11/04/1134434712/plan... \n", "1 https://www.communitysignal.com/embodying-work... \n", "2 https://identityweek.net/tech5-awarded-a-15-ye... \n", "3 https://edtechmagazine.com/k12/article/2024/11... \n", "4 https://www.ufc.com/news/updates-ufc-309-jones... \n", "\n", " rss \\\n", "0 https://www.npr.org/rss/podcast.php?id=510289 \n", "1 https://www.communitysignal.com/feed/ \n", "2 https://www.planetbiometrics.com/rss/ \n", "3 https://www.edtechmagazine.com/k12/rss.xml?tax... \n", "4 https://www.ufc.com/rss/news \n", "\n", " title \\\n", "0 The crisis pollsters face ahead of the midterm... \n", "1 Embodying Work-Life Balance as a Community Pro... \n", "2 TECH5 awarded a 15-year contract with Virginia... \n", "3 How Can You Prepare Your Cloud to Safely Imple... \n", "4 Updates To UFC 309: Jones vs Miocic \n", "\n", " content unixTime rss_id \\\n", "0 Planet Money tries election polling\\n\\nEnlarge... 1731884660 8181d7a585 \n", "1 Are you able to step away from your community ... 1731883672 b6cef58d91 \n", "2 TECH5 has announced a major contract win with ... 1731883632 084b136c50 \n", "3 Many K–12 schools that operate in the cloud ha... 1731883653 7827152faf \n", "4 UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me... 
1731883662 2c774cd014 \n", "\n", " article_id unixtime \n", "0 05dfb6bb11 1731884660 \n", "1 0418489c55 1731883672 \n", "2 0bb2ec0554 1731883632 \n", "3 07d39400e9 1731883653 \n", "4 03f31af6e2 1731883662 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 2 }