{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RSS Article Batch Download Examples\n",
    "\n",
    "This notebook demonstrates how to batch download RSS articles from S3 into a single local file and load the result with pandas."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from time import time\n",
    "\n",
    "import boto3\n",
    "import pandas as pd\n",
    "from botocore.exceptions import ClientError\n",
    "from dotenv import load_dotenv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize the Downloader\n",
    "\n",
    "The project import and the downloader are created from the parent directory; the working directory is restored afterwards."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The project package (`src`) is imported relative to the directory one level\n",
    "# above this notebook. Work from there while importing it and building the\n",
    "# downloader, then always return to the notebook directory -- even if\n",
    "# something raises -- so the cell can be re-run safely.\n",
    "notebook_dir = os.getcwd()\n",
    "os.chdir('..')\n",
    "try:\n",
    "    from src.search.batch import S3BatchDownloader\n",
    "\n",
    "    load_dotenv(override=True)\n",
    "\n",
    "    downloader = S3BatchDownloader()\n",
    "finally:\n",
    "    os.chdir(notebook_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check S3 List Permission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "List permission: Allowed\n"
     ]
    }
   ],
   "source": [
    "BUCKET_NAME = 'open-rss-articles-us-east-1'\n",
    "\n",
    "s3 = boto3.client('s3')\n",
    "\n",
    "# Test ListObjects. MaxKeys=1 keeps the probe cheap: one key is enough to\n",
    "# learn whether the call is permitted.\n",
    "try:\n",
    "    s3.list_objects_v2(Bucket=BUCKET_NAME, MaxKeys=1)\n",
    "    print(\"List permission: Allowed\")\n",
    "except ClientError as e:\n",
    "    # Surface the service error code so a missing bucket or bad credentials\n",
    "    # is not mistaken for a plain permission denial.\n",
    "    error_code = e.response.get(\"Error\", {}).get(\"Code\", \"Unknown\")\n",
    "    print(f\"List permission: Denied ({error_code})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download Articles for a Date Range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloaded data to consolidated_data.csv in 59.01 seconds\n"
     ]
    }
   ],
   "source": [
    "start = time()\n",
    "output_path = \"consolidated_data.csv\"  # or \"consolidated_data.json\"\n",
    "\n",
    "# Define date range\n",
    "start_date = \"2024-11-17\"\n",
    "end_date = \"2024-11-20\"\n",
    "\n",
    "# Start downloading\n",
    "downloader.download_to_file(\n",
    "    output_path=output_path,\n",
    "    file_format=\"csv\",  # or \"json\"\n",
    "    start_date=start_date,\n",
    "    end_date=end_date\n",
    ")\n",
    "\n",
    "print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the Downloaded Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       " link \\\n",
       "0 https://www.cnet.com/home/internet/comporium-i... \n",
       "1 https://kotaku.com/lady-gaga-wednesday-season-... \n",
       "2 https://expeditionportal.com/classifieds-honda... \n",
       "3 http://www.influencerupdate.biz/news/68624/twi... \n",
       "4 https://thejournal.com/Articles/2024/10/29/Rep... \n",
       "\n",
       " rss \\\n",
       "0 https://www.cnet.com/rss/news/ \n",
       "1 https://kotaku.com/rss \n",
       "2 https://www.expeditionportal.com/feed/ \n",
       "3 https://www.influencerupdate.biz/rss/news/ \n",
       "4 https://thejournal.com/rss-feeds/news.aspx \n",
       "\n",
       " title \\\n",
       "0 Comporium Home Internet: Pricing, Speeds and A... \n",
       "1 Lady Gaga Appearing In Wednesday Season 2 Is A... \n",
       "2 2008 Honda Element EX ECamper :: Classifieds \n",
       "3 Twitch is testing mid-roll ads on creator broa... \n",
       "4 Report: 90% of Schools Depend on E-rate Fundin... \n",
       "\n",
       " content unixTime rss_id \\\n",
       "0 Unavailable in Provider unavailable in 90001 E... 1731883654 f5e6f52c79 \n",
       "1 The next stop on Lady Gaga’s years-long tour t... 1731883665 a0840ab3b4 \n",
       "2 Overlanding is about experiences first and for... 1731883665 ecdc66bb02 \n",
       "3 Streaming giant Twitch is planning to test mid... 1731883669 e34caba76d \n",
       "4 Report: 90% of Schools Depend on E-rate Fundin... 1731883616 efd9bb9654 \n",
       "\n",
       " article_id llm_summary embedding unixtime \n",
       "0 00089f7505 NaN NaN 1731883654 \n",
       "1 009f880a86 NaN NaN 1731883665 \n",
       "2 00ac77e95f NaN NaN 1731883665 \n",
       "3 0041bc4abf NaN NaN 1731883669 \n",
       "4 000a3da3b6 NaN NaN 1731883616 "
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Check Storage Statistics\n",
    "\n",
    "**TODO:** this section has no cells yet."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}