{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RSS Article Batch Download Examples\n",
    "\n",
    "This notebook demonstrates how to batch download RSS articles from S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from time import time\n",
    "\n",
    "import boto3\n",
    "import pandas as pd\n",
    "from botocore.exceptions import ClientError\n",
    "from dotenv import load_dotenv\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root imports: the `src` package is imported from the parent directory, so\n",
    "# switch there while importing and constructing the downloader.\n",
    "notebook_dir = os.getcwd()\n",
    "os.chdir('..')\n",
    "try:\n",
    "    from src.search.batch import S3BatchDownloader\n",
    "\n",
    "    load_dotenv(override=True)\n",
    "\n",
    "    downloader = S3BatchDownloader()\n",
    "finally:\n",
    "    # Always return to the starting directory, even if the import or setup\n",
    "    # fails, so re-running this cell never climbs further up the tree.\n",
    "    os.chdir(notebook_dir)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize the Downloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "List permission: Allowed\n"
     ]
    }
   ],
   "source": [
    "BUCKET_NAME = 'open-rss-articles-us-east-1'\n",
    "\n",
    "s3 = boto3.client('s3')\n",
    "\n",
    "# Test ListObjects\n",
    "try:\n",
    "    response = s3.list_objects_v2(Bucket=BUCKET_NAME)\n",
    "    print(\"List permission: Allowed\")\n",
    "except ClientError as e:\n",
    "    error_code = e.response.get('Error', {}).get('Code', 'Unknown')\n",
    "    if error_code == 'AccessDenied':\n",
    "        print(\"List permission: Denied\")\n",
    "    else:\n",
    "        # Any other failure (missing bucket, invalid credentials, ...) is not\n",
    "        # a permission verdict, so surface the actual error code instead.\n",
    "        print(f\"List permission: Unknown ({error_code})\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloaded data to consolidated_data.csv in 59.01 seconds\n"
     ]
    }
   ],
   "source": [
    "start = time()\n",
    "\n",
    "# The output file extension follows the chosen format so the two cannot drift apart.\n",
    "file_format = \"csv\"  # or \"json\"\n",
    "output_path = f\"consolidated_data.{file_format}\"\n",
    "\n",
    "# Define date range\n",
    "start_date = \"2024-11-17\"\n",
    "end_date = \"2024-11-20\"\n",
    "\n",
    "# Start downloading\n",
    "downloader.download_to_file(\n",
    "    output_path=output_path,\n",
    "    file_format=file_format,\n",
    "    start_date=start_date,\n",
    "    end_date=end_date\n",
    ")\n",
    "\n",
    "print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(output_path)"
   ]
  },
  { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | link | \n", "rss | \n", "title | \n", "content | \n", "unixTime | \n", "rss_id | \n", "article_id | \n", "llm_summary | \n", "embedding | \n", "unixtime | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "https://www.cnet.com/home/internet/comporium-i... | \n", "https://www.cnet.com/rss/news/ | \n", "Comporium Home Internet: Pricing, Speeds and A... | \n", "Unavailable in Provider unavailable in 90001 E... | \n", "1731883654 | \n", "f5e6f52c79 | \n", "00089f7505 | \n", "NaN | \n", "NaN | \n", "1731883654 | \n", "
| 1 | \n", "https://kotaku.com/lady-gaga-wednesday-season-... | \n", "https://kotaku.com/rss | \n", "Lady Gaga Appearing In Wednesday Season 2 Is A... | \n", "The next stop on Lady Gaga’s years-long tour t... | \n", "1731883665 | \n", "a0840ab3b4 | \n", "009f880a86 | \n", "NaN | \n", "NaN | \n", "1731883665 | \n", "
| 2 | \n", "https://expeditionportal.com/classifieds-honda... | \n", "https://www.expeditionportal.com/feed/ | \n", "2008 Honda Element EX ECamper :: Classifieds | \n", "Overlanding is about experiences first and for... | \n", "1731883665 | \n", "ecdc66bb02 | \n", "00ac77e95f | \n", "NaN | \n", "NaN | \n", "1731883665 | \n", "
| 3 | \n", "http://www.influencerupdate.biz/news/68624/twi... | \n", "https://www.influencerupdate.biz/rss/news/ | \n", "Twitch is testing mid-roll ads on creator broa... | \n", "Streaming giant Twitch is planning to test mid... | \n", "1731883669 | \n", "e34caba76d | \n", "0041bc4abf | \n", "NaN | \n", "NaN | \n", "1731883669 | \n", "
| 4 | \n", "https://thejournal.com/Articles/2024/10/29/Rep... | \n", "https://thejournal.com/rss-feeds/news.aspx | \n", "Report: 90% of Schools Depend on E-rate Fundin... | \n", "Report: 90% of Schools Depend on E-rate Fundin... | \n", "1731883616 | \n", "efd9bb9654 | \n", "000a3da3b6 | \n", "NaN | \n", "NaN | \n", "1731883616 | \n", "