{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RSS Article Batch Download Examples\n",
    "\n",
    "This notebook demonstrates how to batch download RSS articles from S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from time import time\n",
    "\n",
    "import boto3\n",
    "import pandas as pd\n",
    "from botocore.exceptions import ClientError\n",
    "from dotenv import load_dotenv\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize the Downloader\n",
    "\n",
    "The `src` package is imported from the parent directory, so the cell below steps up one level, loads the `.env` settings, creates the downloader, and then returns to the directory the notebook was started from."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/ceg98/IngestRSS\n"
     ]
    }
   ],
   "source": [
    "# Root imports: step up to the parent directory so that `src` is importable.\n",
    "notebook_dir = os.getcwd()\n",
    "os.chdir('..')\n",
    "try:\n",
    "    print(os.getcwd())\n",
    "\n",
    "    from src.search.batch import S3BatchDownloader\n",
    "\n",
    "    load_dotenv(override=True)\n",
    "\n",
    "    downloader = S3BatchDownloader()\n",
    "finally:\n",
    "    # Always return to the starting directory, even when the import or the\n",
    "    # initialization fails, so re-running this cell never climbs another level.\n",
    "    os.chdir(notebook_dir)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Verify S3 List Permission\n",
    "\n",
    "A quick probe against the bucket named by the `S3_BUCKET_NAME` environment variable before starting the download."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "List permission: Allowed\n"
     ]
    }
   ],
   "source": [
    "s3 = boto3.client('s3')\n",
    "bucket_name = os.getenv(\"S3_BUCKET_NAME\")\n",
    "\n",
    "# Test ListObjects. MaxKeys=1 keeps the probe cheap: one key is enough\n",
    "# to find out whether listing is permitted.\n",
    "try:\n",
    "    s3.list_objects_v2(Bucket=bucket_name, MaxKeys=1)\n",
    "    print(\"List permission: Allowed\")\n",
    "except ClientError as e:\n",
    "    # Only an access-denied error means the permission is missing; report\n",
    "    # anything else (missing bucket, expired credentials, ...) by its code.\n",
    "    error_code = e.response.get(\"Error\", {}).get(\"Code\", \"Unknown\")\n",
    "    if error_code in (\"AccessDenied\", \"403\"):\n",
    "        print(\"List permission: Denied\")\n",
    "    else:\n",
    "        print(f\"List permission check failed: {error_code}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download Articles for a Date Range\n",
    "\n",
    "Downloads every article between `start_date` and `end_date` into a single file. The recorded run below took about five minutes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Error downloading 2024/11/17/d16bca904a.json: Read timeout on endpoint URL: \"None\"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloaded data to consolidated_data.csv in 303.85 seconds\n"
     ]
    }
   ],
   "source": [
    "start = time()\n",
    "output_path = \"consolidated_data.csv\"  # or \"consolidated_data.json\"\n",
    "\n",
    "# Define date range\n",
    "start_date = \"2024-11-17\"\n",
    "end_date = \"2024-11-20\"\n",
    "\n",
    "# Start downloading\n",
    "downloader.download_to_file(\n",
    "    output_path=output_path,\n",
    "    file_format=\"csv\",  # or \"json\"\n",
    "    start_date=start_date,\n",
    "    end_date=end_date\n",
    ")\n",
    "\n",
    "print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the Downloaded Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the consolidated CSV written by the download cell.\n",
    "df = pd.read_csv(output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
| \n", " | link | \n", "rss | \n", "title | \n", "content | \n", "unixTime | \n", "rss_id | \n", "article_id | \n", "llm_summary | \n", "embedding | \n", "unixtime | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "https://www.getresponse.com/blog/thanksgiving-... | \n", "https://www.getresponse.com/blog/feed | \n", "How to write a heartfelt Thanksgiving email (w... | \n", "It’s nearly time for Thanksgiving – which mean... | \n", "1731883653 | \n", "90dc77288c | \n", "00c36e16ef | \n", "NaN | \n", "NaN | \n", "1731883653 | \n", "
| 1 | \n", "https://www.npr.org/2023/07/11/1187019058/summ... | \n", "https://www.npr.org/rss/podcast.php?id=510289 | \n", "Starting a business: identify pain points and ... | \n", "Summer School 1: Planet Money goes to business... | \n", "1731884630 | \n", "8181d7a585 | \n", "0174d34236 | \n", "NaN | \n", "NaN | \n", "1731884630 | \n", "
| 2 | \n", "https://www.vg247.com/fallout-season-two-repor... | \n", "https://www.vg247.com/feed/ | \n", "Fallout season two will reportedly star Macaul... | \n", "Warning: Spoilers for the first season of the ... | \n", "1731883716 | \n", "374d98cbb9 | \n", "006fa48e38 | \n", "NaN | \n", "NaN | \n", "1731883716 | \n", "
| 3 | \n", "https://insideclimatenews.org/news/15112024/to... | \n", "https://insideclimatenews.org/rss/ | \n", "As New York Congestion Pricing Turns a Corner,... | \n", "On Thursday, New York Gov. Kathy Hochul announ... | \n", "1731883625 | \n", "f1fc2a503e | \n", "00c2ede5ae | \n", "NaN | \n", "NaN | \n", "1731883625 | \n", "
| 4 | \n", "https://transom.org/2018/enticing-listeners-pr... | \n", "https://transom.org/feed/podcast/ | \n", "Enticing Listeners To Press Play Again | \n", "http://www.podtrac.com/pts/redirect.mp3/media.... | \n", "1731883665 | \n", "c00d35bf2d | \n", "01cce6dac3 | \n", "NaN | \n", "NaN | \n", "1731883665 | \n", "