{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RSS Article Batch Download Examples\n",
    "\n",
    "This notebook demonstrates how to batch download RSS articles from S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from time import time\n",
    "\n",
    "import boto3\n",
    "import pandas as pd\n",
    "from botocore.exceptions import ClientError\n",
    "from dotenv import load_dotenv\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize the Downloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/ceg98/IngestRSS\n"
     ]
    }
   ],
   "source": [
    "# Root imports: the `src` package is imported relative to the parent directory,\n",
    "# so step up one level for the import and the downloader setup only.\n",
    "notebook_dir = os.getcwd()\n",
    "os.chdir('..')\n",
    "try:\n",
    "    print(os.getcwd())\n",
    "\n",
    "    from src.search.batch import S3BatchDownloader\n",
    "\n",
    "    load_dotenv(override=True)\n",
    "\n",
    "    downloader = S3BatchDownloader()\n",
    "finally:\n",
    "    # Always return to the notebook folder, even if the import or setup fails,\n",
    "    # so a failed or repeated run never leaves the kernel in the wrong directory.\n",
    "    os.chdir(notebook_dir)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check S3 List Permission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "List permission: Allowed\n"
     ]
    }
   ],
   "source": [
    "bucket_name = os.getenv(\"S3_BUCKET_NAME\")\n",
    "if not bucket_name:\n",
    "    # Without this guard boto3 fails with an opaque parameter-validation error.\n",
    "    raise RuntimeError(\"S3_BUCKET_NAME is not set; check your .env file\")\n",
    "\n",
    "s3 = boto3.client('s3')\n",
    "\n",
    "# Test ListObjects\n",
    "try:\n",
    "    # One key is enough to prove the permission; no need to list a full page.\n",
    "    s3.list_objects_v2(Bucket=bucket_name, MaxKeys=1)\n",
    "    print(\"List permission: Allowed\")\n",
    "except ClientError as e:\n",
    "    error_code = e.response.get(\"Error\", {}).get(\"Code\", \"Unknown\")\n",
    "    if error_code == \"AccessDenied\":\n",
    "        print(\"List permission: Denied\")\n",
    "    else:\n",
    "        # Not a permission problem (e.g. NoSuchBucket): report what actually happened.\n",
    "        print(f\"List check failed ({error_code}): {e}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download Articles for a Date Range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 59592 objects to process\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 59592/59592 [06:23<00:00, 155.31object/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloaded data to consolidated_data.csv in 463.50 seconds\n"
     ]
    }
   ],
   "source": [
    "start = time()\n",
    "output_path = \"consolidated_data.csv\"  # or \"consolidated_data.json\"\n",
    "\n",
    "# Define date range\n",
    "# FIXME: Fix the error where data can't be collected before the date you started collecting.\n",
    "start_date = \"2024-11-17\"\n",
    "end_date = \"2024-11-22\"\n",
    "\n",
    "# Start downloading\n",
    "downloader.download_to_file(\n",
    "    output_path=output_path,\n",
    "    file_format=\"csv\",  # or \"json\"\n",
    "    start_date=start_date,\n",
    "    end_date=end_date\n",
    ")\n",
    "\n",
    "print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and Inspect the Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
| \n", " | link | \n", "rss | \n", "title | \n", "content | \n", "unixTime | \n", "rss_id | \n", "article_id | \n", "llm_summary | \n", "embedding | \n", "unixtime | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "https://meetings.skift.com/2024/11/07/critical... | \n", "https://www.eventmanagerblog.com/feed | \n", "Critical Accessibility Gaps Exist in Business ... | \n", "Joint research assesses the current state of i... | \n", "1731883626 | \n", "22219df62c | \n", "0236fb6aab | \n", "NaN | \n", "NaN | \n", "1731883626 | \n", "
| 1 | \n", "https://www.npr.org/2021/08/27/1031717918/plan... | \n", "https://www.npr.org/rss/podcast.php?id=510289 | \n", "Planet Money Summer School 6: Crypto & Commenc... | \n", "Planet Money Summer School 6: Crypto & Commenc... | \n", "1731884703 | \n", "8181d7a585 | \n", "03c9f1092b | \n", "NaN | \n", "NaN | \n", "1731884703 | \n", "
| 2 | \n", "https://www.ufc.com/news/mauricio-ruffy-enjoyi... | \n", "https://www.ufc.com/rss/news | \n", "Mauricio Ruffy Is Enjoying The Process | \n", "“I saw the UFC's care for me, always putting m... | \n", "1731883664 | \n", "2c774cd014 | \n", "05cfd6979f | \n", "NaN | \n", "NaN | \n", "1731883664 | \n", "
| 3 | \n", "https://www.vg247.com/dragon-age-the-veilguard... | \n", "https://www.vg247.com/feed/ | \n", "Dragon Age: The Veilguard's first patch isn't ... | \n", "Dragon Age: The Veilguard has now gotten its f... | \n", "1731883714 | \n", "374d98cbb9 | \n", "00505e505b | \n", "NaN | \n", "NaN | \n", "1731883714 | \n", "
| 4 | \n", "https://www.frugalrules.com/finances-back-on-t... | \n", "https://www.frugalrules.com/feed/ | \n", "How to Get Your Finances Back on Track Startin... | \n", "We all face it. Life gets the best of us, and ... | \n", "1731883633 | \n", "e18025b98e | \n", "03c3406d27 | \n", "NaN | \n", "NaN | \n", "1731883633 | \n", "