IngestRSS/monthly_ingestion.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RSS Article Batch Download Examples\n",
    "\n",
    "This notebook demonstrates how to batch download RSS articles from S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "from time import time\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root imports\n",
    "\n",
    "from src.search.batch import S3BatchDownloader\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv(override=True)\n",
    "\n",
    "downloader = S3BatchDownloader()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize the Downloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "List permission: Allowed\n"
     ]
    }
   ],
   "source": [
    "import boto3\n",
    "from botocore.exceptions import ClientError\n",
    "\n",
    "s3 = boto3.client('s3')\n",
    "\n",
    "# Test ListObjects\n",
    "try:\n",
    "    response = s3.list_objects_v2(Bucket=os.getenv(\"S3_BUCKET_NAME\"))\n",
    "    print(\"List permission: Allowed\")\n",
    "except ClientError as e:\n",
    "    print(\"List permission: Denied\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 59592 objects to process\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 53%|█████▎    | 31707/59592 [04:33<6:08:33,  1.26object/s]IOStream.flush timed out\n",
      "100%|██████████| 59592/59592 [08:38<00:00, 114.83object/s]  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloaded data to November-1.csv in 605.81 seconds\n"
     ]
    }
   ],
   "source": [
    "start = time()\n",
    "output_path = \"November-1.csv\"  # or \"consolidated_data.json\"\n",
    "\n",
    "# Define date range\n",
    "start_date = \"2024-11-17\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n",
    "end_date = \"2024-11-22\"\n",
    "\n",
    "# Start downloading\n",
    "downloader.download_to_file(\n",
    "    output_path=output_path,\n",
    "    file_format=\"csv\",  # or \"json\"\n",
    "    start_date=start_date,\n",
    "    end_date=end_date\n",
    ")\n",
    "\n",
    "print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 50037 objects to process\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 66%|██████▌   | 33067/50037 [04:07<11:49, 23.92object/s]  IOStream.flush timed out\n",
      "100%|██████████| 50037/50037 [07:11<00:00, 116.06object/s] \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloaded data to November-2.csv in 539.74 seconds\n"
     ]
    }
   ],
   "source": [
    "start = time()\n",
    "output_path = \"November-2.csv\"  # or \"consolidated_data.json\"\n",
    "\n",
    "# Define date range\n",
    "start_date = \"2024-11-23\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n",
    "end_date = \"2024-11-27\"\n",
    "\n",
    "# Start downloading\n",
    "downloader.download_to_file(\n",
    "    output_path=output_path,\n",
    "    file_format=\"csv\",  # or \"json\"\n",
    "    start_date=start_date,\n",
    "    end_date=end_date\n",
    ")\n",
    "\n",
    "print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 23857 objects to process\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 23857/23857 [02:37<00:00, 151.22object/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloaded data to November-3.csv in 219.71 seconds\n"
     ]
    }
   ],
   "source": [
    "start = time()\n",
    "output_path = \"November-3.csv\"  # or \"consolidated_data.json\"\n",
    "\n",
    "\n",
    "\n",
    "# Define date range\n",
    "start_date = \"2024-11-28\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n",
    "end_date = \"2024-11-30\"\n",
    "\n",
    "# Start downloading\n",
    "downloader.download_to_file(\n",
    "    output_path=output_path,\n",
    "    file_format=\"csv\",  # or \"json\"\n",
    "    start_date=start_date,\n",
    "    end_date=end_date\n",
    ")\n",
    "\n",
    "print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Aggregating"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Aggregated DataFrame:\n",
      "                                                link  \\\n",
      "0  https://www.npr.org/2022/11/04/1134434712/plan...   \n",
      "1  https://www.communitysignal.com/embodying-work...   \n",
      "2  https://identityweek.net/tech5-awarded-a-15-ye...   \n",
      "3  https://edtechmagazine.com/k12/article/2024/11...   \n",
      "4  https://www.ufc.com/news/updates-ufc-309-jones...   \n",
      "\n",
      "                                                 rss  \\\n",
      "0      https://www.npr.org/rss/podcast.php?id=510289   \n",
      "1              https://www.communitysignal.com/feed/   \n",
      "2              https://www.planetbiometrics.com/rss/   \n",
      "3  https://www.edtechmagazine.com/k12/rss.xml?tax...   \n",
      "4                       https://www.ufc.com/rss/news   \n",
      "\n",
      "                                               title  \\\n",
      "0  The crisis pollsters face ahead of the midterm...   \n",
      "1  Embodying Work-Life Balance as a Community Pro...   \n",
      "2  TECH5 awarded a 15-year contract with Virginia...   \n",
      "3  How Can You Prepare Your Cloud to Safely Imple...   \n",
      "4                Updates To UFC 309: Jones vs Miocic   \n",
      "\n",
      "                                             content    unixTime      rss_id  \\\n",
      "0  Planet Money tries election polling\\n\\nEnlarge...  1731884660  8181d7a585   \n",
      "1  Are you able to step away from your community ...  1731883672  b6cef58d91   \n",
      "2  TECH5 has announced a major contract win with ...  1731883632  084b136c50   \n",
      "3  Many K–12 schools that operate in the cloud ha...  1731883653  7827152faf   \n",
      "4  UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me...  1731883662  2c774cd014   \n",
      "\n",
      "   article_id  llm_summary  embedding    unixtime  \n",
      "0  05dfb6bb11          NaN        NaN  1731884660  \n",
      "1  0418489c55          NaN        NaN  1731883672  \n",
      "2  0bb2ec0554          NaN        NaN  1731883632  \n",
      "3  07d39400e9          NaN        NaN  1731883653  \n",
      "4  03f31af6e2          NaN        NaN  1731883662  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import glob\n",
    "\n",
    "# List to hold dataframes\n",
    "dataframes = []\n",
    "\n",
    "# Loop through the files November-1.csv to November-3.csv\n",
    "for i in range(1, 4):\n",
    "    filename = f\"November-{i}.csv\"\n",
    "    try:\n",
    "        # Read each CSV and append to the list\n",
    "        df = pd.read_csv(filename)\n",
    "        dataframes.append(df)\n",
    "    except FileNotFoundError:\n",
    "        print(f\"File {filename} not found, skipping.\")\n",
    "\n",
    "# Aggregate the dataframes\n",
    "if dataframes:\n",
    "    aggregated_df = pd.concat(dataframes, ignore_index=True)\n",
    "    print(\"Aggregated DataFrame:\")\n",
    "    print(aggregated_df.head())\n",
    "else:\n",
    "    print(\"No files were aggregated.\")\n",
    "\n",
    "# If you want to save the result to a new CSV\n",
    "aggregated_df.to_csv(\"Aggregated_November.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"Aggregated_November.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop columns which are all null values. \n",
    "df = df.dropna(axis=1, how='all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get null value percentage for each column\n",
    "null_percent = df.isnull().mean().round(4) * 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop rows where the content of the column is null\n",
    "df = df.dropna(subset=['content'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "link          0.0\n",
       "rss           0.0\n",
       "title         0.2\n",
       "content       0.0\n",
       "unixTime      0.0\n",
       "rss_id        0.0\n",
       "article_id    0.0\n",
       "unixtime      0.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Get null value percentage for each column\n",
    "null_percent = df.isnull().mean().round(4) * 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.dropna(subset=['title'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get summary statistics for the dataframe\n",
    "summary = df.describe(include='all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save df \n",
    "df.to_csv(\"Aggregated_November.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>link</th>\n",
       "      <th>rss</th>\n",
       "      <th>title</th>\n",
       "      <th>content</th>\n",
       "      <th>unixTime</th>\n",
       "      <th>rss_id</th>\n",
       "      <th>article_id</th>\n",
       "      <th>unixtime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>130388</td>\n",
       "      <td>130388</td>\n",
       "      <td>130388</td>\n",
       "      <td>130388</td>\n",
       "      <td>1.303880e+05</td>\n",
       "      <td>130388</td>\n",
       "      <td>130388</td>\n",
       "      <td>1.303880e+05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>37817</td>\n",
       "      <td>688</td>\n",
       "      <td>37383</td>\n",
       "      <td>37658</td>\n",
       "      <td>NaN</td>\n",
       "      <td>688</td>\n",
       "      <td>37817</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>https://www.npr.org/2022/11/04/1134434712/plan...</td>\n",
       "      <td>https://blogs.unity3d.com/feed/</td>\n",
       "      <td>TechPipeline</td>\n",
       "      <td>\"Unity\", Unity logos, and other Unity trademar...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>b78e974545</td>\n",
       "      <td>05dfb6bb11</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>11</td>\n",
       "      <td>3918</td>\n",
       "      <td>330</td>\n",
       "      <td>3912</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3918</td>\n",
       "      <td>11</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.732375e+09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.732375e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.214484e+05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.214484e+05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.731884e+09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.731884e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.732085e+09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.732085e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.732402e+09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.732402e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.732632e+09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.732632e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.732922e+09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.732922e+09</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                     link  \\\n",
       "count                                              130388   \n",
       "unique                                              37817   \n",
       "top     https://www.npr.org/2022/11/04/1134434712/plan...   \n",
       "freq                                                   11   \n",
       "mean                                                  NaN   \n",
       "std                                                   NaN   \n",
       "min                                                   NaN   \n",
       "25%                                                   NaN   \n",
       "50%                                                   NaN   \n",
       "75%                                                   NaN   \n",
       "max                                                   NaN   \n",
       "\n",
       "                                    rss         title  \\\n",
       "count                            130388        130388   \n",
       "unique                              688         37383   \n",
       "top     https://blogs.unity3d.com/feed/  TechPipeline   \n",
       "freq                               3918           330   \n",
       "mean                                NaN           NaN   \n",
       "std                                 NaN           NaN   \n",
       "min                                 NaN           NaN   \n",
       "25%                                 NaN           NaN   \n",
       "50%                                 NaN           NaN   \n",
       "75%                                 NaN           NaN   \n",
       "max                                 NaN           NaN   \n",
       "\n",
       "                                                  content      unixTime  \\\n",
       "count                                              130388  1.303880e+05   \n",
       "unique                                              37658           NaN   \n",
       "top     \"Unity\", Unity logos, and other Unity trademar...           NaN   \n",
       "freq                                                 3912           NaN   \n",
       "mean                                                  NaN  1.732375e+09   \n",
       "std                                                   NaN  3.214484e+05   \n",
       "min                                                   NaN  1.731884e+09   \n",
       "25%                                                   NaN  1.732085e+09   \n",
       "50%                                                   NaN  1.732402e+09   \n",
       "75%                                                   NaN  1.732632e+09   \n",
       "max                                                   NaN  1.732922e+09   \n",
       "\n",
       "            rss_id  article_id      unixtime  \n",
       "count       130388      130388  1.303880e+05  \n",
       "unique         688       37817           NaN  \n",
       "top     b78e974545  05dfb6bb11           NaN  \n",
       "freq          3918          11           NaN  \n",
       "mean           NaN         NaN  1.732375e+09  \n",
       "std            NaN         NaN  3.214484e+05  \n",
       "min            NaN         NaN  1.731884e+09  \n",
       "25%            NaN         NaN  1.732085e+09  \n",
       "50%            NaN         NaN  1.732402e+09  \n",
       "75%            NaN         NaN  1.732632e+09  \n",
       "max            NaN         NaN  1.732922e+09  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>link</th>\n",
       "      <th>rss</th>\n",
       "      <th>title</th>\n",
       "      <th>content</th>\n",
       "      <th>unixTime</th>\n",
       "      <th>rss_id</th>\n",
       "      <th>article_id</th>\n",
       "      <th>unixtime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://www.npr.org/2022/11/04/1134434712/plan...</td>\n",
       "      <td>https://www.npr.org/rss/podcast.php?id=510289</td>\n",
       "      <td>The crisis pollsters face ahead of the midterm...</td>\n",
       "      <td>Planet Money tries election polling\\n\\nEnlarge...</td>\n",
       "      <td>1731884660</td>\n",
       "      <td>8181d7a585</td>\n",
       "      <td>05dfb6bb11</td>\n",
       "      <td>1731884660</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://www.communitysignal.com/embodying-work...</td>\n",
       "      <td>https://www.communitysignal.com/feed/</td>\n",
       "      <td>Embodying Work-Life Balance as a Community Pro...</td>\n",
       "      <td>Are you able to step away from your community ...</td>\n",
       "      <td>1731883672</td>\n",
       "      <td>b6cef58d91</td>\n",
       "      <td>0418489c55</td>\n",
       "      <td>1731883672</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://identityweek.net/tech5-awarded-a-15-ye...</td>\n",
       "      <td>https://www.planetbiometrics.com/rss/</td>\n",
       "      <td>TECH5 awarded a 15-year contract with Virginia...</td>\n",
       "      <td>TECH5 has announced a major contract win with ...</td>\n",
       "      <td>1731883632</td>\n",
       "      <td>084b136c50</td>\n",
       "      <td>0bb2ec0554</td>\n",
       "      <td>1731883632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://edtechmagazine.com/k12/article/2024/11...</td>\n",
       "      <td>https://www.edtechmagazine.com/k12/rss.xml?tax...</td>\n",
       "      <td>How Can You Prepare Your Cloud to Safely Imple...</td>\n",
       "      <td>Many K–12 schools that operate in the cloud ha...</td>\n",
       "      <td>1731883653</td>\n",
       "      <td>7827152faf</td>\n",
       "      <td>07d39400e9</td>\n",
       "      <td>1731883653</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://www.ufc.com/news/updates-ufc-309-jones...</td>\n",
       "      <td>https://www.ufc.com/rss/news</td>\n",
       "      <td>Updates To UFC 309: Jones vs Miocic</td>\n",
       "      <td>UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me...</td>\n",
       "      <td>1731883662</td>\n",
       "      <td>2c774cd014</td>\n",
       "      <td>03f31af6e2</td>\n",
       "      <td>1731883662</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                link  \\\n",
       "0  https://www.npr.org/2022/11/04/1134434712/plan...   \n",
       "1  https://www.communitysignal.com/embodying-work...   \n",
       "2  https://identityweek.net/tech5-awarded-a-15-ye...   \n",
       "3  https://edtechmagazine.com/k12/article/2024/11...   \n",
       "4  https://www.ufc.com/news/updates-ufc-309-jones...   \n",
       "\n",
       "                                                 rss  \\\n",
       "0      https://www.npr.org/rss/podcast.php?id=510289   \n",
       "1              https://www.communitysignal.com/feed/   \n",
       "2              https://www.planetbiometrics.com/rss/   \n",
       "3  https://www.edtechmagazine.com/k12/rss.xml?tax...   \n",
       "4                       https://www.ufc.com/rss/news   \n",
       "\n",
       "                                               title  \\\n",
       "0  The crisis pollsters face ahead of the midterm...   \n",
       "1  Embodying Work-Life Balance as a Community Pro...   \n",
       "2  TECH5 awarded a 15-year contract with Virginia...   \n",
       "3  How Can You Prepare Your Cloud to Safely Imple...   \n",
       "4                Updates To UFC 309: Jones vs Miocic   \n",
       "\n",
       "                                             content    unixTime      rss_id  \\\n",
       "0  Planet Money tries election polling\\n\\nEnlarge...  1731884660  8181d7a585   \n",
       "1  Are you able to step away from your community ...  1731883672  b6cef58d91   \n",
       "2  TECH5 has announced a major contract win with ...  1731883632  084b136c50   \n",
       "3  Many K–12 schools that operate in the cloud ha...  1731883653  7827152faf   \n",
       "4  UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me...  1731883662  2c774cd014   \n",
       "\n",
       "   article_id    unixtime  \n",
       "0  05dfb6bb11  1731884660  \n",
       "1  0418489c55  1731883672  \n",
       "2  0bb2ec0554  1731883632  \n",
       "3  07d39400e9  1731883653  \n",
       "4  03f31af6e2  1731883662  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}