mirror of
https://github.com/aljazceru/IngestRSS.git
synced 2025-12-17 05:54:22 +01:00
792 lines
25 KiB
Plaintext
792 lines
25 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# RSS Article Batch Download Examples\n",
|
||
"\n",
|
||
"This notebook demonstrates how to batch download RSS articles from S3."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"import pandas as pd\n",
|
||
"from time import time\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Root imports\n",
|
||
"\n",
|
||
"from src.search.batch import S3BatchDownloader\n",
|
||
"from dotenv import load_dotenv\n",
|
||
"\n",
|
||
"load_dotenv(override=True)\n",
|
||
"\n",
|
||
"downloader = S3BatchDownloader()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Verify S3 Access and Download Articles"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"List permission: Allowed\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import boto3\n",
"from botocore.exceptions import ClientError\n",
"\n",
"s3 = boto3.client('s3')\n",
"\n",
"# Check that the configured credentials are allowed to list the article bucket.\n",
"bucket_name = os.getenv(\"S3_BUCKET_NAME\")\n",
"if not bucket_name:\n",
"    # Bucket=None fails boto3 parameter validation with ParamValidationError,\n",
"    # which is not a ClientError and would escape the handler below.\n",
"    print(\"S3_BUCKET_NAME is not set; skipping the list-permission check\")\n",
"else:\n",
"    try:\n",
"        # MaxKeys=1: one key is enough to prove access, no need to list a full page.\n",
"        s3.list_objects_v2(Bucket=bucket_name, MaxKeys=1)\n",
"        print(\"List permission: Allowed\")\n",
"    except ClientError as e:\n",
"        # Report the service error code so that e.g. a missing bucket is not\n",
"        # mistaken for an access problem.\n",
"        error_code = e.response.get(\"Error\", {}).get(\"Code\", \"Unknown\")\n",
"        print(f\"List permission: Denied ({error_code})\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Found 59592 objects to process\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" 53%|█████▎ | 31707/59592 [04:33<6:08:33, 1.26object/s]IOStream.flush timed out\n",
|
||
"100%|██████████| 59592/59592 [08:38<00:00, 114.83object/s] \n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Downloaded data to November-1.csv in 605.81 seconds\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Export one date window of articles to a file and report the elapsed time.\n",
"start = time()\n",
"\n",
"# Target file (or \"consolidated_data.json\")\n",
"output_path = \"November-1.csv\"\n",
"\n",
"# Date window to export.\n",
"# FIXME: data from before the date collection started cannot be downloaded; fix that error.\n",
"start_date, end_date = \"2024-11-17\", \"2024-11-22\"\n",
"\n",
"# Run the export; file_format may also be \"json\".\n",
"downloader.download_to_file(\n",
"    output_path=output_path, file_format=\"csv\",\n",
"    start_date=start_date, end_date=end_date,\n",
")\n",
"\n",
"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Found 50037 objects to process\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" 66%|██████▌ | 33067/50037 [04:07<11:49, 23.92object/s] IOStream.flush timed out\n",
|
||
"100%|██████████| 50037/50037 [07:11<00:00, 116.06object/s] \n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Downloaded data to November-2.csv in 539.74 seconds\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Export one date window of articles to a file and report the elapsed time.\n",
"start = time()\n",
"\n",
"# Target file (or \"consolidated_data.json\")\n",
"output_path = \"November-2.csv\"\n",
"\n",
"# Date window to export.\n",
"# FIXME: data from before the date collection started cannot be downloaded; fix that error.\n",
"start_date, end_date = \"2024-11-23\", \"2024-11-27\"\n",
"\n",
"# Run the export; file_format may also be \"json\".\n",
"downloader.download_to_file(\n",
"    output_path=output_path, file_format=\"csv\",\n",
"    start_date=start_date, end_date=end_date,\n",
")\n",
"\n",
"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Found 23857 objects to process\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"100%|██████████| 23857/23857 [02:37<00:00, 151.22object/s]\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Downloaded data to November-3.csv in 219.71 seconds\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Export one date window of articles to a file and report the elapsed time.\n",
"start = time()\n",
"\n",
"# Target file (or \"consolidated_data.json\")\n",
"output_path = \"November-3.csv\"\n",
"\n",
"# Date window to export.\n",
"# FIXME: data from before the date collection started cannot be downloaded; fix that error.\n",
"start_date, end_date = \"2024-11-28\", \"2024-11-30\"\n",
"\n",
"# Run the export; file_format may also be \"json\".\n",
"downloader.download_to_file(\n",
"    output_path=output_path, file_format=\"csv\",\n",
"    start_date=start_date, end_date=end_date,\n",
")\n",
"\n",
"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Aggregating the Downloaded CSV Files"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Aggregated DataFrame:\n",
|
||
" link \\\n",
|
||
"0 https://www.npr.org/2022/11/04/1134434712/plan... \n",
|
||
"1 https://www.communitysignal.com/embodying-work... \n",
|
||
"2 https://identityweek.net/tech5-awarded-a-15-ye... \n",
|
||
"3 https://edtechmagazine.com/k12/article/2024/11... \n",
|
||
"4 https://www.ufc.com/news/updates-ufc-309-jones... \n",
|
||
"\n",
|
||
" rss \\\n",
|
||
"0 https://www.npr.org/rss/podcast.php?id=510289 \n",
|
||
"1 https://www.communitysignal.com/feed/ \n",
|
||
"2 https://www.planetbiometrics.com/rss/ \n",
|
||
"3 https://www.edtechmagazine.com/k12/rss.xml?tax... \n",
|
||
"4 https://www.ufc.com/rss/news \n",
|
||
"\n",
|
||
" title \\\n",
|
||
"0 The crisis pollsters face ahead of the midterm... \n",
|
||
"1 Embodying Work-Life Balance as a Community Pro... \n",
|
||
"2 TECH5 awarded a 15-year contract with Virginia... \n",
|
||
"3 How Can You Prepare Your Cloud to Safely Imple... \n",
|
||
"4 Updates To UFC 309: Jones vs Miocic \n",
|
||
"\n",
|
||
" content unixTime rss_id \\\n",
|
||
"0 Planet Money tries election polling\\n\\nEnlarge... 1731884660 8181d7a585 \n",
|
||
"1 Are you able to step away from your community ... 1731883672 b6cef58d91 \n",
|
||
"2 TECH5 has announced a major contract win with ... 1731883632 084b136c50 \n",
|
||
"3 Many K–12 schools that operate in the cloud ha... 1731883653 7827152faf \n",
|
||
"4 UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me... 1731883662 2c774cd014 \n",
|
||
"\n",
|
||
" article_id llm_summary embedding unixtime \n",
|
||
"0 05dfb6bb11 NaN NaN 1731884660 \n",
|
||
"1 0418489c55 NaN NaN 1731883672 \n",
|
||
"2 0bb2ec0554 NaN NaN 1731883632 \n",
|
||
"3 07d39400e9 NaN NaN 1731883653 \n",
|
||
"4 03f31af6e2 NaN NaN 1731883662 \n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
"import glob\n",
"\n",
"# Frames read from the per-window export files.\n",
"dataframes = []\n",
"\n",
"# November-1.csv .. November-3.csv; a missing file is reported and skipped.\n",
"for i in range(1, 4):\n",
"    filename = f\"November-{i}.csv\"\n",
"    try:\n",
"        df = pd.read_csv(filename)\n",
"        dataframes.append(df)\n",
"    except FileNotFoundError:\n",
"        print(f\"File {filename} not found, skipping.\")\n",
"\n",
"if dataframes:\n",
"    # Stack the parts into a single frame with a fresh 0..n-1 index.\n",
"    aggregated_df = pd.concat(dataframes, ignore_index=True)\n",
"    print(\"Aggregated DataFrame:\")\n",
"    print(aggregated_df.head())\n",
"\n",
"    # Save only when something was aggregated: outside this branch\n",
"    # aggregated_df is undefined on a fresh kernel (NameError) or stale\n",
"    # from an earlier run.\n",
"    aggregated_df.to_csv(\"Aggregated_November.csv\", index=False)\n",
"else:\n",
"    print(\"No files were aggregated.\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df = pd.read_csv(\"Aggregated_November.csv\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Drop columns whose values are all null.\n",
|
||
"df = df.dropna(axis=1, how='all')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Get null value percentage for each column\n",
|
||
"null_percent = df.isnull().mean().round(4) * 100"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Drop rows where the 'content' column is null\n",
|
||
"df = df.dropna(subset=['content'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"link 0.0\n",
|
||
"rss 0.0\n",
|
||
"title 0.2\n",
|
||
"content 0.0\n",
|
||
"unixTime 0.0\n",
|
||
"rss_id 0.0\n",
|
||
"article_id 0.0\n",
|
||
"unixtime 0.0\n",
|
||
"dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Percentage of null values per column (fraction rounded to 4 places, then scaled to percent).\n",
"null_percent = df.isnull().mean().round(4) * 100\n",
"# Trailing expression so the cell actually displays the Series; an assignment alone shows nothing.\n",
"null_percent"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df = df.dropna(subset=['title'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Get summary statistics for the dataframe\n",
|
||
"summary = df.describe(include='all')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Save the cleaned DataFrame, overwriting Aggregated_November.csv\n",
|
||
"df.to_csv(\"Aggregated_November.csv\", index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>link</th>\n",
|
||
" <th>rss</th>\n",
|
||
" <th>title</th>\n",
|
||
" <th>content</th>\n",
|
||
" <th>unixTime</th>\n",
|
||
" <th>rss_id</th>\n",
|
||
" <th>article_id</th>\n",
|
||
" <th>unixtime</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>count</th>\n",
|
||
" <td>130388</td>\n",
|
||
" <td>130388</td>\n",
|
||
" <td>130388</td>\n",
|
||
" <td>130388</td>\n",
|
||
" <td>1.303880e+05</td>\n",
|
||
" <td>130388</td>\n",
|
||
" <td>130388</td>\n",
|
||
" <td>1.303880e+05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>unique</th>\n",
|
||
" <td>37817</td>\n",
|
||
" <td>688</td>\n",
|
||
" <td>37383</td>\n",
|
||
" <td>37658</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>688</td>\n",
|
||
" <td>37817</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>top</th>\n",
|
||
" <td>https://www.npr.org/2022/11/04/1134434712/plan...</td>\n",
|
||
" <td>https://blogs.unity3d.com/feed/</td>\n",
|
||
" <td>TechPipeline</td>\n",
|
||
" <td>\"Unity\", Unity logos, and other Unity trademar...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>b78e974545</td>\n",
|
||
" <td>05dfb6bb11</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>freq</th>\n",
|
||
" <td>11</td>\n",
|
||
" <td>3918</td>\n",
|
||
" <td>330</td>\n",
|
||
" <td>3912</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>3918</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>mean</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.732375e+09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.732375e+09</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>3.214484e+05</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>3.214484e+05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.731884e+09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.731884e+09</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25%</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.732085e+09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.732085e+09</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50%</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.732402e+09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.732402e+09</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>75%</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.732632e+09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.732632e+09</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>max</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.732922e+09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.732922e+09</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" link \\\n",
|
||
"count 130388 \n",
|
||
"unique 37817 \n",
|
||
"top https://www.npr.org/2022/11/04/1134434712/plan... \n",
|
||
"freq 11 \n",
|
||
"mean NaN \n",
|
||
"std NaN \n",
|
||
"min NaN \n",
|
||
"25% NaN \n",
|
||
"50% NaN \n",
|
||
"75% NaN \n",
|
||
"max NaN \n",
|
||
"\n",
|
||
" rss title \\\n",
|
||
"count 130388 130388 \n",
|
||
"unique 688 37383 \n",
|
||
"top https://blogs.unity3d.com/feed/ TechPipeline \n",
|
||
"freq 3918 330 \n",
|
||
"mean NaN NaN \n",
|
||
"std NaN NaN \n",
|
||
"min NaN NaN \n",
|
||
"25% NaN NaN \n",
|
||
"50% NaN NaN \n",
|
||
"75% NaN NaN \n",
|
||
"max NaN NaN \n",
|
||
"\n",
|
||
" content unixTime \\\n",
|
||
"count 130388 1.303880e+05 \n",
|
||
"unique 37658 NaN \n",
|
||
"top \"Unity\", Unity logos, and other Unity trademar... NaN \n",
|
||
"freq 3912 NaN \n",
|
||
"mean NaN 1.732375e+09 \n",
|
||
"std NaN 3.214484e+05 \n",
|
||
"min NaN 1.731884e+09 \n",
|
||
"25% NaN 1.732085e+09 \n",
|
||
"50% NaN 1.732402e+09 \n",
|
||
"75% NaN 1.732632e+09 \n",
|
||
"max NaN 1.732922e+09 \n",
|
||
"\n",
|
||
" rss_id article_id unixtime \n",
|
||
"count 130388 130388 1.303880e+05 \n",
|
||
"unique 688 37817 NaN \n",
|
||
"top b78e974545 05dfb6bb11 NaN \n",
|
||
"freq 3918 11 NaN \n",
|
||
"mean NaN NaN 1.732375e+09 \n",
|
||
"std NaN NaN 3.214484e+05 \n",
|
||
"min NaN NaN 1.731884e+09 \n",
|
||
"25% NaN NaN 1.732085e+09 \n",
|
||
"50% NaN NaN 1.732402e+09 \n",
|
||
"75% NaN NaN 1.732632e+09 \n",
|
||
"max NaN NaN 1.732922e+09 "
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"summary"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>link</th>\n",
|
||
" <th>rss</th>\n",
|
||
" <th>title</th>\n",
|
||
" <th>content</th>\n",
|
||
" <th>unixTime</th>\n",
|
||
" <th>rss_id</th>\n",
|
||
" <th>article_id</th>\n",
|
||
" <th>unixtime</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>https://www.npr.org/2022/11/04/1134434712/plan...</td>\n",
|
||
" <td>https://www.npr.org/rss/podcast.php?id=510289</td>\n",
|
||
" <td>The crisis pollsters face ahead of the midterm...</td>\n",
|
||
" <td>Planet Money tries election polling\\n\\nEnlarge...</td>\n",
|
||
" <td>1731884660</td>\n",
|
||
" <td>8181d7a585</td>\n",
|
||
" <td>05dfb6bb11</td>\n",
|
||
" <td>1731884660</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>https://www.communitysignal.com/embodying-work...</td>\n",
|
||
" <td>https://www.communitysignal.com/feed/</td>\n",
|
||
" <td>Embodying Work-Life Balance as a Community Pro...</td>\n",
|
||
" <td>Are you able to step away from your community ...</td>\n",
|
||
" <td>1731883672</td>\n",
|
||
" <td>b6cef58d91</td>\n",
|
||
" <td>0418489c55</td>\n",
|
||
" <td>1731883672</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>https://identityweek.net/tech5-awarded-a-15-ye...</td>\n",
|
||
" <td>https://www.planetbiometrics.com/rss/</td>\n",
|
||
" <td>TECH5 awarded a 15-year contract with Virginia...</td>\n",
|
||
" <td>TECH5 has announced a major contract win with ...</td>\n",
|
||
" <td>1731883632</td>\n",
|
||
" <td>084b136c50</td>\n",
|
||
" <td>0bb2ec0554</td>\n",
|
||
" <td>1731883632</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>https://edtechmagazine.com/k12/article/2024/11...</td>\n",
|
||
" <td>https://www.edtechmagazine.com/k12/rss.xml?tax...</td>\n",
|
||
" <td>How Can You Prepare Your Cloud to Safely Imple...</td>\n",
|
||
" <td>Many K–12 schools that operate in the cloud ha...</td>\n",
|
||
" <td>1731883653</td>\n",
|
||
" <td>7827152faf</td>\n",
|
||
" <td>07d39400e9</td>\n",
|
||
" <td>1731883653</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>https://www.ufc.com/news/updates-ufc-309-jones...</td>\n",
|
||
" <td>https://www.ufc.com/rss/news</td>\n",
|
||
" <td>Updates To UFC 309: Jones vs Miocic</td>\n",
|
||
" <td>UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me...</td>\n",
|
||
" <td>1731883662</td>\n",
|
||
" <td>2c774cd014</td>\n",
|
||
" <td>03f31af6e2</td>\n",
|
||
" <td>1731883662</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" link \\\n",
|
||
"0 https://www.npr.org/2022/11/04/1134434712/plan... \n",
|
||
"1 https://www.communitysignal.com/embodying-work... \n",
|
||
"2 https://identityweek.net/tech5-awarded-a-15-ye... \n",
|
||
"3 https://edtechmagazine.com/k12/article/2024/11... \n",
|
||
"4 https://www.ufc.com/news/updates-ufc-309-jones... \n",
|
||
"\n",
|
||
" rss \\\n",
|
||
"0 https://www.npr.org/rss/podcast.php?id=510289 \n",
|
||
"1 https://www.communitysignal.com/feed/ \n",
|
||
"2 https://www.planetbiometrics.com/rss/ \n",
|
||
"3 https://www.edtechmagazine.com/k12/rss.xml?tax... \n",
|
||
"4 https://www.ufc.com/rss/news \n",
|
||
"\n",
|
||
" title \\\n",
|
||
"0 The crisis pollsters face ahead of the midterm... \n",
|
||
"1 Embodying Work-Life Balance as a Community Pro... \n",
|
||
"2 TECH5 awarded a 15-year contract with Virginia... \n",
|
||
"3 How Can You Prepare Your Cloud to Safely Imple... \n",
|
||
"4 Updates To UFC 309: Jones vs Miocic \n",
|
||
"\n",
|
||
" content unixTime rss_id \\\n",
|
||
"0 Planet Money tries election polling\\n\\nEnlarge... 1731884660 8181d7a585 \n",
|
||
"1 Are you able to step away from your community ... 1731883672 b6cef58d91 \n",
|
||
"2 TECH5 has announced a major contract win with ... 1731883632 084b136c50 \n",
|
||
"3 Many K–12 schools that operate in the cloud ha... 1731883653 7827152faf \n",
|
||
"4 UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me... 1731883662 2c774cd014 \n",
|
||
"\n",
|
||
" article_id unixtime \n",
|
||
"0 05dfb6bb11 1731884660 \n",
|
||
"1 0418489c55 1731883672 \n",
|
||
"2 0bb2ec0554 1731883632 \n",
|
||
"3 07d39400e9 1731883653 \n",
|
||
"4 03f31af6e2 1731883662 "
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "base",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.9"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|