mirror of
https://github.com/aljazceru/IngestRSS.git
synced 2025-12-17 14:04:19 +01:00
302 lines
9.3 KiB
Plaintext
302 lines
9.3 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# RSS Article Batch Download Examples\n",
|
||
"\n",
|
||
"This notebook demonstrates how to batch download RSS articles from S3."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"import sys\n",
|
||
"import pandas as pd\n",
|
||
"from time import time\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Root imports: temporarily switch to the parent directory for setup\n",
"# (presumably the repository root, so that `src` resolves -- confirm).\n",
"notebook_dir = os.getcwd()\n",
"os.chdir('..')\n",
"try:\n",
"    from src.search.batch import S3BatchDownloader\n",
"    from dotenv import load_dotenv\n",
"\n",
"    load_dotenv(override=True)\n",
"\n",
"    downloader = S3BatchDownloader()\n",
"finally:\n",
"    # Restore the starting directory even when setup raises; otherwise a\n",
"    # re-run of this cell would move up one more directory each time.\n",
"    os.chdir(notebook_dir)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Verify S3 List Permission"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"List permission: Allowed\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import boto3\n",
|
||
"from botocore.exceptions import ClientError\n",
|
||
"\n",
|
||
"s3 = boto3.client('s3')\n",
"\n",
"# Test ListObjects. MaxKeys=1 keeps the probe cheap: a single key is\n",
"# enough to show whether listing is permitted.\n",
"try:\n",
"    response = s3.list_objects_v2(Bucket='open-rss-articles-us-east-1', MaxKeys=1)\n",
"    print(\"List permission: Allowed\")\n",
"except ClientError as e:\n",
"    # Include the service error code so that other client errors are not\n",
"    # mistaken for a plain permission denial.\n",
"    error_code = e.response.get(\"Error\", {}).get(\"Code\", \"Unknown\")\n",
"    print(f\"List permission: Denied ({error_code})\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Downloaded data to consolidated_data.csv in 59.01 seconds\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"start = time()\n",
"\n",
"# Choose the format once; the output file name is built from it so the\n",
"# extension and the requested format cannot drift apart.\n",
"file_format = \"csv\"  # or \"json\"\n",
"output_path = f\"consolidated_data.{file_format}\"\n",
"\n",
"# Define date range\n",
"start_date = \"2024-11-17\"\n",
"end_date = \"2024-11-20\"\n",
"\n",
"# Start downloading\n",
"downloader.download_to_file(\n",
"    output_path=output_path,\n",
"    file_format=file_format,\n",
"    start_date=start_date,\n",
"    end_date=end_date\n",
")\n",
"\n",
"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Read the ID columns as text so that an all-digit ID keeps its leading\n",
"# zeros instead of being parsed as a number.\n",
"df = pd.read_csv(output_path, dtype={\"rss_id\": str, \"article_id\": str})"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>link</th>\n",
|
||
" <th>rss</th>\n",
|
||
" <th>title</th>\n",
|
||
" <th>content</th>\n",
|
||
" <th>unixTime</th>\n",
|
||
" <th>rss_id</th>\n",
|
||
" <th>article_id</th>\n",
|
||
" <th>llm_summary</th>\n",
|
||
" <th>embedding</th>\n",
|
||
" <th>unixtime</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>https://www.cnet.com/home/internet/comporium-i...</td>\n",
|
||
" <td>https://www.cnet.com/rss/news/</td>\n",
|
||
" <td>Comporium Home Internet: Pricing, Speeds and A...</td>\n",
|
||
" <td>Unavailable in Provider unavailable in 90001 E...</td>\n",
|
||
" <td>1731883654</td>\n",
|
||
" <td>f5e6f52c79</td>\n",
|
||
" <td>00089f7505</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1731883654</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>https://kotaku.com/lady-gaga-wednesday-season-...</td>\n",
|
||
" <td>https://kotaku.com/rss</td>\n",
|
||
" <td>Lady Gaga Appearing In Wednesday Season 2 Is A...</td>\n",
|
||
" <td>The next stop on Lady Gaga’s years-long tour t...</td>\n",
|
||
" <td>1731883665</td>\n",
|
||
" <td>a0840ab3b4</td>\n",
|
||
" <td>009f880a86</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1731883665</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>https://expeditionportal.com/classifieds-honda...</td>\n",
|
||
" <td>https://www.expeditionportal.com/feed/</td>\n",
|
||
" <td>2008 Honda Element EX ECamper :: Classifieds</td>\n",
|
||
" <td>Overlanding is about experiences first and for...</td>\n",
|
||
" <td>1731883665</td>\n",
|
||
" <td>ecdc66bb02</td>\n",
|
||
" <td>00ac77e95f</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1731883665</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>http://www.influencerupdate.biz/news/68624/twi...</td>\n",
|
||
" <td>https://www.influencerupdate.biz/rss/news/</td>\n",
|
||
" <td>Twitch is testing mid-roll ads on creator broa...</td>\n",
|
||
" <td>Streaming giant Twitch is planning to test mid...</td>\n",
|
||
" <td>1731883669</td>\n",
|
||
" <td>e34caba76d</td>\n",
|
||
" <td>0041bc4abf</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1731883669</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>https://thejournal.com/Articles/2024/10/29/Rep...</td>\n",
|
||
" <td>https://thejournal.com/rss-feeds/news.aspx</td>\n",
|
||
" <td>Report: 90% of Schools Depend on E-rate Fundin...</td>\n",
|
||
" <td>Report: 90% of Schools Depend on E-rate Fundin...</td>\n",
|
||
" <td>1731883616</td>\n",
|
||
" <td>efd9bb9654</td>\n",
|
||
" <td>000a3da3b6</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1731883616</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" link \\\n",
|
||
"0 https://www.cnet.com/home/internet/comporium-i... \n",
|
||
"1 https://kotaku.com/lady-gaga-wednesday-season-... \n",
|
||
"2 https://expeditionportal.com/classifieds-honda... \n",
|
||
"3 http://www.influencerupdate.biz/news/68624/twi... \n",
|
||
"4 https://thejournal.com/Articles/2024/10/29/Rep... \n",
|
||
"\n",
|
||
" rss \\\n",
|
||
"0 https://www.cnet.com/rss/news/ \n",
|
||
"1 https://kotaku.com/rss \n",
|
||
"2 https://www.expeditionportal.com/feed/ \n",
|
||
"3 https://www.influencerupdate.biz/rss/news/ \n",
|
||
"4 https://thejournal.com/rss-feeds/news.aspx \n",
|
||
"\n",
|
||
" title \\\n",
|
||
"0 Comporium Home Internet: Pricing, Speeds and A... \n",
|
||
"1 Lady Gaga Appearing In Wednesday Season 2 Is A... \n",
|
||
"2 2008 Honda Element EX ECamper :: Classifieds \n",
|
||
"3 Twitch is testing mid-roll ads on creator broa... \n",
|
||
"4 Report: 90% of Schools Depend on E-rate Fundin... \n",
|
||
"\n",
|
||
" content unixTime rss_id \\\n",
|
||
"0 Unavailable in Provider unavailable in 90001 E... 1731883654 f5e6f52c79 \n",
|
||
"1 The next stop on Lady Gaga’s years-long tour t... 1731883665 a0840ab3b4 \n",
|
||
"2 Overlanding is about experiences first and for... 1731883665 ecdc66bb02 \n",
|
||
"3 Streaming giant Twitch is planning to test mid... 1731883669 e34caba76d \n",
|
||
"4 Report: 90% of Schools Depend on E-rate Fundin... 1731883616 efd9bb9654 \n",
|
||
"\n",
|
||
" article_id llm_summary embedding unixtime \n",
|
||
"0 00089f7505 NaN NaN 1731883654 \n",
|
||
"1 009f880a86 NaN NaN 1731883665 \n",
|
||
"2 00ac77e95f NaN NaN 1731883665 \n",
|
||
"3 0041bc4abf NaN NaN 1731883669 \n",
|
||
"4 000a3da3b6 NaN NaN 1731883616 "
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 1. Check Storage Statistics"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "base",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.9"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|