mirror of
https://github.com/aljazceru/IngestRSS.git
synced 2025-12-17 05:54:22 +01:00
sped up batch ingestion of articles.
This commit is contained in:
@@ -11,17 +11,33 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"import pandas as pd\n",
|
||||
"from datetime import datetime, timedelta\n",
|
||||
"from src.search import S3BatchDownloader\n",
|
||||
"from time import time\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Root imports\n",
|
||||
"os.chdir('..')\n",
|
||||
"\n",
|
||||
"# Configure logging\n",
|
||||
"import logging\n",
|
||||
"logging.basicConfig(level=logging.INFO)"
|
||||
"from src.search.batch import S3BatchDownloader\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"\n",
|
||||
"downloader = S3BatchDownloader()\n",
|
||||
"\n",
|
||||
"os.chdir('example-notebooks')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -31,18 +47,226 @@
|
||||
"## Initialize the Downloader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"List permission: Allowed\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import boto3\n",
|
||||
"from botocore.exceptions import ClientError\n",
|
||||
"\n",
|
||||
"s3 = boto3.client('s3')\n",
|
||||
"\n",
|
||||
"# Test ListObjects\n",
|
||||
"try:\n",
|
||||
" response = s3.list_objects_v2(Bucket='open-rss-articles-us-east-1')\n",
|
||||
" print(\"List permission: Allowed\")\n",
|
||||
"except ClientError as e:\n",
|
||||
" print(\"List permission: Denied\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloaded data to consolidated_data.csv in 59.01 seconds\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Optional: Set environment variables\n",
|
||||
"os.environ['AWS_REGION'] = 'eu-west-3'\n",
|
||||
"os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
|
||||
"os.environ['RSS_PREFIX'] = 'articles/'\n",
|
||||
"start = time()\n",
|
||||
"output_path = \"consolidated_data.csv\" # or \"consolidated_data.json\"\n",
|
||||
"\n",
|
||||
"# Initialize downloader\n",
|
||||
"downloader = S3BatchDownloader()"
|
||||
"# Define date range\n",
|
||||
"start_date = \"2024-11-17\"\n",
|
||||
"end_date = \"2024-11-20\"\n",
|
||||
"\n",
|
||||
"# Start downloading\n",
|
||||
"downloader.download_to_file(\n",
|
||||
" output_path=output_path,\n",
|
||||
" file_format=\"csv\", # or \"json\"\n",
|
||||
" start_date=start_date,\n",
|
||||
" end_date=end_date\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = pd.read_csv(output_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>link</th>\n",
|
||||
" <th>rss</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>content</th>\n",
|
||||
" <th>unixTime</th>\n",
|
||||
" <th>rss_id</th>\n",
|
||||
" <th>article_id</th>\n",
|
||||
" <th>llm_summary</th>\n",
|
||||
" <th>embedding</th>\n",
|
||||
" <th>unixtime</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>https://www.cnet.com/home/internet/comporium-i...</td>\n",
|
||||
" <td>https://www.cnet.com/rss/news/</td>\n",
|
||||
" <td>Comporium Home Internet: Pricing, Speeds and A...</td>\n",
|
||||
" <td>Unavailable in Provider unavailable in 90001 E...</td>\n",
|
||||
" <td>1731883654</td>\n",
|
||||
" <td>f5e6f52c79</td>\n",
|
||||
" <td>00089f7505</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1731883654</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>https://kotaku.com/lady-gaga-wednesday-season-...</td>\n",
|
||||
" <td>https://kotaku.com/rss</td>\n",
|
||||
" <td>Lady Gaga Appearing In Wednesday Season 2 Is A...</td>\n",
|
||||
" <td>The next stop on Lady Gaga’s years-long tour t...</td>\n",
|
||||
" <td>1731883665</td>\n",
|
||||
" <td>a0840ab3b4</td>\n",
|
||||
" <td>009f880a86</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1731883665</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>https://expeditionportal.com/classifieds-honda...</td>\n",
|
||||
" <td>https://www.expeditionportal.com/feed/</td>\n",
|
||||
" <td>2008 Honda Element EX ECamper :: Classifieds</td>\n",
|
||||
" <td>Overlanding is about experiences first and for...</td>\n",
|
||||
" <td>1731883665</td>\n",
|
||||
" <td>ecdc66bb02</td>\n",
|
||||
" <td>00ac77e95f</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1731883665</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>http://www.influencerupdate.biz/news/68624/twi...</td>\n",
|
||||
" <td>https://www.influencerupdate.biz/rss/news/</td>\n",
|
||||
" <td>Twitch is testing mid-roll ads on creator broa...</td>\n",
|
||||
" <td>Streaming giant Twitch is planning to test mid...</td>\n",
|
||||
" <td>1731883669</td>\n",
|
||||
" <td>e34caba76d</td>\n",
|
||||
" <td>0041bc4abf</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1731883669</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>https://thejournal.com/Articles/2024/10/29/Rep...</td>\n",
|
||||
" <td>https://thejournal.com/rss-feeds/news.aspx</td>\n",
|
||||
" <td>Report: 90% of Schools Depend on E-rate Fundin...</td>\n",
|
||||
" <td>Report: 90% of Schools Depend on E-rate Fundin...</td>\n",
|
||||
" <td>1731883616</td>\n",
|
||||
" <td>efd9bb9654</td>\n",
|
||||
" <td>000a3da3b6</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1731883616</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" link \\\n",
|
||||
"0 https://www.cnet.com/home/internet/comporium-i... \n",
|
||||
"1 https://kotaku.com/lady-gaga-wednesday-season-... \n",
|
||||
"2 https://expeditionportal.com/classifieds-honda... \n",
|
||||
"3 http://www.influencerupdate.biz/news/68624/twi... \n",
|
||||
"4 https://thejournal.com/Articles/2024/10/29/Rep... \n",
|
||||
"\n",
|
||||
" rss \\\n",
|
||||
"0 https://www.cnet.com/rss/news/ \n",
|
||||
"1 https://kotaku.com/rss \n",
|
||||
"2 https://www.expeditionportal.com/feed/ \n",
|
||||
"3 https://www.influencerupdate.biz/rss/news/ \n",
|
||||
"4 https://thejournal.com/rss-feeds/news.aspx \n",
|
||||
"\n",
|
||||
" title \\\n",
|
||||
"0 Comporium Home Internet: Pricing, Speeds and A... \n",
|
||||
"1 Lady Gaga Appearing In Wednesday Season 2 Is A... \n",
|
||||
"2 2008 Honda Element EX ECamper :: Classifieds \n",
|
||||
"3 Twitch is testing mid-roll ads on creator broa... \n",
|
||||
"4 Report: 90% of Schools Depend on E-rate Fundin... \n",
|
||||
"\n",
|
||||
" content unixTime rss_id \\\n",
|
||||
"0 Unavailable in Provider unavailable in 90001 E... 1731883654 f5e6f52c79 \n",
|
||||
"1 The next stop on Lady Gaga’s years-long tour t... 1731883665 a0840ab3b4 \n",
|
||||
"2 Overlanding is about experiences first and for... 1731883665 ecdc66bb02 \n",
|
||||
"3 Streaming giant Twitch is planning to test mid... 1731883669 e34caba76d \n",
|
||||
"4 Report: 90% of Schools Depend on E-rate Fundin... 1731883616 efd9bb9654 \n",
|
||||
"\n",
|
||||
" article_id llm_summary embedding unixtime \n",
|
||||
"0 00089f7505 NaN NaN 1731883654 \n",
|
||||
"1 009f880a86 NaN NaN 1731883665 \n",
|
||||
"2 00ac77e95f NaN NaN 1731883665 \n",
|
||||
"3 0041bc4abf NaN NaN 1731883669 \n",
|
||||
"4 000a3da3b6 NaN NaN 1731883616 "
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -51,150 +275,27 @@
|
||||
"source": [
|
||||
"## 1. Check Storage Statistics"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Get storage stats\n",
|
||||
"stats = downloader.get_storage_stats()\n",
|
||||
"\n",
|
||||
"print(f\"Total objects: {stats['total_objects']:,}\")\n",
|
||||
"print(f\"Total size: {stats['total_size_mb']:.2f} MB\")\n",
|
||||
"print(f\"Average object size: {stats['average_size_kb']:.2f} KB\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 2. Download Examples"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Download last 7 days of articles\n",
|
||||
"start_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')\n",
|
||||
"\n",
|
||||
"recent_articles_path = downloader.download_to_csv(\n",
|
||||
" output_path='recent_articles.csv',\n",
|
||||
" start_date=start_date\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Load and display sample\n",
|
||||
"recent_df = pd.read_csv(recent_articles_path)\n",
|
||||
"print(f\"\\nDownloaded {len(recent_df)} recent articles\")\n",
|
||||
"recent_df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Download articles from specific source prefix\n",
|
||||
"tech_articles_path = downloader.download_to_csv(\n",
|
||||
" output_path='tech_articles.csv',\n",
|
||||
" prefix='articles/tech/'\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"tech_df = pd.read_csv(tech_articles_path)\n",
|
||||
"print(f\"\\nDownloaded {len(tech_df)} tech articles\")\n",
|
||||
"tech_df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 3. Analysis Examples"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Analyze downloaded data\n",
|
||||
"if 'source' in tech_df.columns:\n",
|
||||
" source_counts = tech_df['source'].value_counts()\n",
|
||||
" \n",
|
||||
" print(\"\\nArticles per Source:\")\n",
|
||||
" print(source_counts)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Time-based analysis\n",
|
||||
"if 'published_date' in tech_df.columns:\n",
|
||||
" tech_df['published_date'] = pd.to_datetime(tech_df['published_date'])\n",
|
||||
" daily_counts = tech_df.resample('D', on='published_date').size()\n",
|
||||
" \n",
|
||||
" # Plot\n",
|
||||
" import matplotlib.pyplot as plt\n",
|
||||
" \n",
|
||||
" plt.figure(figsize=(15, 6))\n",
|
||||
" daily_counts.plot(kind='bar')\n",
|
||||
" plt.title('Articles per Day')\n",
|
||||
" plt.xlabel('Date')\n",
|
||||
" plt.ylabel('Number of Articles')\n",
|
||||
" plt.xticks(rotation=45)\n",
|
||||
" plt.tight_layout()\n",
|
||||
" plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 4. Export Filtered Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Filter and export specific data\n",
|
||||
"if 'title' in tech_df.columns and 'content' in tech_df.columns:\n",
|
||||
" # Filter articles containing specific keywords\n",
|
||||
" ai_articles = tech_df[\n",
|
||||
" tech_df['title'].str.contains('AI|artificial intelligence', case=False, na=False) |\n",
|
||||
" tech_df['content'].str.contains('AI|artificial intelligence', case=False, na=False)\n",
|
||||
" ]\n",
|
||||
" \n",
|
||||
" # Export filtered data\n",
|
||||
" ai_articles.to_csv('ai_articles.csv', index=False)\n",
|
||||
" print(f\"\\nExported {len(ai_articles)} AI-related articles to ai_articles.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 5. Cleanup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Optional: Remove downloaded CSV files\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"for file in ['recent_articles.csv', 'tech_articles.csv', 'ai_articles.csv']:\n",
|
||||
" if os.path.exists(file):\n",
|
||||
" os.remove(file)\n",
|
||||
" print(f\"Removed {file}\")"
|
||||
]
|
||||
}
|
||||
]
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user