mirror of https://github.com/aljazceru/IngestRSS.git
synced 2025-12-17 14:04:19 +01:00
batch update
This commit is contained in:
example-notebooks/batch-notebook.ipynb | 200 (new file)
@@ -0,0 +1,200 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RSS Article Batch Download Examples\n",
    "\n",
    "This notebook demonstrates how to batch download RSS articles from S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import os\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "from src.search import S3BatchDownloader\n",
    "\n",
    "# Configure logging\n",
    "logging.basicConfig(level=logging.INFO)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize the Downloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: set environment variables before creating the downloader\n",
    "os.environ['AWS_REGION'] = 'eu-west-3'\n",
    "os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
    "os.environ['RSS_PREFIX'] = 'articles/'\n",
    "\n",
    "# Initialize the downloader\n",
    "downloader = S3BatchDownloader()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Check Storage Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get storage stats for the configured bucket/prefix\n",
    "stats = downloader.get_storage_stats()\n",
    "\n",
    "print(f\"Total objects: {stats['total_objects']:,}\")\n",
    "print(f\"Total size: {stats['total_size_mb']:.2f} MB\")\n",
    "print(f\"Average object size: {stats['average_size_kb']:.2f} KB\")"
   ]
  },
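  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional sketch: inspect the full stats payload. This assumes `get_storage_stats()` returns a plain, JSON-serializable dict, as the numeric keys used above suggest."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assumption: stats is a plain dict of JSON-serializable values\n",
    "import json\n",
    "\n",
    "print(json.dumps(stats, indent=2))"
   ]
  },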
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Download Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the last 7 days of articles\n",
    "start_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')\n",
    "\n",
    "recent_articles_path = downloader.download_to_csv(\n",
    "    output_path='recent_articles.csv',\n",
    "    start_date=start_date\n",
    ")\n",
    "\n",
    "# Load and display a sample\n",
    "recent_df = pd.read_csv(recent_articles_path)\n",
    "print(f\"\\nDownloaded {len(recent_df)} recent articles\")\n",
    "recent_df.head()"
   ]
  },
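  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`download_to_csv` is only shown accepting a `start_date`, so a tighter window can be applied locally with pandas. The sketch below assumes the CSV has a parseable `published_date` column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: narrow the date window locally with pandas.\n",
    "# Assumption: the CSV has a parseable 'published_date' column.\n",
    "if 'published_date' in recent_df.columns:\n",
    "    dates = pd.to_datetime(recent_df['published_date'], errors='coerce', utc=True)\n",
    "    cutoff = pd.Timestamp.now(tz='UTC') - pd.Timedelta(days=2)\n",
    "    last_48h = recent_df[dates >= cutoff]\n",
    "    print(f\"{len(last_48h)} articles from the last 48 hours\")"
   ]
  },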
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download articles under a specific source prefix\n",
    "tech_articles_path = downloader.download_to_csv(\n",
    "    output_path='tech_articles.csv',\n",
    "    prefix='articles/tech/'\n",
    ")\n",
    "\n",
    "tech_df = pd.read_csv(tech_articles_path)\n",
    "print(f\"\\nDownloaded {len(tech_df)} tech articles\")\n",
    "tech_df.head()"
   ]
  },
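  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch for merging both downloads into a single frame. An article can appear in both CSVs, so it dedupes on whichever of the (assumed) `url`/`title` columns exist."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: combine both downloads and drop duplicates.\n",
    "# Assumption: 'url' and/or 'title' columns identify an article.\n",
    "combined = pd.concat([recent_df, tech_df], ignore_index=True)\n",
    "keys = [c for c in ('url', 'title') if c in combined.columns]\n",
    "if keys:\n",
    "    combined = combined.drop_duplicates(subset=keys)\n",
    "print(f\"{len(combined)} unique articles combined\")"
   ]
  },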
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Analysis Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze the downloaded data\n",
    "if 'source' in tech_df.columns:\n",
    "    source_counts = tech_df['source'].value_counts()\n",
    "\n",
    "    print(\"\\nArticles per Source:\")\n",
    "    print(source_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Time-based analysis\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "if 'published_date' in tech_df.columns:\n",
    "    tech_df['published_date'] = pd.to_datetime(tech_df['published_date'])\n",
    "    daily_counts = tech_df.resample('D', on='published_date').size()\n",
    "\n",
    "    # Plot articles per day\n",
    "    plt.figure(figsize=(15, 6))\n",
    "    daily_counts.plot(kind='bar')\n",
    "    plt.title('Articles per Day')\n",
    "    plt.xlabel('Date')\n",
    "    plt.ylabel('Number of Articles')\n",
    "    plt.xticks(rotation=45)\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
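  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Daily bar charts get noisy over longer windows. The sketch below smooths the same `daily_counts` series with a 7-day rolling mean; it only adds signal once more than a week of data is present."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: smooth the daily counts with a 7-day rolling mean\n",
    "# (daily_counts is defined by the previous cell under the same guard)\n",
    "if 'published_date' in tech_df.columns:\n",
    "    plt.figure(figsize=(15, 4))\n",
    "    daily_counts.rolling(window=7, min_periods=1).mean().plot()\n",
    "    plt.title('Articles per Day (7-day rolling mean)')\n",
    "    plt.ylabel('Articles')\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },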
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Export Filtered Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter and export specific data\n",
    "if 'title' in tech_df.columns and 'content' in tech_df.columns:\n",
    "    # Match whole-word 'AI' (the word boundary avoids hits inside\n",
    "    # words like 'said') or the phrase 'artificial intelligence'\n",
    "    pattern = r'\\bAI\\b|artificial intelligence'\n",
    "    ai_articles = tech_df[\n",
    "        tech_df['title'].str.contains(pattern, case=False, na=False) |\n",
    "        tech_df['content'].str.contains(pattern, case=False, na=False)\n",
    "    ]\n",
    "\n",
    "    # Export filtered data\n",
    "    ai_articles.to_csv('ai_articles.csv', index=False)\n",
    "    print(f\"\\nExported {len(ai_articles)} AI-related articles to ai_articles.csv\")"
   ]
  },
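  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check on the keyword filter, the sketch below breaks the exported subset down by source, assuming the optional `source` column is present."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: sanity-check the filter with a per-source breakdown.\n",
    "# Assumes the previous cell defined ai_articles and that the\n",
    "# optional 'source' column is present.\n",
    "if 'ai_articles' in globals() and 'source' in ai_articles.columns:\n",
    "    print(ai_articles['source'].value_counts().head(10))"
   ]
  },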
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Cleanup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: remove the downloaded CSV files\n",
    "for file in ['recent_articles.csv', 'tech_articles.csv', 'ai_articles.csv']:\n",
    "    if os.path.exists(file):\n",
    "        os.remove(file)\n",
    "        print(f\"Removed {file}\")"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}