batch update

This commit is contained in:
Charles-Gormley
2024-11-10 21:18:48 -05:00
parent e798dce237
commit 758c2ccdde
20 changed files with 885 additions and 51 deletions

View File

@@ -0,0 +1,200 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RSS Article Batch Download Examples\n",
"\n",
"This notebook demonstrates how to batch download RSS articles from S3."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# All imports in one place: stdlib, then third-party, then project-local\n",
"import logging\n",
"import os\n",
"from datetime import datetime, timedelta\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"\n",
"from src.search import S3BatchDownloader\n",
"\n",
"# Configure logging so INFO-level progress from the downloader is visible\n",
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize the Downloader"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: set environment variables (placeholders -- replace with real values;\n",
"# never hardcode credentials here, only non-secret configuration)\n",
"os.environ['AWS_REGION'] = 'eu-west-3'\n",
"os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
"os.environ['RSS_PREFIX'] = 'articles/'\n",
"\n",
"# Initialize downloader (reads the environment variables above)\n",
"downloader = S3BatchDownloader()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Check Storage Statistics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get storage stats for the configured bucket/prefix\n",
"stats = downloader.get_storage_stats()\n",
"\n",
"print(f\"Total objects: {stats['total_objects']:,}\")\n",
"print(f\"Total size: {stats['total_size_mb']:.2f} MB\")\n",
"print(f\"Average object size: {stats['average_size_kb']:.2f} KB\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Download Examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download last 7 days of articles\n",
"start_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')\n",
"\n",
"recent_articles_path = downloader.download_to_csv(\n",
"    output_path='recent_articles.csv',\n",
"    start_date=start_date\n",
")\n",
"\n",
"# Load and display a sample (bare last expression uses rich display)\n",
"recent_df = pd.read_csv(recent_articles_path)\n",
"print(f\"\\nDownloaded {len(recent_df)} recent articles\")\n",
"recent_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download articles from a specific source prefix\n",
"tech_articles_path = downloader.download_to_csv(\n",
"    output_path='tech_articles.csv',\n",
"    prefix='articles/tech/'\n",
")\n",
"\n",
"tech_df = pd.read_csv(tech_articles_path)\n",
"print(f\"\\nDownloaded {len(tech_df)} tech articles\")\n",
"tech_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Analysis Examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Per-source article counts (guarded: 'source' column may be absent)\n",
"if 'source' in tech_df.columns:\n",
"    source_counts = tech_df['source'].value_counts()\n",
"\n",
"    print(\"\\nArticles per Source:\")\n",
"    print(source_counts)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Time-based analysis: daily article counts\n",
"if 'published_date' in tech_df.columns:\n",
"    # NOTE(review): overwrites the column in place with parsed datetimes\n",
"    tech_df['published_date'] = pd.to_datetime(tech_df['published_date'])\n",
"    daily_counts = tech_df.resample('D', on='published_date').size()\n",
"\n",
"    # Plot (local import kept so this cell stays self-contained)\n",
"    import matplotlib.pyplot as plt\n",
"\n",
"    plt.figure(figsize=(15, 6))\n",
"    daily_counts.plot(kind='bar')\n",
"    plt.title('Articles per Day')\n",
"    plt.xlabel('Date')\n",
"    plt.ylabel('Number of Articles')\n",
"    plt.xticks(rotation=45)\n",
"    plt.tight_layout()\n",
"    plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Export Filtered Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Filter and export AI-related articles\n",
"if 'title' in tech_df.columns and 'content' in tech_df.columns:\n",
"    # Word-bound 'AI' -- a bare case-insensitive 'AI' would match the\n",
"    # substring 'ai' inside ordinary words ('said', 'maintain', 'air')\n",
"    pattern = r'\\bAI\\b|artificial intelligence'\n",
"    ai_articles = tech_df[\n",
"        tech_df['title'].str.contains(pattern, case=False, na=False) |\n",
"        tech_df['content'].str.contains(pattern, case=False, na=False)\n",
"    ]\n",
"\n",
"    # Export filtered data\n",
"    ai_articles.to_csv('ai_articles.csv', index=False)\n",
"    print(f\"\\nExported {len(ai_articles)} AI-related articles to ai_articles.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Cleanup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: remove downloaded CSV files ('os' is imported in the first cell)\n",
"for file in ['recent_articles.csv', 'tech_articles.csv', 'ai_articles.csv']:\n",
"    if os.path.exists(file):\n",
"        os.remove(file)\n",
"        print(f\"Removed {file}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}