mirror of https://github.com/aljazceru/IngestRSS.git
synced 2025-12-17 14:04:19 +01:00
batch update
This commit is contained in:
example-notebooks/batch-notebook.ipynb | 200 (new file)
@@ -0,0 +1,200 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RSS Article Batch Download Examples\n",
    "\n",
    "This notebook demonstrates how to batch download RSS articles from S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import os\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "from src.search import S3BatchDownloader\n",
    "\n",
    "# Configure logging\n",
    "logging.basicConfig(level=logging.INFO)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize the Downloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: set environment variables before creating the downloader\n",
    "os.environ['AWS_REGION'] = 'eu-west-3'\n",
    "os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
    "os.environ['RSS_PREFIX'] = 'articles/'\n",
    "\n",
    "# Initialize the downloader\n",
    "downloader = S3BatchDownloader()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Check Storage Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get storage stats for the configured bucket/prefix\n",
    "stats = downloader.get_storage_stats()\n",
    "\n",
    "print(f\"Total objects: {stats['total_objects']:,}\")\n",
    "print(f\"Total size: {stats['total_size_mb']:.2f} MB\")\n",
    "print(f\"Average object size: {stats['average_size_kb']:.2f} KB\")"
   ]
  },
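  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional sketch: inspect the full stats payload. This assumes `get_storage_stats()` returns a plain, JSON-serializable dict, as the numeric keys used above suggest."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assumption: stats is a plain dict of JSON-serializable values\n",
    "import json\n",
    "\n",
    "print(json.dumps(stats, indent=2))"
   ]
  },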
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Download Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the last 7 days of articles\n",
    "start_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')\n",
    "\n",
    "recent_articles_path = downloader.download_to_csv(\n",
    "    output_path='recent_articles.csv',\n",
    "    start_date=start_date\n",
    ")\n",
    "\n",
    "# Load and display a sample\n",
    "recent_df = pd.read_csv(recent_articles_path)\n",
    "print(f\"\\nDownloaded {len(recent_df)} recent articles\")\n",
    "recent_df.head()"
   ]
  },
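  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`download_to_csv` is only shown accepting a `start_date`, so a tighter window can be applied locally with pandas. The sketch below assumes the CSV has a parseable `published_date` column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: narrow the date window locally with pandas.\n",
    "# Assumption: the CSV has a parseable 'published_date' column.\n",
    "if 'published_date' in recent_df.columns:\n",
    "    dates = pd.to_datetime(recent_df['published_date'], errors='coerce', utc=True)\n",
    "    cutoff = pd.Timestamp.now(tz='UTC') - pd.Timedelta(days=2)\n",
    "    last_48h = recent_df[dates >= cutoff]\n",
    "    print(f\"{len(last_48h)} articles from the last 48 hours\")"
   ]
  },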
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download articles under a specific source prefix\n",
    "tech_articles_path = downloader.download_to_csv(\n",
    "    output_path='tech_articles.csv',\n",
    "    prefix='articles/tech/'\n",
    ")\n",
    "\n",
    "tech_df = pd.read_csv(tech_articles_path)\n",
    "print(f\"\\nDownloaded {len(tech_df)} tech articles\")\n",
    "tech_df.head()"
   ]
  },
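  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch for merging both downloads into a single frame. An article can appear in both CSVs, so it dedupes on whichever of the (assumed) `url`/`title` columns exist."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: combine both downloads and drop duplicates.\n",
    "# Assumption: 'url' and/or 'title' columns identify an article.\n",
    "combined = pd.concat([recent_df, tech_df], ignore_index=True)\n",
    "keys = [c for c in ('url', 'title') if c in combined.columns]\n",
    "if keys:\n",
    "    combined = combined.drop_duplicates(subset=keys)\n",
    "print(f\"{len(combined)} unique articles combined\")"
   ]
  },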
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Analysis Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze the downloaded data\n",
    "if 'source' in tech_df.columns:\n",
    "    source_counts = tech_df['source'].value_counts()\n",
    "\n",
    "    print(\"\\nArticles per Source:\")\n",
    "    print(source_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Time-based analysis\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "if 'published_date' in tech_df.columns:\n",
    "    tech_df['published_date'] = pd.to_datetime(tech_df['published_date'])\n",
    "    daily_counts = tech_df.resample('D', on='published_date').size()\n",
    "\n",
    "    # Plot articles per day\n",
    "    plt.figure(figsize=(15, 6))\n",
    "    daily_counts.plot(kind='bar')\n",
    "    plt.title('Articles per Day')\n",
    "    plt.xlabel('Date')\n",
    "    plt.ylabel('Number of Articles')\n",
    "    plt.xticks(rotation=45)\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
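  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Daily bar charts get noisy over longer windows. The sketch below smooths the same `daily_counts` series with a 7-day rolling mean; it only adds signal once more than a week of data is present."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: smooth the daily counts with a 7-day rolling mean\n",
    "# (daily_counts is defined by the previous cell under the same guard)\n",
    "if 'published_date' in tech_df.columns:\n",
    "    plt.figure(figsize=(15, 4))\n",
    "    daily_counts.rolling(window=7, min_periods=1).mean().plot()\n",
    "    plt.title('Articles per Day (7-day rolling mean)')\n",
    "    plt.ylabel('Articles')\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },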
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Export Filtered Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter and export specific data\n",
    "if 'title' in tech_df.columns and 'content' in tech_df.columns:\n",
    "    # Match whole-word 'AI' (the word boundary avoids hits inside\n",
    "    # words like 'said') or the phrase 'artificial intelligence'\n",
    "    pattern = r'\\bAI\\b|artificial intelligence'\n",
    "    ai_articles = tech_df[\n",
    "        tech_df['title'].str.contains(pattern, case=False, na=False) |\n",
    "        tech_df['content'].str.contains(pattern, case=False, na=False)\n",
    "    ]\n",
    "\n",
    "    # Export filtered data\n",
    "    ai_articles.to_csv('ai_articles.csv', index=False)\n",
    "    print(f\"\\nExported {len(ai_articles)} AI-related articles to ai_articles.csv\")"
   ]
  },
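  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check on the keyword filter, the sketch below breaks the exported subset down by source, assuming the optional `source` column is present."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: sanity-check the filter with a per-source breakdown.\n",
    "# Assumes the previous cell defined ai_articles and that the\n",
    "# optional 'source' column is present.\n",
    "if 'ai_articles' in globals() and 'source' in ai_articles.columns:\n",
    "    print(ai_articles['source'].value_counts().head(10))"
   ]
  },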
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Cleanup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: remove the downloaded CSV files\n",
    "for file in ['recent_articles.csv', 'tech_articles.csv', 'ai_articles.csv']:\n",
    "    if os.path.exists(file):\n",
    "        os.remove(file)\n",
    "        print(f\"Removed {file}\")"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}