{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RSS Article Batch Download Examples\n",
    "\n",
    "This notebook demonstrates how to batch download RSS articles from S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports in one cell: stdlib -> third-party -> local\n",
    "import logging\n",
    "import os\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "\n",
    "from src.search import S3BatchDownloader\n",
    "\n",
    "# Configure logging\n",
    "logging.basicConfig(level=logging.INFO)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize the Downloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: Set environment variables\n",
    "os.environ['AWS_REGION'] = 'eu-west-3'\n",
    "os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
    "os.environ['RSS_PREFIX'] = 'articles/'\n",
    "\n",
    "# Initialize downloader\n",
    "downloader = S3BatchDownloader()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Check Storage Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get storage stats\n",
    "stats = downloader.get_storage_stats()\n",
    "\n",
    "print(f\"Total objects: {stats['total_objects']:,}\")\n",
    "print(f\"Total size: {stats['total_size_mb']:.2f} MB\")\n",
    "print(f\"Average object size: {stats['average_size_kb']:.2f} KB\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Download Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download last 7 days of articles\n",
    "start_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')\n",
    "\n",
    "recent_articles_path = downloader.download_to_csv(\n",
    "    output_path='recent_articles.csv',\n",
    "    start_date=start_date\n",
    ")\n",
    "\n",
    "# Load and display sample\n",
    "recent_df = pd.read_csv(recent_articles_path)\n",
    "print(f\"\\nDownloaded {len(recent_df)} recent articles\")\n",
    "recent_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download articles from specific source prefix\n",
    "tech_articles_path = downloader.download_to_csv(\n",
    "    output_path='tech_articles.csv',\n",
    "    prefix='articles/tech/'\n",
    ")\n",
    "\n",
    "tech_df = pd.read_csv(tech_articles_path)\n",
    "print(f\"\\nDownloaded {len(tech_df)} tech articles\")\n",
    "tech_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Analysis Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze downloaded data\n",
    "if 'source' in tech_df.columns:\n",
    "    source_counts = tech_df['source'].value_counts()\n",
    "\n",
    "    print(\"\\nArticles per Source:\")\n",
    "    print(source_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Time-based analysis: count articles per publication day\n",
    "if 'published_date' in tech_df.columns:\n",
    "    tech_df['published_date'] = pd.to_datetime(tech_df['published_date'])\n",
    "    daily_counts = tech_df.resample('D', on='published_date').size()\n",
    "\n",
    "    # Plot\n",
    "    plt.figure(figsize=(15, 6))\n",
    "    daily_counts.plot(kind='bar')\n",
    "    plt.title('Articles per Day')\n",
    "    plt.xlabel('Date')\n",
    "    plt.ylabel('Number of Articles')\n",
    "    plt.xticks(rotation=45)\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Export Filtered Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter and export specific data\n",
    "if 'title' in tech_df.columns and 'content' in tech_df.columns:\n",
    "    # Filter articles containing specific keywords.\n",
    "    # NOTE: \\b word boundary around 'AI' prevents matching the substring\n",
    "    # 'ai' inside ordinary words (e.g. 'said', 'rain') under case=False.\n",
    "    ai_pattern = r'\\bAI\\b|artificial intelligence'\n",
    "    ai_articles = tech_df[\n",
    "        tech_df['title'].str.contains(ai_pattern, case=False, na=False) |\n",
    "        tech_df['content'].str.contains(ai_pattern, case=False, na=False)\n",
    "    ]\n",
    "\n",
    "    # Export filtered data\n",
    "    ai_articles.to_csv('ai_articles.csv', index=False)\n",
    "    print(f\"\\nExported {len(ai_articles)} AI-related articles to ai_articles.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Cleanup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: Remove downloaded CSV files\n",
    "for file in ['recent_articles.csv', 'tech_articles.csv', 'ai_articles.csv']:\n",
    "    if os.path.exists(file):\n",
    "        os.remove(file)\n",
    "        print(f\"Removed {file}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}