{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RSS Article Batch Download Examples\n",
    "\n",
    "This notebook demonstrates how to batch download RSS articles from S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports in one cell: stdlib -> third-party -> local\n",
    "import logging\n",
    "import os\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "\n",
    "from src.search import S3BatchDownloader\n",
    "\n",
    "# Configure logging\n",
    "logging.basicConfig(level=logging.INFO)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize the Downloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: Set environment variables\n",
    "os.environ['AWS_REGION'] = 'eu-west-3'\n",
    "os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
    "os.environ['RSS_PREFIX'] = 'articles/'\n",
    "\n",
    "# Initialize downloader\n",
    "downloader = S3BatchDownloader()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Check Storage Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get storage stats\n",
    "stats = downloader.get_storage_stats()\n",
    "\n",
    "print(f\"Total objects: {stats['total_objects']:,}\")\n",
    "print(f\"Total size: {stats['total_size_mb']:.2f} MB\")\n",
    "print(f\"Average object size: {stats['average_size_kb']:.2f} KB\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Download Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download last 7 days of articles\n",
    "start_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')\n",
    "\n",
    "recent_articles_path = downloader.download_to_csv(\n",
    "    output_path='recent_articles.csv',\n",
    "    start_date=start_date\n",
    ")\n",
    "\n",
    "# Load and display sample\n",
    "recent_df = pd.read_csv(recent_articles_path)\n",
    "print(f\"\\nDownloaded {len(recent_df)} recent articles\")\n",
    "recent_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download articles from specific source prefix\n",
    "tech_articles_path = downloader.download_to_csv(\n",
    "    output_path='tech_articles.csv',\n",
    "    prefix='articles/tech/'\n",
    ")\n",
    "\n",
    "tech_df = pd.read_csv(tech_articles_path)\n",
    "print(f\"\\nDownloaded {len(tech_df)} tech articles\")\n",
    "tech_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Analysis Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze downloaded data\n",
    "if 'source' in tech_df.columns:\n",
    "    source_counts = tech_df['source'].value_counts()\n",
    "\n",
    "    print(\"\\nArticles per Source:\")\n",
    "    print(source_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Time-based analysis: count articles per publication day\n",
    "if 'published_date' in tech_df.columns:\n",
    "    tech_df['published_date'] = pd.to_datetime(tech_df['published_date'])\n",
    "    daily_counts = tech_df.resample('D', on='published_date').size()\n",
    "\n",
    "    # Plot\n",
    "    plt.figure(figsize=(15, 6))\n",
    "    daily_counts.plot(kind='bar')\n",
    "    plt.title('Articles per Day')\n",
    "    plt.xlabel('Date')\n",
    "    plt.ylabel('Number of Articles')\n",
    "    plt.xticks(rotation=45)\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Export Filtered Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter and export specific data\n",
    "if 'title' in tech_df.columns and 'content' in tech_df.columns:\n",
    "    # Filter articles containing specific keywords.\n",
    "    # NOTE: \\b word boundary around 'AI' prevents matching the substring\n",
    "    # 'ai' inside ordinary words (e.g. 'said', 'rain') under case=False.\n",
    "    ai_pattern = r'\\bAI\\b|artificial intelligence'\n",
    "    ai_articles = tech_df[\n",
    "        tech_df['title'].str.contains(ai_pattern, case=False, na=False) |\n",
    "        tech_df['content'].str.contains(ai_pattern, case=False, na=False)\n",
    "    ]\n",
    "\n",
    "    # Export filtered data\n",
    "    ai_articles.to_csv('ai_articles.csv', index=False)\n",
    "    print(f\"\\nExported {len(ai_articles)} AI-related articles to ai_articles.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Cleanup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: Remove downloaded CSV files\n",
    "for file in ['recent_articles.csv', 'tech_articles.csv', 'ai_articles.csv']:\n",
    "    if os.path.exists(file):\n",
    "        os.remove(file)\n",
    "        print(f\"Removed {file}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}