{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RSS Article Query Examples\n",
    "\n",
    "This notebook demonstrates how to query RSS articles using the ArticleQuerier class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import os\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "\n",
    "from src.search import ArticleQuerier\n",
    "\n",
    "# Configure logging if needed\n",
    "logging.basicConfig(level=logging.INFO)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize the Querier\n",
    "\n",
    "You can either set environment variables or use default values:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: Set environment variables\n",
    "# The values below are placeholders; replace them with your own settings.\n",
    "os.environ['AWS_REGION'] = 'eu-west-3'\n",
    "os.environ['RSS_DATABASE_NAME'] = 'rss_articles'\n",
    "os.environ['RSS_TABLE_NAME'] = 'articles'\n",
    "os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
    "\n",
    "# Initialize querier\n",
    "querier = ArticleQuerier()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Basic Source Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get all sources and their article counts\n",
    "sources = querier.get_sources()\n",
    "\n",
    "# Display top sources\n",
    "print(\"Top Sources by Article Count:\")\n",
    "sources.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Search Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search articles containing 'python' in title\n",
    "python_articles = querier.search(title=\"python\", limit=5)\n",
    "print(\"\\nArticles about Python:\")\n",
    "python_articles[['title', 'source', 'published_date']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search with multiple filters\n",
    "# Get recent AWS articles from specific source\n",
    "filtered_articles = querier.search(\n",
    "    content=\"aws\",\n",
    "    source=\"techcrunch\",\n",
    "    date_from=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'),\n",
    "    limit=5\n",
    ")\n",
    "\n",
    "print(\"\\nRecent AWS articles from TechCrunch:\")\n",
    "filtered_articles[['title', 'published_date', 'url']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Custom SQL Queries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Articles per month by source\n",
    "monthly_stats = querier.query(\"\"\"\n",
    "    SELECT\n",
    "        source,\n",
    "        DATE_TRUNC('month', published_date) as month,\n",
    "        COUNT(*) as article_count\n",
    "    FROM articles\n",
    "    WHERE published_date >= CURRENT_DATE - INTERVAL '6' MONTH\n",
    "    GROUP BY 1, 2\n",
    "    ORDER BY 2 DESC, 3 DESC\n",
    "\"\"\")\n",
    "\n",
    "print(\"\\nMonthly Article Counts:\")\n",
    "monthly_stats.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Topic analysis (searching for specific keywords)\n",
    "#\n",
    "# Short keywords ('js', 'ai') are matched as whole tokens: a bare LIKE '%ai%'\n",
    "# would also count titles containing words such as 'email' or 'training'.\n",
    "# The topic label is computed in a CTE and filtered with WHERE, because a\n",
    "# SELECT alias cannot be referenced from HAVING on Presto/Trino-style engines.\n",
    "# NOTE(review): regexp_like assumes a Presto/Trino-style engine such as\n",
    "# Athena - confirm against the backend ArticleQuerier talks to.\n",
    "topic_analysis = querier.query(\"\"\"\n",
    "    WITH labeled AS (\n",
    "        SELECT\n",
    "            CASE\n",
    "                WHEN LOWER(title) LIKE '%python%' THEN 'Python'\n",
    "                WHEN LOWER(title) LIKE '%javascript%'\n",
    "                    OR regexp_like(LOWER(title), '(^|[^a-z0-9])js([^a-z0-9]|$)') THEN 'JavaScript'\n",
    "                WHEN LOWER(title) LIKE '%aws%' THEN 'AWS'\n",
    "                WHEN regexp_like(LOWER(title), '(^|[^a-z0-9])ai([^a-z0-9]|$)')\n",
    "                    OR LOWER(title) LIKE '%artificial intelligence%' THEN 'AI'\n",
    "            END AS topic\n",
    "        FROM articles\n",
    "        WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
    "    )\n",
    "    SELECT\n",
    "        topic,\n",
    "        COUNT(*) AS article_count\n",
    "    FROM labeled\n",
    "    WHERE topic IS NOT NULL\n",
    "    GROUP BY 1\n",
    "    ORDER BY 2 DESC\n",
    "\"\"\")\n",
    "\n",
    "print(\"\\nTopic Analysis (Last 30 Days):\")\n",
    "topic_analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Visualization Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get daily article counts\n",
    "daily_counts = querier.query(\"\"\"\n",
    "    SELECT\n",
    "        DATE_TRUNC('day', published_date) as date,\n",
    "        COUNT(*) as article_count\n",
    "    FROM articles\n",
    "    WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
    "    GROUP BY 1\n",
    "    ORDER BY 1\n",
    "\"\"\")\n",
    "\n",
    "# Plot (explicit figure/axes instead of the implicit pyplot state machine)\n",
    "fig, ax = plt.subplots(figsize=(15, 6))\n",
    "ax.plot(daily_counts['date'], daily_counts['article_count'])\n",
    "ax.set_title('Daily Article Counts (Last 30 Days)')\n",
    "ax.set_xlabel('Date')\n",
    "ax.set_ylabel('Number of Articles')\n",
    "ax.grid(True)\n",
    "ax.tick_params(axis='x', rotation=45)\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}