Files
IngestRSS/example-notebooks/query-notebook.ipynb
Charles-Gormley 758c2ccdde batch update
2024-11-10 21:18:48 -05:00

200 lines
5.2 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RSS Article Query Examples\n",
"\n",
"This notebook demonstrates how to query RSS articles using the ArticleQuerier class."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"import logging\n",
"import os\n",
"from datetime import datetime, timedelta\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"\n",
"from src.search import ArticleQuerier\n",
"\n",
"# Configure logging if needed\n",
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize the Querier\n",
"\n",
"You can either set environment variables or use default values:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Optional: provide example settings for anything not already configured.\n",
"# setdefault() leaves variables that are already present in the environment\n",
"# untouched, so real settings are never replaced by these example values.\n",
"os.environ.setdefault('AWS_REGION', 'eu-west-3')\n",
"os.environ.setdefault('RSS_DATABASE_NAME', 'rss_articles')\n",
"os.environ.setdefault('RSS_TABLE_NAME', 'articles')\n",
"os.environ.setdefault('RSS_BUCKET_NAME', 'your-bucket')  # placeholder value: replace with your bucket\n",
"\n",
"# Initialize querier\n",
"querier = ArticleQuerier()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Basic Source Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Ask the querier for every source together with its article count\n",
"source_counts = querier.get_sources()\n",
"\n",
"# Heading, then the first ten rows as the cell's rich output\n",
"print(\"Top Sources by Article Count:\")\n",
"source_counts.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Search Examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Title search for 'python', capped at five results\n",
"title_columns = ['title', 'source', 'published_date']\n",
"python_articles = querier.search(title=\"python\", limit=5)\n",
"\n",
"# Heading, then only the columns of interest\n",
"print(\"\\nArticles about Python:\")\n",
"python_articles[title_columns]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Combine several filters in a single search:\n",
"# content keyword + source + a start date 30 days before now\n",
"thirty_days_ago = datetime.now() - timedelta(days=30)\n",
"filtered_articles = querier.search(\n",
"    content=\"aws\",\n",
"    source=\"techcrunch\",\n",
"    date_from=thirty_days_ago.strftime('%Y-%m-%d'),\n",
"    limit=5\n",
")\n",
"\n",
"print(\"\\nRecent AWS articles from TechCrunch:\")\n",
"filtered_articles[['title', 'published_date', 'url']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Custom SQL Queries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# SQL text lives in its own variable so the query call stays short.\n",
"# It counts articles per source and calendar month over the last 6 months,\n",
"# newest month first, then by descending count.\n",
"monthly_stats_sql = \"\"\"\n",
" SELECT \n",
" source,\n",
" DATE_TRUNC('month', published_date) as month,\n",
" COUNT(*) as article_count\n",
" FROM articles\n",
" WHERE published_date >= CURRENT_DATE - INTERVAL '6' MONTH\n",
" GROUP BY 1, 2\n",
" ORDER BY 2 DESC, 3 DESC\n",
"\"\"\"\n",
"\n",
"monthly_stats = querier.query(monthly_stats_sql)\n",
"\n",
"print(\"\\nMonthly Article Counts:\")\n",
"monthly_stats.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Topic analysis (keyword buckets over the last 30 days of titles)\n",
"#\n",
"# - The CASE expression is evaluated in a subquery so the outer query can\n",
"#   filter on `topic`; a SELECT alias cannot be referenced from HAVING in\n",
"#   Presto/Trino-style SQL.\n",
"# - Short keywords ('js', 'aws', 'ai') are matched as whole tokens with\n",
"#   regexp_like; a plain LIKE '%ai%' also matches words such as 'said' or\n",
"#   'email', '%js%' matches 'json', and '%aws%' matches 'laws'.\n",
"# - A title is counted once, under the first topic that matches.\n",
"# NOTE(review): regexp_like is the Presto/Trino (Athena) function name -\n",
"# confirm it matches the engine behind querier.query().\n",
"topic_analysis = querier.query(\"\"\"\n",
"    SELECT\n",
"        topic,\n",
"        COUNT(*) AS article_count\n",
"    FROM (\n",
"        SELECT\n",
"            CASE\n",
"                WHEN LOWER(title) LIKE '%python%' THEN 'Python'\n",
"                WHEN LOWER(title) LIKE '%javascript%'\n",
"                    OR regexp_like(LOWER(title), '(^|[^a-z0-9])js([^a-z0-9]|$)') THEN 'JavaScript'\n",
"                WHEN regexp_like(LOWER(title), '(^|[^a-z0-9])aws([^a-z0-9]|$)') THEN 'AWS'\n",
"                WHEN regexp_like(LOWER(title), '(^|[^a-z0-9])ai([^a-z0-9]|$)')\n",
"                    OR LOWER(title) LIKE '%artificial intelligence%' THEN 'AI'\n",
"            END AS topic\n",
"        FROM articles\n",
"        WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
"    ) AS tagged\n",
"    WHERE topic IS NOT NULL\n",
"    GROUP BY 1\n",
"    ORDER BY 2 DESC\n",
"\"\"\")\n",
"\n",
"print(\"\\nTopic Analysis (Last 30 Days):\")\n",
"topic_analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Visualization Example"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Get daily article counts for the last 30 days, oldest day first\n",
"daily_counts = querier.query(\"\"\"\n",
" SELECT \n",
" DATE_TRUNC('day', published_date) as date,\n",
" COUNT(*) as article_count\n",
" FROM articles\n",
" WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
" GROUP BY 1\n",
" ORDER BY 1\n",
"\"\"\")\n",
"\n",
"# Plot with the explicit Figure/Axes interface rather than pyplot's implicit\n",
"# current-figure state (matplotlib.pyplot is imported as plt in the imports\n",
"# cell at the top of the notebook).\n",
"fig, ax = plt.subplots(figsize=(15, 6))\n",
"ax.plot(daily_counts['date'], daily_counts['article_count'])\n",
"ax.set_title('Daily Article Counts (Last 30 Days)')\n",
"ax.set_xlabel('Date')\n",
"ax.set_ylabel('Number of Articles')\n",
"ax.grid(True)\n",
"ax.tick_params(axis='x', rotation=45)  # slant the date labels so they do not overlap\n",
"fig.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 4
}