mirror of
https://github.com/aljazceru/IngestRSS.git
synced 2025-12-17 14:04:19 +01:00
200 lines
5.2 KiB
Plaintext
200 lines
5.2 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# RSS Article Query Examples\n",
|
|
"\n",
|
|
"This notebook demonstrates how to query RSS articles using the ArticleQuerier class."
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "import logging\n",
  "import os\n",
  "from datetime import datetime, timedelta\n",
  "\n",
  "import matplotlib.pyplot as plt\n",
  "import pandas as pd\n",
  "\n",
  "from src.search import ArticleQuerier\n",
  "\n",
  "# Configure logging if needed\n",
  "logging.basicConfig(level=logging.INFO)"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Initialize the Querier\n",
|
|
"\n",
|
|
"You can either set environment variables or use default values:"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Optional: fallback values for the environment variables set below.\n",
  "# setdefault() only fills in a variable that is not already set, so values\n",
  "# exported before starting the kernel are kept instead of being overwritten.\n",
  "os.environ.setdefault('AWS_REGION', 'eu-west-3')\n",
  "os.environ.setdefault('RSS_DATABASE_NAME', 'rss_articles')\n",
  "os.environ.setdefault('RSS_TABLE_NAME', 'articles')\n",
  "# NOTE(review): 'your-bucket' looks like a placeholder - replace it or export\n",
  "# RSS_BUCKET_NAME before running the notebook.\n",
  "os.environ.setdefault('RSS_BUCKET_NAME', 'your-bucket')\n",
  "\n",
  "# Initialize querier\n",
  "querier = ArticleQuerier()"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 1. Basic Source Analysis"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Get all sources and their article counts\n",
  "sources = querier.get_sources()\n",
  "\n",
  "# Print a heading, then show the first 10 rows as the cell's output\n",
  "print(\"Top Sources by Article Count:\")\n",
  "sources.head(10)"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 2. Search Examples"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Search articles containing 'python' in title\n",
  "python_articles = querier.search(title=\"python\", limit=5)\n",
  "\n",
  "# Print a heading, then show only the title, source and publication date columns\n",
  "print(\"\\nArticles about Python:\")\n",
  "python_articles[['title', 'source', 'published_date']]"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Search with multiple filters\n",
  "# Get recent AWS articles from specific source\n",
  "\n",
  "# Lower date bound: 30 days before now (local time), formatted as YYYY-MM-DD\n",
  "thirty_days_ago = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')\n",
  "\n",
  "filtered_articles = querier.search(\n",
  "    content=\"aws\",\n",
  "    source=\"techcrunch\",\n",
  "    date_from=thirty_days_ago,\n",
  "    limit=5\n",
  ")\n",
  "\n",
  "print(\"\\nRecent AWS articles from TechCrunch:\")\n",
  "filtered_articles[['title', 'published_date', 'url']]"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 3. Custom SQL Queries"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Articles per month by source\n",
  "# Counts rows per (source, month) for the last 6 months, ordered by month\n",
  "# (newest first) and then by article count (largest first).\n",
  "monthly_stats = querier.query(\"\"\"\n",
  " SELECT \n",
  " source,\n",
  " DATE_TRUNC('month', published_date) as month,\n",
  " COUNT(*) as article_count\n",
  " FROM articles\n",
  " WHERE published_date >= CURRENT_DATE - INTERVAL '6' MONTH\n",
  " GROUP BY 1, 2\n",
  " ORDER BY 2 DESC, 3 DESC\n",
  "\"\"\")\n",
  "\n",
  "# Print a heading, then show the first 10 rows as the cell's output\n",
  "print(\"\\nMonthly Article Counts:\")\n",
  "monthly_stats.head(10)"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Topic analysis (searching for specific keywords)\n",
  "#\n",
  "# The CASE expression is evaluated in a subquery so that the outer query can\n",
  "# filter and group on `topic`. A SELECT alias cannot be referenced from HAVING\n",
  "# in standard SQL, and engines such as Trino/Athena and PostgreSQL reject it.\n",
  "#\n",
  "# NOTE(review): '%js%' and '%ai%' are plain substring matches, so titles that\n",
  "# merely contain those letters (e.g. 'json', 'email') are counted as well.\n",
  "topic_analysis = querier.query(\"\"\"\n",
  "    SELECT\n",
  "        topic,\n",
  "        COUNT(*) AS article_count\n",
  "    FROM (\n",
  "        SELECT\n",
  "            CASE\n",
  "                WHEN LOWER(title) LIKE '%python%' THEN 'Python'\n",
  "                WHEN LOWER(title) LIKE '%javascript%' OR LOWER(title) LIKE '%js%' THEN 'JavaScript'\n",
  "                WHEN LOWER(title) LIKE '%aws%' THEN 'AWS'\n",
  "                WHEN LOWER(title) LIKE '%ai%' OR LOWER(title) LIKE '%artificial intelligence%' THEN 'AI'\n",
  "            END AS topic\n",
  "        FROM articles\n",
  "        WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
  "    ) AS classified\n",
  "    WHERE topic IS NOT NULL\n",
  "    GROUP BY 1\n",
  "    ORDER BY 2 DESC\n",
  "\"\"\")\n",
  "\n",
  "print(\"\\nTopic Analysis (Last 30 Days):\")\n",
  "topic_analysis"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 4. Visualization Example"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Get daily article counts\n",
  "# One row per day for the last 30 days, ordered by date.\n",
  "daily_counts = querier.query(\"\"\"\n",
  " SELECT \n",
  " DATE_TRUNC('day', published_date) as date,\n",
  " COUNT(*) as article_count\n",
  " FROM articles\n",
  " WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
  " GROUP BY 1\n",
  " ORDER BY 1\n",
  "\"\"\")\n",
  "\n",
  "# Plot using the explicit Figure/Axes interface rather than pyplot's implicit\n",
  "# current-figure state, so the cell does not depend on earlier plotting calls.\n",
  "fig, ax = plt.subplots(figsize=(15, 6))\n",
  "ax.plot(daily_counts['date'], daily_counts['article_count'])\n",
  "ax.set_title('Daily Article Counts (Last 30 Days)')\n",
  "ax.set_xlabel('Date')\n",
  "ax.set_ylabel('Number of Articles')\n",
  "ax.grid(True)\n",
  "# Rotate the x tick labels so the dates do not overlap\n",
  "ax.tick_params(axis='x', labelrotation=45)\n",
  "fig.tight_layout()\n",
  "plt.show()"
 ]
}
|
|
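],
"metadata": {
 "kernelspec": {
  "display_name": "Python 3",
  "language": "python",
  "name": "python3"
 },
 "language_info": {
  "name": "python"
 }
},
"nbformat": 4,
"nbformat_minor": 4
}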
|