mirror of
https://github.com/aljazceru/IngestRSS.git
synced 2025-12-17 22:14:20 +01:00
batch update
This commit is contained in:
199
example-notebooks/query-notebook.ipynb
Normal file
199
example-notebooks/query-notebook.ipynb
Normal file
@@ -0,0 +1,199 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# RSS Article Query Examples\n",
|
||||
"\n",
|
||||
"This notebook demonstrates how to query RSS articles using the ArticleQuerier class."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import pandas as pd\n",
|
||||
"from datetime import datetime, timedelta\n",
|
||||
"# NOTE(review): `src` is presumably the repository's top-level package, so the\n",
|
||||
"# kernel must be able to import it (e.g. start Jupyter from the repo root) -- confirm.\n",
|
||||
"from src.search import ArticleQuerier\n",
|
||||
"\n",
|
||||
"# Configure logging if needed\n",
|
||||
"import logging\n",
|
||||
"logging.basicConfig(level=logging.INFO)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize the Querier\n",
|
||||
"\n",
|
||||
"You can either set environment variables or use default values:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Optional: fallback settings for a local run. setdefault() only fills in\n",
|
||||
"# variables that are not already exported, so real configuration is never\n",
|
||||
"# overwritten by these placeholder values.\n",
|
||||
"os.environ.setdefault('AWS_REGION', 'eu-west-3')\n",
|
||||
"os.environ.setdefault('RSS_DATABASE_NAME', 'rss_articles')\n",
|
||||
"os.environ.setdefault('RSS_TABLE_NAME', 'articles')\n",
|
||||
"os.environ.setdefault('RSS_BUCKET_NAME', 'your-bucket')  # placeholder: use your own bucket\n",
|
||||
"\n",
|
||||
"# Initialize querier (presumably configured from the variables above -- verify in src.search)\n",
|
||||
"querier = ArticleQuerier()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 1. Basic Source Analysis"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get all sources and their article counts\n",
|
||||
"sources = querier.get_sources()\n",
|
||||
"\n",
|
||||
"# Print a heading, then show the first ten rows as the cell's output\n",
|
||||
"print(\"Top Sources by Article Count:\")\n",
|
||||
"sources.head(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 2. Search Examples"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Search articles containing 'python' in title (limit=5)\n",
|
||||
"python_articles = querier.search(title=\"python\", limit=5)\n",
|
||||
"print(\"\\nArticles about Python:\")\n",
|
||||
"# Show only the columns needed to identify each hit\n",
|
||||
"python_articles[['title', 'source', 'published_date']]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Search with multiple filters\n",
|
||||
"# Get recent AWS articles from specific source; date_from is the date\n",
|
||||
"# 30 days before now, formatted as YYYY-MM-DD\n",
|
||||
"filtered_articles = querier.search(\n",
|
||||
"    content=\"aws\",\n",
|
||||
"    source=\"techcrunch\",\n",
|
||||
"    date_from=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'),\n",
|
||||
"    limit=5\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"\\nRecent AWS articles from TechCrunch:\")\n",
|
||||
"filtered_articles[['title', 'published_date', 'url']]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 3. Custom SQL Queries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Articles per month by source over the last six months,\n",
|
||||
"# newest month first and, within a month, largest count first\n",
|
||||
"monthly_stats = querier.query(\"\"\"\n",
|
||||
" SELECT \n",
|
||||
" source,\n",
|
||||
" DATE_TRUNC('month', published_date) as month,\n",
|
||||
" COUNT(*) as article_count\n",
|
||||
" FROM articles\n",
|
||||
" WHERE published_date >= CURRENT_DATE - INTERVAL '6' MONTH\n",
|
||||
" GROUP BY 1, 2\n",
|
||||
" ORDER BY 2 DESC, 3 DESC\n",
|
||||
"\"\"\")\n",
|
||||
"\n",
|
||||
"print(\"\\nMonthly Article Counts:\")\n",
|
||||
"monthly_stats.head(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Topic analysis: label each title from the last 30 days by keyword, then count per label.\n",
|
||||
"# The label is computed in a derived table and filtered with WHERE because engines such as\n",
|
||||
"# Trino/Athena and PostgreSQL cannot resolve a SELECT alias inside HAVING.\n",
|
||||
"# NOTE(review): '%js%' and '%ai%' are plain substring matches, so titles containing words\n",
|
||||
"# like 'said' or 'training' are also counted as AI -- tighten the patterns if precision matters.\n",
|
||||
"topic_analysis = querier.query(\"\"\"\n",
|
||||
"    SELECT\n",
|
||||
"        topic,\n",
|
||||
"        COUNT(*) AS article_count\n",
|
||||
"    FROM (\n",
|
||||
"        SELECT\n",
|
||||
"            CASE\n",
|
||||
"                WHEN LOWER(title) LIKE '%python%' THEN 'Python'\n",
|
||||
"                WHEN LOWER(title) LIKE '%javascript%' OR LOWER(title) LIKE '%js%' THEN 'JavaScript'\n",
|
||||
"                WHEN LOWER(title) LIKE '%aws%' THEN 'AWS'\n",
|
||||
"                WHEN LOWER(title) LIKE '%ai%' OR LOWER(title) LIKE '%artificial intelligence%' THEN 'AI'\n",
|
||||
"            END AS topic\n",
|
||||
"        FROM articles\n",
|
||||
"        WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
|
||||
"    ) AS labelled\n",
|
||||
"    WHERE topic IS NOT NULL\n",
|
||||
"    GROUP BY 1\n",
|
||||
"    ORDER BY 2 DESC\n",
|
||||
"\"\"\")\n",
|
||||
"\n",
|
||||
"print(\"\\nTopic Analysis (Last 30 Days):\")\n",
|
||||
"topic_analysis"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 4. Visualization Example"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"# Get daily article counts for the last 30 days, oldest day first\n",
|
||||
"daily_counts = querier.query(\"\"\"\n",
|
||||
" SELECT \n",
|
||||
" DATE_TRUNC('day', published_date) as date,\n",
|
||||
" COUNT(*) as article_count\n",
|
||||
" FROM articles\n",
|
||||
" WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
|
||||
" GROUP BY 1\n",
|
||||
" ORDER BY 1\n",
|
||||
"\"\"\")\n",
|
||||
"\n",
|
||||
"# Plot through the explicit Figure/Axes interface rather than pyplot's implicit\n",
|
||||
"# current-figure state, so the cell does not depend on earlier plotting cells\n",
|
||||
"fig, ax = plt.subplots(figsize=(15, 6))\n",
|
||||
"ax.plot(daily_counts['date'], daily_counts['article_count'])\n",
|
||||
"ax.set_title('Daily Article Counts (Last 30 Days)')\n",
|
||||
"ax.set_xlabel('Date')\n",
|
||||
"ax.set_ylabel('Number of Articles')\n",
|
||||
"ax.grid(True)\n",
|
||||
"ax.tick_params(axis='x', labelrotation=45)\n",
|
||||
"fig.tight_layout()\n",
|
||||
"plt.show()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
Reference in New Issue
Block a user