Mirror of https://github.com/aljazceru/IngestRSS.git, synced 2025-12-17 14:04:19 +01:00.
Commit: sped up batch ingestion of articles.
@@ -11,17 +11,33 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 6,
 "metadata": {},
+"outputs": [],
 "source": [
 "import os\n",
+"import sys\n",
 "import pandas as pd\n",
-"from datetime import datetime, timedelta\n",
-"from src.search import S3BatchDownloader\n",
+"from time import time\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 2,
+"metadata": {},
+"outputs": [],
+"source": [
+"# Root imports\n",
+"os.chdir('..')\n",
 "\n",
-"# Configure logging\n",
-"import logging\n",
-"logging.basicConfig(level=logging.INFO)"
+"from src.search.batch import S3BatchDownloader\n",
+"from dotenv import load_dotenv\n",
+"\n",
+"load_dotenv(override=True)\n",
+"\n",
+"downloader = S3BatchDownloader()\n",
+"\n",
+"os.chdir('example-notebooks')\n"
 ]
 },
 {
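The rewritten setup cell above swaps the hard-coded os.environ assignments (removed further down in this diff) for python-dotenv. A minimal sketch of the environment it appears to assume, using the variable names the updated _load_config() reads later in this commit (AWS_REGION, S3_BUCKET_NAME); the values are placeholders, not the project's real configuration:

    # Hypothetical environment setup mirroring what load_dotenv() is expected to provide.
    import os

    os.environ.setdefault("AWS_REGION", "us-east-1")        # _load_config falls back to this default
    os.environ.setdefault("S3_BUCKET_NAME", "your-bucket")  # no fallback in the new code, so it must be set

    # Equivalent .env file, picked up by load_dotenv(override=True):
    #   AWS_REGION=us-east-1
    #   S3_BUCKET_NAME=your-bucket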
@@ -31,18 +47,226 @@
 "## Initialize the Downloader"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": 3,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"List permission: Allowed\n"
+]
+}
+],
+"source": [
+"import boto3\n",
+"from botocore.exceptions import ClientError\n",
+"\n",
+"s3 = boto3.client('s3')\n",
+"\n",
+"# Test ListObjects\n",
+"try:\n",
+"    response = s3.list_objects_v2(Bucket='open-rss-articles-us-east-1')\n",
+"    print(\"List permission: Allowed\")\n",
+"except ClientError as e:\n",
+"    print(\"List permission: Denied\")\n"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Downloaded data to consolidated_data.csv in 59.01 seconds\n"
+]
+}
+],
 "source": [
-"# Optional: Set environment variables\n",
-"os.environ['AWS_REGION'] = 'eu-west-3'\n",
-"os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
-"os.environ['RSS_PREFIX'] = 'articles/'\n",
+"start = time()\n",
+"output_path = \"consolidated_data.csv\" # or \"consolidated_data.json\"\n",
 "\n",
-"# Initialize downloader\n",
-"downloader = S3BatchDownloader()"
+"# Define date range\n",
+"start_date = \"2024-11-17\"\n",
+"end_date = \"2024-11-20\"\n",
+"\n",
+"# Start downloading\n",
+"downloader.download_to_file(\n",
+"    output_path=output_path,\n",
+"    file_format=\"csv\", # or \"json\"\n",
+"    start_date=start_date,\n",
+"    end_date=end_date\n",
+")\n",
+"\n",
+"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 8,
+"metadata": {},
+"outputs": [],
+"source": [
+"df = pd.read_csv(output_path)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 13,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/html": [ ...HTML table rendering of the same five-row df.head() preview as the text/plain output below... ],
+"text/plain": [
+" link \\\n",
+"0 https://www.cnet.com/home/internet/comporium-i... \n",
+"1 https://kotaku.com/lady-gaga-wednesday-season-... \n",
+"2 https://expeditionportal.com/classifieds-honda... \n",
+"3 http://www.influencerupdate.biz/news/68624/twi... \n",
+"4 https://thejournal.com/Articles/2024/10/29/Rep... \n",
+"\n",
+" rss \\\n",
+"0 https://www.cnet.com/rss/news/ \n",
+"1 https://kotaku.com/rss \n",
+"2 https://www.expeditionportal.com/feed/ \n",
+"3 https://www.influencerupdate.biz/rss/news/ \n",
+"4 https://thejournal.com/rss-feeds/news.aspx \n",
+"\n",
+" title \\\n",
+"0 Comporium Home Internet: Pricing, Speeds and A... \n",
+"1 Lady Gaga Appearing In Wednesday Season 2 Is A... \n",
+"2 2008 Honda Element EX ECamper :: Classifieds \n",
+"3 Twitch is testing mid-roll ads on creator broa... \n",
+"4 Report: 90% of Schools Depend on E-rate Fundin... \n",
+"\n",
+" content unixTime rss_id \\\n",
+"0 Unavailable in Provider unavailable in 90001 E... 1731883654 f5e6f52c79 \n",
+"1 The next stop on Lady Gaga’s years-long tour t... 1731883665 a0840ab3b4 \n",
+"2 Overlanding is about experiences first and for... 1731883665 ecdc66bb02 \n",
+"3 Streaming giant Twitch is planning to test mid... 1731883669 e34caba76d \n",
+"4 Report: 90% of Schools Depend on E-rate Fundin... 1731883616 efd9bb9654 \n",
+"\n",
+" article_id llm_summary embedding unixtime \n",
+"0 00089f7505 NaN NaN 1731883654 \n",
+"1 009f880a86 NaN NaN 1731883665 \n",
+"2 00ac77e95f NaN NaN 1731883665 \n",
+"3 0041bc4abf NaN NaN 1731883669 \n",
+"4 000a3da3b6 NaN NaN 1731883616 "
+]
+},
+"execution_count": 13,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"df.head()"
 ]
 },
 {
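The new read-back cell above uses pd.read_csv because the example chooses file_format="csv". If the JSON format were chosen instead, the new _save_to_file() further down in this commit writes JSON Lines (orient='records', lines=True), so a hypothetical read-back would look like this (the file name is a placeholder matching the comment in the cell):

    import pandas as pd

    # JSON-lines variant of the read-back step; mirrors _save_to_file(..., file_format='json').
    df = pd.read_json("consolidated_data.json", lines=True)
    print(df.columns.tolist())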
@@ -51,150 +275,27 @@
 "source": [
 "## 1. Check Storage Statistics"
 ]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Get storage stats\n",
-"stats = downloader.get_storage_stats()\n",
-"\n",
-"print(f\"Total objects: {stats['total_objects']:,}\")\n",
-"print(f\"Total size: {stats['total_size_mb']:.2f} MB\")\n",
-"print(f\"Average object size: {stats['average_size_kb']:.2f} KB\")"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## 2. Download Examples"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Download last 7 days of articles\n",
-"start_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')\n",
-"\n",
-"recent_articles_path = downloader.download_to_csv(\n",
-"    output_path='recent_articles.csv',\n",
-"    start_date=start_date\n",
-")\n",
-"\n",
-"# Load and display sample\n",
-"recent_df = pd.read_csv(recent_articles_path)\n",
-"print(f\"\\nDownloaded {len(recent_df)} recent articles\")\n",
-"recent_df.head()"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Download articles from specific source prefix\n",
-"tech_articles_path = downloader.download_to_csv(\n",
-"    output_path='tech_articles.csv',\n",
-"    prefix='articles/tech/'\n",
-")\n",
-"\n",
-"tech_df = pd.read_csv(tech_articles_path)\n",
-"print(f\"\\nDownloaded {len(tech_df)} tech articles\")\n",
-"tech_df.head()"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## 3. Analysis Examples"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Analyze downloaded data\n",
-"if 'source' in tech_df.columns:\n",
-"    source_counts = tech_df['source'].value_counts()\n",
-"    \n",
-"    print(\"\\nArticles per Source:\")\n",
-"    print(source_counts)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Time-based analysis\n",
-"if 'published_date' in tech_df.columns:\n",
-"    tech_df['published_date'] = pd.to_datetime(tech_df['published_date'])\n",
-"    daily_counts = tech_df.resample('D', on='published_date').size()\n",
-"    \n",
-"    # Plot\n",
-"    import matplotlib.pyplot as plt\n",
-"    \n",
-"    plt.figure(figsize=(15, 6))\n",
-"    daily_counts.plot(kind='bar')\n",
-"    plt.title('Articles per Day')\n",
-"    plt.xlabel('Date')\n",
-"    plt.ylabel('Number of Articles')\n",
-"    plt.xticks(rotation=45)\n",
-"    plt.tight_layout()\n",
-"    plt.show()"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## 4. Export Filtered Data"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Filter and export specific data\n",
-"if 'title' in tech_df.columns and 'content' in tech_df.columns:\n",
-"    # Filter articles containing specific keywords\n",
-"    ai_articles = tech_df[\n",
-"        tech_df['title'].str.contains('AI|artificial intelligence', case=False, na=False) |\n",
-"        tech_df['content'].str.contains('AI|artificial intelligence', case=False, na=False)\n",
-"    ]\n",
-"    \n",
-"    # Export filtered data\n",
-"    ai_articles.to_csv('ai_articles.csv', index=False)\n",
-"    print(f\"\\nExported {len(ai_articles)} AI-related articles to ai_articles.csv\")"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## 5. Cleanup"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Optional: Remove downloaded CSV files\n",
-"import os\n",
-"\n",
-"for file in ['recent_articles.csv', 'tech_articles.csv', 'ai_articles.csv']:\n",
-"    if os.path.exists(file):\n",
-"        os.remove(file)\n",
-"        print(f\"Removed {file}\")"
-]
 }
-]
+],
+"metadata": {
+"kernelspec": {
+"display_name": "base",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.10.9"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 2
 }
example-notebooks/consolidated_data.csv — new file, 289288 lines. File diff suppressed because one or more lines are too long.
@@ -1,199 +0,0 @@
-{
-"cells": [
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"# RSS Article Query Examples\n",
-"\n",
-"This notebook demonstrates how to query RSS articles using the ArticleQuerier class."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"import os\n",
-"import pandas as pd\n",
-"from datetime import datetime, timedelta\n",
-"from src.search import ArticleQuerier\n",
-"\n",
-"# Configure logging if needed\n",
-"import logging\n",
-"logging.basicConfig(level=logging.INFO)"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## Initialize the Querier\n",
-"\n",
-"You can either set environment variables or use default values:"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Optional: Set environment variables\n",
-"os.environ['AWS_REGION'] = 'eu-west-3'\n",
-"os.environ['RSS_DATABASE_NAME'] = 'rss_articles'\n",
-"os.environ['RSS_TABLE_NAME'] = 'articles'\n",
-"os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
-"\n",
-"# Initialize querier\n",
-"querier = ArticleQuerier()"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## 1. Basic Source Analysis"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Get all sources and their article counts\n",
-"sources = querier.get_sources()\n",
-"\n",
-"# Display top sources\n",
-"print(\"Top Sources by Article Count:\")\n",
-"sources.head(10)"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## 2. Search Examples"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Search articles containing 'python' in title\n",
-"python_articles = querier.search(title=\"python\", limit=5)\n",
-"print(\"\\nArticles about Python:\")\n",
-"python_articles[['title', 'source', 'published_date']]"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Search with multiple filters\n",
-"# Get recent AWS articles from specific source\n",
-"filtered_articles = querier.search(\n",
-"    content=\"aws\",\n",
-"    source=\"techcrunch\",\n",
-"    date_from=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'),\n",
-"    limit=5\n",
-")\n",
-"\n",
-"print(\"\\nRecent AWS articles from TechCrunch:\")\n",
-"filtered_articles[['title', 'published_date', 'url']]"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## 3. Custom SQL Queries"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Articles per month by source\n",
-"monthly_stats = querier.query(\"\"\"\n",
-"    SELECT \n",
-"        source,\n",
-"        DATE_TRUNC('month', published_date) as month,\n",
-"        COUNT(*) as article_count\n",
-"    FROM articles\n",
-"    WHERE published_date >= CURRENT_DATE - INTERVAL '6' MONTH\n",
-"    GROUP BY 1, 2\n",
-"    ORDER BY 2 DESC, 3 DESC\n",
-"\"\"\")\n",
-"\n",
-"print(\"\\nMonthly Article Counts:\")\n",
-"monthly_stats.head(10)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"# Topic analysis (searching for specific keywords)\n",
-"topic_analysis = querier.query(\"\"\"\n",
-"    SELECT \n",
-"        CASE\n",
-"            WHEN LOWER(title) LIKE '%python%' THEN 'Python'\n",
-"            WHEN LOWER(title) LIKE '%javascript%' OR LOWER(title) LIKE '%js%' THEN 'JavaScript'\n",
-"            WHEN LOWER(title) LIKE '%aws%' THEN 'AWS'\n",
-"            WHEN LOWER(title) LIKE '%ai%' OR LOWER(title) LIKE '%artificial intelligence%' THEN 'AI'\n",
-"        END as topic,\n",
-"        COUNT(*) as article_count\n",
-"    FROM articles\n",
-"    WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
-"    GROUP BY 1\n",
-"    HAVING topic IS NOT NULL\n",
-"    ORDER BY 2 DESC\n",
-"\"\"\")\n",
-"\n",
-"print(\"\\nTopic Analysis (Last 30 Days):\")\n",
-"topic_analysis"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## 4. Visualization Example"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"source": [
-"import matplotlib.pyplot as plt\n",
-"\n",
-"# Get daily article counts\n",
-"daily_counts = querier.query(\"\"\"\n",
-"    SELECT \n",
-"        DATE_TRUNC('day', published_date) as date,\n",
-"        COUNT(*) as article_count\n",
-"    FROM articles\n",
-"    WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
-"    GROUP BY 1\n",
-"    ORDER BY 1\n",
-"\"\"\")\n",
-"\n",
-"# Plot\n",
-"plt.figure(figsize=(15, 6))\n",
-"plt.plot(daily_counts['date'], daily_counts['article_count'])\n",
-"plt.title('Daily Article Counts (Last 30 Days)')\n",
-"plt.xlabel('Date')\n",
-"plt.ylabel('Number of Articles')\n",
-"plt.grid(True)\n",
-"plt.xticks(rotation=45)\n",
-"plt.tight_layout()\n",
-"plt.show()"
-]
-}
-]
-}
Binary file not shown. (Before: 187 KiB; After: 154 KiB)
@@ -3,8 +3,8 @@ import pandas as pd
 from typing import Optional, List, Dict, Union, Any
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
-from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime, timezone
 import logging
 from string import Template
 
@@ -15,7 +15,7 @@ class S3BatchDownloader:
         "region": "${AWS_REGION}",
         "bucket": "${RSS_BUCKET_NAME}",
         "prefix": "${RSS_PREFIX}",
-        "max_workers": 10
+        "max_workers": os.cpu_count() or 10
     }
 
     def __init__(self, config_path: Optional[str] = None):
@@ -41,16 +41,14 @@ class S3BatchDownloader:
             template = Template(json.dumps(self.DEFAULT_CONFIG))
 
         env_vars = {
-            'AWS_REGION': os.getenv('AWS_REGION', 'eu-west-3'),
-            'RSS_BUCKET_NAME': os.getenv('RSS_BUCKET_NAME', 'your-bucket'),
-            'RSS_PREFIX': os.getenv('RSS_PREFIX', 'articles/'),
+            'AWS_REGION': os.getenv('AWS_REGION', 'us-east-1'),
+            'RSS_BUCKET_NAME': os.getenv('S3_BUCKET_NAME')
         }
 
         config_str = template.safe_substitute(env_vars)
 
         try:
             config = json.loads(config_str)
-            # Ensure max_workers is an integer
             config['max_workers'] = int(config.get('max_workers', 10))
             return config
         except json.JSONDecodeError as e:
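For context, a minimal sketch of the templating mechanism this hunk touches: DEFAULT_CONFIG is serialized with ${...} placeholders, environment values are substituted, and the result is parsed back into a dict. Note that safe_substitute leaves unknown placeholders (such as ${RSS_PREFIX}, which the new env_vars no longer supplies) untouched in the string. The values below are placeholders, not the project's defaults:

    import json
    import os
    from string import Template

    # Same shape as S3BatchDownloader.DEFAULT_CONFIG, trimmed for illustration.
    DEFAULT_CONFIG = {"region": "${AWS_REGION}", "bucket": "${RSS_BUCKET_NAME}", "prefix": "${RSS_PREFIX}"}

    template = Template(json.dumps(DEFAULT_CONFIG))
    config_str = template.safe_substitute({
        "AWS_REGION": os.getenv("AWS_REGION", "us-east-1"),
        "RSS_BUCKET_NAME": os.getenv("S3_BUCKET_NAME", "your-bucket"),
    })
    config = json.loads(config_str)
    print(config)  # with nothing set: {'region': 'us-east-1', 'bucket': 'your-bucket', 'prefix': '${RSS_PREFIX}'}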
@@ -60,84 +58,65 @@ class S3BatchDownloader:
         """Validate the configuration"""
         required_fields = ['region', 'bucket', 'prefix']
         missing_fields = [field for field in required_fields if field not in self.config]
 
         if missing_fields:
             raise ValueError(f"Missing required config fields: {', '.join(missing_fields)}")
 
-    def download_to_csv(self,
+    def download_to_file(self,
                         output_path: str,
-                        prefix: Optional[str] = None,
+                        file_format: str = 'csv',
                         start_date: Optional[str] = None,
-                        end_date: Optional[str] = None,
-                        batch_size: int = 1000) -> str:
+                        end_date: Optional[str] = None) -> str:
         """
-        Download articles from S3 to CSV file
+        Download articles from S3 to a consolidated file
 
         Args:
-            output_path: Path to save CSV file
-            prefix: Optional S3 prefix filter
-            start_date: Optional start date filter (YYYY-MM-DD)
-            end_date: Optional end date filter (YYYY-MM-DD)
-            batch_size: Number of objects to process in each batch
+            output_path: Path to save the output file.
+            file_format: Format to save the file ('csv' or 'json').
+            start_date: Optional start date filter (YYYY-MM-DD).
+            end_date: Optional end date filter (YYYY-MM-DD).
 
         Returns:
-            Path to the saved CSV file
+            Path to the saved file.
         """
         self.logger.info(f"Starting batch download to {output_path}")
 
-        # Convert dates if provided
-        start_ts = datetime.strptime(start_date, '%Y-%m-%d') if start_date else None
-        end_ts = datetime.strptime(end_date, '%Y-%m-%D') if end_date else None
+        # Convert date strings to UTC datetime
+        start_ts = datetime.strptime(start_date, '%Y-%m-%d').replace(tzinfo=timezone.utc) if start_date else None
+        end_ts = datetime.strptime(end_date, '%Y-%m-%d').replace(tzinfo=timezone.utc) if end_date else None
 
-        # Get list of all objects
-        objects = self._list_objects(prefix)
+        # List and filter objects
+        objects = self._list_objects()
 
-        # Filter by date if specified
         if start_ts or end_ts:
             objects = [
                 obj for obj in objects
                 if self._is_in_date_range(obj['LastModified'], start_ts, end_ts)
             ]
 
         self.logger.info(f"Found {len(objects)} objects to process")
 
-        # Process in batches
+        # Download and merge data
         all_data = []
-        for i in range(0, len(objects), batch_size):
-            batch = objects[i:i + batch_size]
-            self.logger.info(f"Processing batch {i//batch_size + 1}/{(len(objects)-1)//batch_size + 1}")
 
-            # Download batch in parallel
         with ThreadPoolExecutor(max_workers=self.config['max_workers']) as executor:
-            results = list(executor.map(self._download_object, batch))
-            # Add successful downloads to results
-            for result in results:
+            future_to_obj = {executor.submit(self._download_object, obj): obj for obj in objects}
+            for future in as_completed(future_to_obj):
+                result = future.result()
                 if result is not None:
                     all_data.extend(result if isinstance(result, list) else [result])
 
-        # Convert to DataFrame and save
-        df = pd.DataFrame(all_data)
-        df.to_csv(output_path, index=False)
+        # Save to file
+        self._save_to_file(all_data, output_path, file_format)
+        self.logger.info(f"Successfully downloaded {len(all_data)} articles to {output_path}")
 
-        self.logger.info(f"Successfully downloaded {len(df)} articles to {output_path}")
         return output_path
 
-    def _list_objects(self, prefix: Optional[str] = None) -> List[Dict]:
+    def _list_objects(self) -> List[Dict]:
         """List objects in S3 bucket"""
         objects = []
-        paginator = self.s3.get_paginator('list_objects_v2')
+        paginator = self.s3.get_paginator('list_objects')
 
         try:
-            for page in paginator.paginate(
-                Bucket=self.config['bucket'],
-                Prefix=prefix or self.config['prefix']
-            ):
+            for page in paginator.paginate(Bucket=self.config['bucket']):
                 if 'Contents' in page:
                     objects.extend(page['Contents'])
 
             return objects
 
         except Exception as e:
             self.logger.error(f"Error listing objects: {str(e)}")
             raise
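The core speed-up in the hunk above is the switch from one executor.map() call per fixed-size batch to submitting every object up front and consuming results with as_completed(). A minimal, self-contained sketch of that pattern (the download function and object keys are stand-ins, not the project's API):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def download(key):
        # Stand-in for S3BatchDownloader._download_object: returns a list of records.
        return [{"key": key}]

    objects = ["a.json", "b.json", "c.json"]
    all_data = []

    with ThreadPoolExecutor(max_workers=4) as executor:
        # Submit everything at once; results are handled in completion order,
        # so one slow object no longer holds up an entire batch.
        future_to_obj = {executor.submit(download, obj): obj for obj in objects}
        for future in as_completed(future_to_obj):
            result = future.result()
            if result is not None:
                all_data.extend(result)

    print(len(all_data))  # 3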
@@ -145,41 +124,31 @@ class S3BatchDownloader:
     def _download_object(self, obj: Dict) -> Optional[Union[Dict, List[Dict]]]:
         """Download and parse single S3 object"""
         try:
-            response = self.s3.get_object(
-                Bucket=self.config['bucket'],
-                Key=obj['Key']
-            )
+            response = self.s3.get_object(Bucket=self.config['bucket'], Key=obj['Key'])
             content = response['Body'].read().decode('utf-8')
 
-            # Handle both single JSON objects and arrays
             data = json.loads(content)
-            return data if isinstance(data, list) else [data]
+            metadata = response.get('Metadata', {})
+            if isinstance(data, dict):
+                data.update(metadata)
+                return [data]
+            elif isinstance(data, list):
+                for item in data:
+                    item.update(metadata)
+                return data
         except Exception as e:
             self.logger.error(f"Error downloading {obj['Key']}: {str(e)}")
             return None
 
-    def _is_in_date_range(self,
-                          ts: datetime,
-                          start: Optional[datetime],
-                          end: Optional[datetime]) -> bool:
-        """Check if timestamp is within date range"""
-        if start and ts < start:
-            return False
-        if end and ts > end:
-            return False
-        return True
+    def _is_in_date_range(self, ts: datetime, start: Optional[datetime], end: Optional[datetime]) -> bool:
+        """Check if timestamp is within the date range"""
+        return (not start or ts >= start) and (not end or ts <= end)
 
-    def get_storage_stats(self) -> Dict[str, Union[int, float]]:
-        """
-        Get storage statistics
-
-        Returns:
-            Dict containing total objects, total size, etc.
-        """
-        objects = self._list_objects()
-        return {
-            'total_objects': len(objects),
-            'total_size_mb': sum(obj['Size'] for obj in objects) / (1024 * 1024),
-            'average_size_kb': sum(obj['Size'] for obj in objects) / len(objects) / 1024 if objects else 0
-        }
+    def _save_to_file(self, data: List[Dict], output_path: str, file_format: str) -> None:
+        """Save data to file"""
+        df = pd.DataFrame(data)
+        if file_format == 'csv':
+            df.to_csv(output_path, index=False)
+        elif file_format == 'json':
+            df.to_json(output_path, orient='records', lines=True)
+        else:
+            raise ValueError(f"Unsupported file format: {file_format}")
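The _download_object change above also starts merging each object's user-defined S3 metadata into the parsed records. A minimal sketch of that idea, separate from the class (bucket and key are placeholders; running it requires valid AWS credentials):

    import json
    import boto3

    s3 = boto3.client("s3")
    response = s3.get_object(Bucket="your-bucket", Key="articles/example.json")

    data = json.loads(response["Body"].read().decode("utf-8"))
    metadata = response.get("Metadata", {})  # user-defined x-amz-meta-* values

    # Copy the object's metadata onto every record, whether the file held one dict or a list.
    records = data if isinstance(data, list) else [data]
    for record in records:
        record.update(metadata)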
@@ -1,3 +0,0 @@
-from .querier import ArticleQuerier
-
-__all__ = ['ArticleQuerier']
@@ -1,175 +0,0 @@
-import boto3
-import pandas as pd
-from typing import Optional, Dict, List, Any
-import json
-import os
-from datetime import datetime
-from string import Template
-import logging
-
-class ArticleQuerier:
-    """Class for querying RSS articles using Amazon Athena"""
-
-    DEFAULT_CONFIG = {
-        "region": "${AWS_REGION}",
-        "database": "${RSS_DATABASE_NAME}",
-        "table": "${RSS_TABLE_NAME}",
-        "output_location": "s3://${RSS_BUCKET_NAME}/athena-output/"
-    }
-
-    def __init__(self, config_path: Optional[str] = None):
-        """
-        Initialize the ArticleQuerier
-
-        Args:
-            config_path: Optional path to config file. If None, uses environment variables.
-        """
-        self.logger = logging.getLogger(__name__)
-        self.config = self._load_config(config_path)
-        self._validate_config()
-
-        self.athena = boto3.client('athena', region_name=self.config['region'])
-        self.logger.info(f"Initialized ArticleQuerier with database: {self.config['database']}")
-
-    def _load_config(self, config_path: Optional[str]) -> Dict[str, str]:
-        """Load and process configuration"""
-        if config_path and os.path.exists(config_path):
-            with open(config_path) as f:
-                template = Template(f.read())
-        else:
-            template = Template(json.dumps(self.DEFAULT_CONFIG))
-
-        env_vars = {
-            'AWS_REGION': os.getenv('AWS_REGION', 'eu-west-3'),
-            'RSS_DATABASE_NAME': os.getenv('RSS_DATABASE_NAME', 'rss_articles'),
-            'RSS_TABLE_NAME': os.getenv('RSS_TABLE_NAME', 'articles'),
-            'RSS_BUCKET_NAME': os.getenv('RSS_BUCKET_NAME', 'your-bucket'),
-        }
-
-        config_str = template.safe_substitute(env_vars)
-
-        try:
-            return json.loads(config_str)
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Invalid JSON config after variable substitution: {str(e)}")
-
-    def _validate_config(self) -> None:
-        """Validate the configuration"""
-        required_fields = ['region', 'database', 'table', 'output_location']
-        missing_fields = [field for field in required_fields if field not in self.config]
-
-        if missing_fields:
-            raise ValueError(f"Missing required config fields: {', '.join(missing_fields)}")
-
-        if not self.config['output_location'].startswith('s3://'):
-            raise ValueError("output_location must be an S3 URL (s3://...)")
-
-    def search(self,
-               title: Optional[str] = None,
-               content: Optional[str] = None,
-               source: Optional[str] = None,
-               date_from: Optional[str] = None,
-               date_to: Optional[str] = None,
-               limit: int = 100) -> pd.DataFrame:
-        """
-        Search articles using various filters
-
-        Args:
-            title: Search in article titles
-            content: Search in article content
-            source: Filter by source
-            date_from: Start date (YYYY-MM-DD)
-            date_to: End date (YYYY-MM-DD)
-            limit: Maximum number of results
-
-        Returns:
-            DataFrame containing the results
-        """
-        conditions = []
-        if title:
-            conditions.append(f"LOWER(title) LIKE LOWER('%{title}%')")
-        if content:
-            conditions.append(f"LOWER(content) LIKE LOWER('%{content}%')")
-        if source:
-            conditions.append(f"source = '{source}'")
-        if date_from:
-            conditions.append(f"published_date >= TIMESTAMP '{date_from}'")
-        if date_to:
-            conditions.append(f"published_date <= TIMESTAMP '{date_to}'")
-
-        where_clause = " AND ".join(conditions) if conditions else "1=1"
-        query = f"""
-            SELECT *
-            FROM {self.config['database']}.{self.config['table']}
-            WHERE {where_clause}
-            ORDER BY published_date DESC
-            LIMIT {limit}
-        """
-
-        return self.query(query)
-
-    def query(self, query: str) -> pd.DataFrame:
-        """
-        Execute custom SQL query
-
-        Args:
-            query: SQL query string
-
-        Returns:
-            DataFrame containing the results
-        """
-        try:
-            self.logger.debug(f"Executing query: {query}")
-            response = self.athena.start_query_execution(
-                QueryString=query,
-                QueryExecutionContext={'Database': self.config['database']},
-                ResultConfiguration={'OutputLocation': self.config['output_location']}
-            )
-
-            return self._get_query_results(response['QueryExecutionId'])
-        except Exception as e:
-            self.logger.error(f"Query execution failed: {str(e)}")
-            raise
-
-    def get_sources(self) -> pd.DataFrame:
-        """
-        Get list of sources and their article counts
-
-        Returns:
-            DataFrame with source statistics
-        """
-        query = f"""
-            SELECT
-                source,
-                COUNT(*) as article_count,
-                MIN(published_date) as earliest_article,
-                MAX(published_date) as latest_article
-            FROM {self.config['database']}.{self.config['table']}
-            GROUP BY source
-            ORDER BY article_count DESC
-        """
-        return self.query(query)
-
-    def _get_query_results(self, query_id: str) -> pd.DataFrame:
-        """Helper method to get query results"""
-        while True:
-            status = self.athena.get_query_execution(QueryExecutionId=query_id)
-            state = status['QueryExecution']['Status']['State']
-
-            if state == 'SUCCEEDED':
-                break
-            elif state in ['FAILED', 'CANCELLED']:
-                error_message = status['QueryExecution']['Status'].get('StateChangeReason', 'Unknown error')
-                raise Exception(f"Query failed: {error_message}")
-
-        results = []
-        columns = None
-        paginator = self.athena.get_paginator('get_query_results')
-
-        for page in paginator.paginate(QueryExecutionId=query_id):
-            if not columns:
-                columns = [col['Name'] for col in page['ResultSet']['ResultSetMetadata']['ColumnInfo']]
-            for row in page['ResultSet']['Rows'][1:]:
-                results.append([field.get('VarCharValue', '') for field in row['Data']])
-
-        return pd.DataFrame(results, columns=columns)
todo.md (22 changed lines)
@@ -1,18 +1,20 @@
 # Before Public Launch
-* Rethink Partitioning Strategy [Done]
-* Implement New Partitioning Strategy. [Done]
-    - Incorporate: All the JSON data except for content and link.
-      Partitioning Strategy: {Year}/{Month}/{Day}/{article_id}
 * API Tool - to Pull data that you have down.
+    [x] Faster Batch Responses (Parallelization of requests.)
+    [x] Query API (Technical Feasibility / is S3 even good for this.)
+
+* Test out Vector Databases at Small Scale.
+* Test out Vector Databases at Scale.
+* Test out LLM Summarization At Small Scale.
+* Test out LLM Summarization At Scale.
+
+* Text Processing & Text Cleaning of Content during ingestion step.
+
+* Fix up ReadMe
+* Publish RoadMap.
 * Tips on where to gather RSS Feeds.
 * Public Launch Posts
     * Reddit
     * Twitter
     * Kaggle
-
-* Test out Vector Databases at Small Scale.
-* Test out Vector Databases at Scale.
-* Test out LLM Summarization At Small Scale.
-* Test out LLM Summarization At Scale.
-
-* Text Processing & Text Cleaning of Content during ingestion step.