Sped up batch ingestion of articles.

Charles-Gormley
2024-11-20 18:31:11 -05:00
parent f400c8bda5
commit 28148f1181
8 changed files with 289613 additions and 630 deletions
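This commit swaps the per-query ArticleQuerier workflow for a single S3 batch download. Distilled from the new notebook below, the core usage pattern is roughly this (a minimal sketch; it assumes the S3BatchDownloader constructor picks up its bucket and credentials from the environment, which is what the notebook's load_dotenv call suggests):

    from time import time
    from dotenv import load_dotenv
    from src.search.batch import S3BatchDownloader

    # Bucket name and AWS credentials are assumed to come from a .env file
    load_dotenv(override=True)
    downloader = S3BatchDownloader()

    start = time()
    # Consolidate every article in the date range into one local file
    downloader.download_to_file(
        output_path="consolidated_data.csv",
        file_format="csv",  # or "json"
        start_date="2024-11-17",
        end_date="2024-11-20",
    )
    print(f"Downloaded in {time() - start:.2f} seconds")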

View File

@@ -11,17 +11,33 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import pandas as pd\n",
"from datetime import datetime, timedelta\n",
"from src.search import S3BatchDownloader\n",
"from time import time\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Root imports\n",
"os.chdir('..')\n",
"\n",
"# Configure logging\n",
"import logging\n",
"logging.basicConfig(level=logging.INFO)"
"from src.search.batch import S3BatchDownloader\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv(override=True)\n",
"\n",
"downloader = S3BatchDownloader()\n",
"\n",
"os.chdir('example-notebooks')\n"
]
},
{
@@ -31,18 +47,226 @@
"## Initialize the Downloader"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"List permission: Allowed\n"
]
}
],
"source": [
"import boto3\n",
"from botocore.exceptions import ClientError\n",
"\n",
"s3 = boto3.client('s3')\n",
"\n",
"# Test ListObjects\n",
"try:\n",
" response = s3.list_objects_v2(Bucket='open-rss-articles-us-east-1')\n",
" print(\"List permission: Allowed\")\n",
"except ClientError as e:\n",
" print(\"List permission: Denied\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded data to consolidated_data.csv in 59.01 seconds\n"
]
}
],
"source": [
"# Optional: Set environment variables\n",
"os.environ['AWS_REGION'] = 'eu-west-3'\n",
"os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
"os.environ['RSS_PREFIX'] = 'articles/'\n",
"start = time()\n",
"output_path = \"consolidated_data.csv\" # or \"consolidated_data.json\"\n",
"\n",
"# Initialize downloader\n",
"downloader = S3BatchDownloader()"
"# Define date range\n",
"start_date = \"2024-11-17\"\n",
"end_date = \"2024-11-20\"\n",
"\n",
"# Start downloading\n",
"downloader.download_to_file(\n",
" output_path=output_path,\n",
" file_format=\"csv\", # or \"json\"\n",
" start_date=start_date,\n",
" end_date=end_date\n",
")\n",
"\n",
"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(output_path)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link</th>\n",
" <th>rss</th>\n",
" <th>title</th>\n",
" <th>content</th>\n",
" <th>unixTime</th>\n",
" <th>rss_id</th>\n",
" <th>article_id</th>\n",
" <th>llm_summary</th>\n",
" <th>embedding</th>\n",
" <th>unixtime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>https://www.cnet.com/home/internet/comporium-i...</td>\n",
" <td>https://www.cnet.com/rss/news/</td>\n",
" <td>Comporium Home Internet: Pricing, Speeds and A...</td>\n",
" <td>Unavailable in Provider unavailable in 90001 E...</td>\n",
" <td>1731883654</td>\n",
" <td>f5e6f52c79</td>\n",
" <td>00089f7505</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883654</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>https://kotaku.com/lady-gaga-wednesday-season-...</td>\n",
" <td>https://kotaku.com/rss</td>\n",
" <td>Lady Gaga Appearing In Wednesday Season 2 Is A...</td>\n",
<td>The next stop on Lady Gaga's years-long tour t...</td>
" <td>1731883665</td>\n",
" <td>a0840ab3b4</td>\n",
" <td>009f880a86</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>https://expeditionportal.com/classifieds-honda...</td>\n",
" <td>https://www.expeditionportal.com/feed/</td>\n",
" <td>2008 Honda Element EX ECamper :: Classifieds</td>\n",
" <td>Overlanding is about experiences first and for...</td>\n",
" <td>1731883665</td>\n",
" <td>ecdc66bb02</td>\n",
" <td>00ac77e95f</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>http://www.influencerupdate.biz/news/68624/twi...</td>\n",
" <td>https://www.influencerupdate.biz/rss/news/</td>\n",
" <td>Twitch is testing mid-roll ads on creator broa...</td>\n",
" <td>Streaming giant Twitch is planning to test mid...</td>\n",
" <td>1731883669</td>\n",
" <td>e34caba76d</td>\n",
" <td>0041bc4abf</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>https://thejournal.com/Articles/2024/10/29/Rep...</td>\n",
" <td>https://thejournal.com/rss-feeds/news.aspx</td>\n",
" <td>Report: 90% of Schools Depend on E-rate Fundin...</td>\n",
" <td>Report: 90% of Schools Depend on E-rate Fundin...</td>\n",
" <td>1731883616</td>\n",
" <td>efd9bb9654</td>\n",
" <td>000a3da3b6</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883616</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link \\\n",
"0 https://www.cnet.com/home/internet/comporium-i... \n",
"1 https://kotaku.com/lady-gaga-wednesday-season-... \n",
"2 https://expeditionportal.com/classifieds-honda... \n",
"3 http://www.influencerupdate.biz/news/68624/twi... \n",
"4 https://thejournal.com/Articles/2024/10/29/Rep... \n",
"\n",
" rss \\\n",
"0 https://www.cnet.com/rss/news/ \n",
"1 https://kotaku.com/rss \n",
"2 https://www.expeditionportal.com/feed/ \n",
"3 https://www.influencerupdate.biz/rss/news/ \n",
"4 https://thejournal.com/rss-feeds/news.aspx \n",
"\n",
" title \\\n",
"0 Comporium Home Internet: Pricing, Speeds and A... \n",
"1 Lady Gaga Appearing In Wednesday Season 2 Is A... \n",
"2 2008 Honda Element EX ECamper :: Classifieds \n",
"3 Twitch is testing mid-roll ads on creator broa... \n",
"4 Report: 90% of Schools Depend on E-rate Fundin... \n",
"\n",
" content unixTime rss_id \\\n",
"0 Unavailable in Provider unavailable in 90001 E... 1731883654 f5e6f52c79 \n",
"1 The next stop on Lady Gagas years-long tour t... 1731883665 a0840ab3b4 \n",
"2 Overlanding is about experiences first and for... 1731883665 ecdc66bb02 \n",
"3 Streaming giant Twitch is planning to test mid... 1731883669 e34caba76d \n",
"4 Report: 90% of Schools Depend on E-rate Fundin... 1731883616 efd9bb9654 \n",
"\n",
" article_id llm_summary embedding unixtime \n",
"0 00089f7505 NaN NaN 1731883654 \n",
"1 009f880a86 NaN NaN 1731883665 \n",
"2 00ac77e95f NaN NaN 1731883665 \n",
"3 0041bc4abf NaN NaN 1731883669 \n",
"4 000a3da3b6 NaN NaN 1731883616 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
@@ -51,150 +275,27 @@
"source": [
"## 1. Check Storage Statistics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Get storage stats\n",
"stats = downloader.get_storage_stats()\n",
"\n",
"print(f\"Total objects: {stats['total_objects']:,}\")\n",
"print(f\"Total size: {stats['total_size_mb']:.2f} MB\")\n",
"print(f\"Average object size: {stats['average_size_kb']:.2f} KB\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Download Examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Download last 7 days of articles\n",
"start_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')\n",
"\n",
"recent_articles_path = downloader.download_to_csv(\n",
" output_path='recent_articles.csv',\n",
" start_date=start_date\n",
")\n",
"\n",
"# Load and display sample\n",
"recent_df = pd.read_csv(recent_articles_path)\n",
"print(f\"\\nDownloaded {len(recent_df)} recent articles\")\n",
"recent_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Download articles from specific source prefix\n",
"tech_articles_path = downloader.download_to_csv(\n",
" output_path='tech_articles.csv',\n",
" prefix='articles/tech/'\n",
")\n",
"\n",
"tech_df = pd.read_csv(tech_articles_path)\n",
"print(f\"\\nDownloaded {len(tech_df)} tech articles\")\n",
"tech_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Analysis Examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Analyze downloaded data\n",
"if 'source' in tech_df.columns:\n",
" source_counts = tech_df['source'].value_counts()\n",
" \n",
" print(\"\\nArticles per Source:\")\n",
" print(source_counts)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Time-based analysis\n",
"if 'published_date' in tech_df.columns:\n",
" tech_df['published_date'] = pd.to_datetime(tech_df['published_date'])\n",
" daily_counts = tech_df.resample('D', on='published_date').size()\n",
" \n",
" # Plot\n",
" import matplotlib.pyplot as plt\n",
" \n",
" plt.figure(figsize=(15, 6))\n",
" daily_counts.plot(kind='bar')\n",
" plt.title('Articles per Day')\n",
" plt.xlabel('Date')\n",
" plt.ylabel('Number of Articles')\n",
" plt.xticks(rotation=45)\n",
" plt.tight_layout()\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Export Filtered Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Filter and export specific data\n",
"if 'title' in tech_df.columns and 'content' in tech_df.columns:\n",
" # Filter articles containing specific keywords\n",
" ai_articles = tech_df[\n",
" tech_df['title'].str.contains('AI|artificial intelligence', case=False, na=False) |\n",
" tech_df['content'].str.contains('AI|artificial intelligence', case=False, na=False)\n",
" ]\n",
" \n",
" # Export filtered data\n",
" ai_articles.to_csv('ai_articles.csv', index=False)\n",
" print(f\"\\nExported {len(ai_articles)} AI-related articles to ai_articles.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Cleanup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Optional: Remove downloaded CSV files\n",
"import os\n",
"\n",
"for file in ['recent_articles.csv', 'tech_articles.csv', 'ai_articles.csv']:\n",
" if os.path.exists(file):\n",
" os.remove(file)\n",
" print(f\"Removed {file}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
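The preflight cell in this notebook only probes list access (s3:ListBucket). Before a long batch run it may be worth confirming object-read access as well; a hypothetical extension, not part of this commit (HeadObject succeeds only when s3:GetObject is allowed):

    import boto3
    from botocore.exceptions import ClientError

    s3 = boto3.client('s3')
    bucket = 'open-rss-articles-us-east-1'

    try:
        listing = s3.list_objects_v2(Bucket=bucket, MaxKeys=1)
        print("List permission: Allowed")
        for obj in listing.get('Contents', []):
            # HEAD the first key to verify read access without downloading it
            s3.head_object(Bucket=bucket, Key=obj['Key'])
            print("Read permission: Allowed")
    except ClientError as e:
        print(f"Permission check failed: {e.response['Error']['Code']}")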

File diff suppressed because one or more lines are too long

View File

@@ -1,199 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RSS Article Query Examples\n",
"\n",
"This notebook demonstrates how to query RSS articles using the ArticleQuerier class."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"import os\n",
"import pandas as pd\n",
"from datetime import datetime, timedelta\n",
"from src.search import ArticleQuerier\n",
"\n",
"# Configure logging if needed\n",
"import logging\n",
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize the Querier\n",
"\n",
"You can either set environment variables or use default values:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Optional: Set environment variables\n",
"os.environ['AWS_REGION'] = 'eu-west-3'\n",
"os.environ['RSS_DATABASE_NAME'] = 'rss_articles'\n",
"os.environ['RSS_TABLE_NAME'] = 'articles'\n",
"os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
"\n",
"# Initialize querier\n",
"querier = ArticleQuerier()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Basic Source Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Get all sources and their article counts\n",
"sources = querier.get_sources()\n",
"\n",
"# Display top sources\n",
"print(\"Top Sources by Article Count:\")\n",
"sources.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Search Examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Search articles containing 'python' in title\n",
"python_articles = querier.search(title=\"python\", limit=5)\n",
"print(\"\\nArticles about Python:\")\n",
"python_articles[['title', 'source', 'published_date']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Search with multiple filters\n",
"# Get recent AWS articles from specific source\n",
"filtered_articles = querier.search(\n",
" content=\"aws\",\n",
" source=\"techcrunch\",\n",
" date_from=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'),\n",
" limit=5\n",
")\n",
"\n",
"print(\"\\nRecent AWS articles from TechCrunch:\")\n",
"filtered_articles[['title', 'published_date', 'url']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Custom SQL Queries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Articles per month by source\n",
"monthly_stats = querier.query(\"\"\"\n",
" SELECT \n",
" source,\n",
" DATE_TRUNC('month', published_date) as month,\n",
" COUNT(*) as article_count\n",
" FROM articles\n",
" WHERE published_date >= CURRENT_DATE - INTERVAL '6' MONTH\n",
" GROUP BY 1, 2\n",
" ORDER BY 2 DESC, 3 DESC\n",
"\"\"\")\n",
"\n",
"print(\"\\nMonthly Article Counts:\")\n",
"monthly_stats.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Topic analysis (searching for specific keywords)\n",
"topic_analysis = querier.query(\"\"\"\n",
" SELECT \n",
" CASE\n",
" WHEN LOWER(title) LIKE '%python%' THEN 'Python'\n",
" WHEN LOWER(title) LIKE '%javascript%' OR LOWER(title) LIKE '%js%' THEN 'JavaScript'\n",
" WHEN LOWER(title) LIKE '%aws%' THEN 'AWS'\n",
" WHEN LOWER(title) LIKE '%ai%' OR LOWER(title) LIKE '%artificial intelligence%' THEN 'AI'\n",
" END as topic,\n",
" COUNT(*) as article_count\n",
" FROM articles\n",
" WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
" GROUP BY 1\n",
" HAVING topic IS NOT NULL\n",
" ORDER BY 2 DESC\n",
"\"\"\")\n",
"\n",
"print(\"\\nTopic Analysis (Last 30 Days):\")\n",
"topic_analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Visualization Example"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Get daily article counts\n",
"daily_counts = querier.query(\"\"\"\n",
" SELECT \n",
" DATE_TRUNC('day', published_date) as date,\n",
" COUNT(*) as article_count\n",
" FROM articles\n",
" WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
" GROUP BY 1\n",
" ORDER BY 1\n",
"\"\"\")\n",
"\n",
"# Plot\n",
"plt.figure(figsize=(15, 6))\n",
"plt.plot(daily_counts['date'], daily_counts['article_count'])\n",
"plt.title('Daily Article Counts (Last 30 Days)')\n",
"plt.xlabel('Date')\n",
"plt.ylabel('Number of Articles')\n",
"plt.grid(True)\n",
"plt.xticks(rotation=45)\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
]
}
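The deleted notebook's topic-analysis SQL has no direct replacement in this commit, but an equivalent breakdown can be computed locally on the consolidated CSV. A rough pandas sketch (column names taken from the df.head() output above; the word-boundary regexes are an assumption, and stricter than the original LIKE '%ai%' patterns, which also matched substrings):

    import pandas as pd

    df = pd.read_csv("consolidated_data.csv")
    df["published"] = pd.to_datetime(df["unixTime"], unit="s")

    # Restrict to the trailing 30 days, mirroring the SQL WHERE clause
    recent = df[df["published"] >= df["published"].max() - pd.Timedelta(days=30)]

    topics = {
        "Python": r"\bpython\b",
        "JavaScript": r"\b(?:javascript|js)\b",
        "AWS": r"\baws\b",
        "AI": r"\bai\b|artificial intelligence",
    }
    for topic, pattern in topics.items():
        count = recent["title"].str.contains(pattern, case=False, na=False).sum()
        print(f"{topic}: {count}")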