Sped up batch ingestion of articles.

Charles-Gormley
2024-11-20 18:31:11 -05:00
parent f400c8bda5
commit 28148f1181
8 changed files with 289613 additions and 630 deletions
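This commit swaps the per-query ArticleQuerier workflow for a single S3 batch download. Distilled from the new notebook below, the core usage pattern is roughly this (a minimal sketch; it assumes the S3BatchDownloader constructor picks up its bucket and credentials from the environment, which is what the notebook's load_dotenv call suggests):

    from time import time
    from dotenv import load_dotenv
    from src.search.batch import S3BatchDownloader

    # Bucket name and AWS credentials are assumed to come from a .env file
    load_dotenv(override=True)
    downloader = S3BatchDownloader()

    start = time()
    # Consolidate every article in the date range into one local file
    downloader.download_to_file(
        output_path="consolidated_data.csv",
        file_format="csv",  # or "json"
        start_date="2024-11-17",
        end_date="2024-11-20",
    )
    print(f"Downloaded in {time() - start:.2f} seconds")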

View File

@@ -11,17 +11,33 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import pandas as pd\n",
"from datetime import datetime, timedelta\n",
"from src.search import S3BatchDownloader\n",
"from time import time\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Root imports\n",
"os.chdir('..')\n",
"\n",
"# Configure logging\n",
"import logging\n",
"logging.basicConfig(level=logging.INFO)"
"from src.search.batch import S3BatchDownloader\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv(override=True)\n",
"\n",
"downloader = S3BatchDownloader()\n",
"\n",
"os.chdir('example-notebooks')\n"
]
},
{
@@ -31,18 +47,226 @@
"## Initialize the Downloader"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"List permission: Allowed\n"
]
}
],
"source": [
"import boto3\n",
"from botocore.exceptions import ClientError\n",
"\n",
"s3 = boto3.client('s3')\n",
"\n",
"# Test ListObjects\n",
"try:\n",
" response = s3.list_objects_v2(Bucket='open-rss-articles-us-east-1')\n",
" print(\"List permission: Allowed\")\n",
"except ClientError as e:\n",
" print(\"List permission: Denied\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded data to consolidated_data.csv in 59.01 seconds\n"
]
}
],
"source": [
"# Optional: Set environment variables\n",
"os.environ['AWS_REGION'] = 'eu-west-3'\n",
"os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
"os.environ['RSS_PREFIX'] = 'articles/'\n",
"start = time()\n",
"output_path = \"consolidated_data.csv\" # or \"consolidated_data.json\"\n",
"\n",
"# Initialize downloader\n",
"downloader = S3BatchDownloader()"
"# Define date range\n",
"start_date = \"2024-11-17\"\n",
"end_date = \"2024-11-20\"\n",
"\n",
"# Start downloading\n",
"downloader.download_to_file(\n",
" output_path=output_path,\n",
" file_format=\"csv\", # or \"json\"\n",
" start_date=start_date,\n",
" end_date=end_date\n",
")\n",
"\n",
"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(output_path)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link</th>\n",
" <th>rss</th>\n",
" <th>title</th>\n",
" <th>content</th>\n",
" <th>unixTime</th>\n",
" <th>rss_id</th>\n",
" <th>article_id</th>\n",
" <th>llm_summary</th>\n",
" <th>embedding</th>\n",
" <th>unixtime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>https://www.cnet.com/home/internet/comporium-i...</td>\n",
" <td>https://www.cnet.com/rss/news/</td>\n",
" <td>Comporium Home Internet: Pricing, Speeds and A...</td>\n",
" <td>Unavailable in Provider unavailable in 90001 E...</td>\n",
" <td>1731883654</td>\n",
" <td>f5e6f52c79</td>\n",
" <td>00089f7505</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883654</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>https://kotaku.com/lady-gaga-wednesday-season-...</td>\n",
" <td>https://kotaku.com/rss</td>\n",
" <td>Lady Gaga Appearing In Wednesday Season 2 Is A...</td>\n",
<td>The next stop on Lady Gaga's years-long tour t...</td>
" <td>1731883665</td>\n",
" <td>a0840ab3b4</td>\n",
" <td>009f880a86</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>https://expeditionportal.com/classifieds-honda...</td>\n",
" <td>https://www.expeditionportal.com/feed/</td>\n",
" <td>2008 Honda Element EX ECamper :: Classifieds</td>\n",
" <td>Overlanding is about experiences first and for...</td>\n",
" <td>1731883665</td>\n",
" <td>ecdc66bb02</td>\n",
" <td>00ac77e95f</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>http://www.influencerupdate.biz/news/68624/twi...</td>\n",
" <td>https://www.influencerupdate.biz/rss/news/</td>\n",
" <td>Twitch is testing mid-roll ads on creator broa...</td>\n",
" <td>Streaming giant Twitch is planning to test mid...</td>\n",
" <td>1731883669</td>\n",
" <td>e34caba76d</td>\n",
" <td>0041bc4abf</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>https://thejournal.com/Articles/2024/10/29/Rep...</td>\n",
" <td>https://thejournal.com/rss-feeds/news.aspx</td>\n",
" <td>Report: 90% of Schools Depend on E-rate Fundin...</td>\n",
" <td>Report: 90% of Schools Depend on E-rate Fundin...</td>\n",
" <td>1731883616</td>\n",
" <td>efd9bb9654</td>\n",
" <td>000a3da3b6</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883616</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link \\\n",
"0 https://www.cnet.com/home/internet/comporium-i... \n",
"1 https://kotaku.com/lady-gaga-wednesday-season-... \n",
"2 https://expeditionportal.com/classifieds-honda... \n",
"3 http://www.influencerupdate.biz/news/68624/twi... \n",
"4 https://thejournal.com/Articles/2024/10/29/Rep... \n",
"\n",
" rss \\\n",
"0 https://www.cnet.com/rss/news/ \n",
"1 https://kotaku.com/rss \n",
"2 https://www.expeditionportal.com/feed/ \n",
"3 https://www.influencerupdate.biz/rss/news/ \n",
"4 https://thejournal.com/rss-feeds/news.aspx \n",
"\n",
" title \\\n",
"0 Comporium Home Internet: Pricing, Speeds and A... \n",
"1 Lady Gaga Appearing In Wednesday Season 2 Is A... \n",
"2 2008 Honda Element EX ECamper :: Classifieds \n",
"3 Twitch is testing mid-roll ads on creator broa... \n",
"4 Report: 90% of Schools Depend on E-rate Fundin... \n",
"\n",
" content unixTime rss_id \\\n",
"0 Unavailable in Provider unavailable in 90001 E... 1731883654 f5e6f52c79 \n",
"1 The next stop on Lady Gagas years-long tour t... 1731883665 a0840ab3b4 \n",
"2 Overlanding is about experiences first and for... 1731883665 ecdc66bb02 \n",
"3 Streaming giant Twitch is planning to test mid... 1731883669 e34caba76d \n",
"4 Report: 90% of Schools Depend on E-rate Fundin... 1731883616 efd9bb9654 \n",
"\n",
" article_id llm_summary embedding unixtime \n",
"0 00089f7505 NaN NaN 1731883654 \n",
"1 009f880a86 NaN NaN 1731883665 \n",
"2 00ac77e95f NaN NaN 1731883665 \n",
"3 0041bc4abf NaN NaN 1731883669 \n",
"4 000a3da3b6 NaN NaN 1731883616 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
@@ -51,150 +275,27 @@
"source": [
"## 1. Check Storage Statistics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Get storage stats\n",
"stats = downloader.get_storage_stats()\n",
"\n",
"print(f\"Total objects: {stats['total_objects']:,}\")\n",
"print(f\"Total size: {stats['total_size_mb']:.2f} MB\")\n",
"print(f\"Average object size: {stats['average_size_kb']:.2f} KB\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Download Examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Download last 7 days of articles\n",
"start_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')\n",
"\n",
"recent_articles_path = downloader.download_to_csv(\n",
" output_path='recent_articles.csv',\n",
" start_date=start_date\n",
")\n",
"\n",
"# Load and display sample\n",
"recent_df = pd.read_csv(recent_articles_path)\n",
"print(f\"\\nDownloaded {len(recent_df)} recent articles\")\n",
"recent_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Download articles from specific source prefix\n",
"tech_articles_path = downloader.download_to_csv(\n",
" output_path='tech_articles.csv',\n",
" prefix='articles/tech/'\n",
")\n",
"\n",
"tech_df = pd.read_csv(tech_articles_path)\n",
"print(f\"\\nDownloaded {len(tech_df)} tech articles\")\n",
"tech_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Analysis Examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Analyze downloaded data\n",
"if 'source' in tech_df.columns:\n",
" source_counts = tech_df['source'].value_counts()\n",
" \n",
" print(\"\\nArticles per Source:\")\n",
" print(source_counts)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Time-based analysis\n",
"if 'published_date' in tech_df.columns:\n",
" tech_df['published_date'] = pd.to_datetime(tech_df['published_date'])\n",
" daily_counts = tech_df.resample('D', on='published_date').size()\n",
" \n",
" # Plot\n",
" import matplotlib.pyplot as plt\n",
" \n",
" plt.figure(figsize=(15, 6))\n",
" daily_counts.plot(kind='bar')\n",
" plt.title('Articles per Day')\n",
" plt.xlabel('Date')\n",
" plt.ylabel('Number of Articles')\n",
" plt.xticks(rotation=45)\n",
" plt.tight_layout()\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Export Filtered Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Filter and export specific data\n",
"if 'title' in tech_df.columns and 'content' in tech_df.columns:\n",
" # Filter articles containing specific keywords\n",
" ai_articles = tech_df[\n",
" tech_df['title'].str.contains('AI|artificial intelligence', case=False, na=False) |\n",
" tech_df['content'].str.contains('AI|artificial intelligence', case=False, na=False)\n",
" ]\n",
" \n",
" # Export filtered data\n",
" ai_articles.to_csv('ai_articles.csv', index=False)\n",
" print(f\"\\nExported {len(ai_articles)} AI-related articles to ai_articles.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Cleanup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Optional: Remove downloaded CSV files\n",
"import os\n",
"\n",
"for file in ['recent_articles.csv', 'tech_articles.csv', 'ai_articles.csv']:\n",
" if os.path.exists(file):\n",
" os.remove(file)\n",
" print(f\"Removed {file}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
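The preflight cell in this notebook only probes list access (s3:ListBucket). Before a long batch run it may be worth confirming object-read access as well; a hypothetical extension, not part of this commit (HeadObject succeeds only when s3:GetObject is allowed):

    import boto3
    from botocore.exceptions import ClientError

    s3 = boto3.client('s3')
    bucket = 'open-rss-articles-us-east-1'

    try:
        listing = s3.list_objects_v2(Bucket=bucket, MaxKeys=1)
        print("List permission: Allowed")
        for obj in listing.get('Contents', []):
            # HEAD the first key to verify read access without downloading it
            s3.head_object(Bucket=bucket, Key=obj['Key'])
            print("Read permission: Allowed")
    except ClientError as e:
        print(f"Permission check failed: {e.response['Error']['Code']}")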

File diff suppressed because one or more lines are too long

View File

@@ -1,199 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RSS Article Query Examples\n",
"\n",
"This notebook demonstrates how to query RSS articles using the ArticleQuerier class."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"import os\n",
"import pandas as pd\n",
"from datetime import datetime, timedelta\n",
"from src.search import ArticleQuerier\n",
"\n",
"# Configure logging if needed\n",
"import logging\n",
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize the Querier\n",
"\n",
"You can either set environment variables or use default values:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Optional: Set environment variables\n",
"os.environ['AWS_REGION'] = 'eu-west-3'\n",
"os.environ['RSS_DATABASE_NAME'] = 'rss_articles'\n",
"os.environ['RSS_TABLE_NAME'] = 'articles'\n",
"os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
"\n",
"# Initialize querier\n",
"querier = ArticleQuerier()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Basic Source Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Get all sources and their article counts\n",
"sources = querier.get_sources()\n",
"\n",
"# Display top sources\n",
"print(\"Top Sources by Article Count:\")\n",
"sources.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Search Examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Search articles containing 'python' in title\n",
"python_articles = querier.search(title=\"python\", limit=5)\n",
"print(\"\\nArticles about Python:\")\n",
"python_articles[['title', 'source', 'published_date']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Search with multiple filters\n",
"# Get recent AWS articles from specific source\n",
"filtered_articles = querier.search(\n",
" content=\"aws\",\n",
" source=\"techcrunch\",\n",
" date_from=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'),\n",
" limit=5\n",
")\n",
"\n",
"print(\"\\nRecent AWS articles from TechCrunch:\")\n",
"filtered_articles[['title', 'published_date', 'url']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Custom SQL Queries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Articles per month by source\n",
"monthly_stats = querier.query(\"\"\"\n",
" SELECT \n",
" source,\n",
" DATE_TRUNC('month', published_date) as month,\n",
" COUNT(*) as article_count\n",
" FROM articles\n",
" WHERE published_date >= CURRENT_DATE - INTERVAL '6' MONTH\n",
" GROUP BY 1, 2\n",
" ORDER BY 2 DESC, 3 DESC\n",
"\"\"\")\n",
"\n",
"print(\"\\nMonthly Article Counts:\")\n",
"monthly_stats.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Topic analysis (searching for specific keywords)\n",
"topic_analysis = querier.query(\"\"\"\n",
" SELECT \n",
" CASE\n",
" WHEN LOWER(title) LIKE '%python%' THEN 'Python'\n",
" WHEN LOWER(title) LIKE '%javascript%' OR LOWER(title) LIKE '%js%' THEN 'JavaScript'\n",
" WHEN LOWER(title) LIKE '%aws%' THEN 'AWS'\n",
" WHEN LOWER(title) LIKE '%ai%' OR LOWER(title) LIKE '%artificial intelligence%' THEN 'AI'\n",
" END as topic,\n",
" COUNT(*) as article_count\n",
" FROM articles\n",
" WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
" GROUP BY 1\n",
" HAVING topic IS NOT NULL\n",
" ORDER BY 2 DESC\n",
"\"\"\")\n",
"\n",
"print(\"\\nTopic Analysis (Last 30 Days):\")\n",
"topic_analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Visualization Example"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Get daily article counts\n",
"daily_counts = querier.query(\"\"\"\n",
" SELECT \n",
" DATE_TRUNC('day', published_date) as date,\n",
" COUNT(*) as article_count\n",
" FROM articles\n",
" WHERE published_date >= CURRENT_DATE - INTERVAL '30' DAY\n",
" GROUP BY 1\n",
" ORDER BY 1\n",
"\"\"\")\n",
"\n",
"# Plot\n",
"plt.figure(figsize=(15, 6))\n",
"plt.plot(daily_counts['date'], daily_counts['article_count'])\n",
"plt.title('Daily Article Counts (Last 30 Days)')\n",
"plt.xlabel('Date')\n",
"plt.ylabel('Number of Articles')\n",
"plt.grid(True)\n",
"plt.xticks(rotation=45)\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
]
}
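The deleted notebook's topic-analysis SQL has no direct replacement in this commit, but an equivalent breakdown can be computed locally on the consolidated CSV. A rough pandas sketch (column names taken from the df.head() output above; the word-boundary regexes are an assumption, and stricter than the original LIKE '%ai%' patterns, which also matched substrings):

    import pandas as pd

    df = pd.read_csv("consolidated_data.csv")
    df["published"] = pd.to_datetime(df["unixTime"], unit="s")

    # Restrict to the trailing 30 days, mirroring the SQL WHERE clause
    recent = df[df["published"] >= df["published"].max() - pd.Timedelta(days=30)]

    topics = {
        "Python": r"\bpython\b",
        "JavaScript": r"\b(?:javascript|js)\b",
        "AWS": r"\baws\b",
        "AI": r"\bai\b|artificial intelligence",
    }
    for topic, pattern in topics.items():
        count = recent["title"].str.contains(pattern, case=False, na=False).sum()
        print(f"{topic}: {count}")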