Sped up batch ingestion of articles.

Author: Charles-Gormley
Date:   2024-11-20 18:31:11 -05:00
Parent: f400c8bda5
Commit: 28148f1181

8 changed files with 289,613 additions and 630 deletions

@@ -11,17 +11,33 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import pandas as pd\n",
"from datetime import datetime, timedelta\n",
"from src.search import S3BatchDownloader\n",
"from time import time\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Root imports\n",
"os.chdir('..')\n",
"\n",
"# Configure logging\n",
"import logging\n",
"logging.basicConfig(level=logging.INFO)"
"from src.search.batch import S3BatchDownloader\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv(override=True)\n",
"\n",
"downloader = S3BatchDownloader()\n",
"\n",
"os.chdir('example-notebooks')\n"
]
},
{
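
The new setup cell loads configuration from a .env file via load_dotenv before constructing S3BatchDownloader. The variable names AWS_REGION, RSS_BUCKET_NAME, and RSS_PREFIX appear in the prior revision of this notebook; a minimal sketch that checks they are present after loading, assuming the downloader still reads those exact names:

import os
from dotenv import load_dotenv

load_dotenv(override=True)

# Variable names taken from the prior revision of this notebook; whether
# S3BatchDownloader still reads these exact names is an assumption.
for var in ("AWS_REGION", "RSS_BUCKET_NAME", "RSS_PREFIX"):
    print(f"{var}: {'set' if os.getenv(var) else 'MISSING'}")
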
@@ -31,18 +47,226 @@
"## Initialize the Downloader"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"List permission: Allowed\n"
]
}
],
"source": [
"import boto3\n",
"from botocore.exceptions import ClientError\n",
"\n",
"s3 = boto3.client('s3')\n",
"\n",
"# Test ListObjects\n",
"try:\n",
" response = s3.list_objects_v2(Bucket='open-rss-articles-us-east-1')\n",
" print(\"List permission: Allowed\")\n",
"except ClientError as e:\n",
" print(\"List permission: Denied\")\n"
]
},
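
The cell above only verifies list permission. A hedged follow-up, assuming the same bucket and that it contains at least one object, could probe read access by fetching metadata for the first listed key:

import boto3
from botocore.exceptions import ClientError

s3 = boto3.client('s3')
bucket = 'open-rss-articles-us-east-1'

try:
    listing = s3.list_objects_v2(Bucket=bucket, MaxKeys=1)
    key = listing['Contents'][0]['Key']  # assumes the bucket is non-empty
    s3.head_object(Bucket=bucket, Key=key)  # metadata-only read probe
    print("Read permission: Allowed")
except ClientError:
    print("Read permission: Denied")
except KeyError:
    print("Bucket is empty; read permission untested")
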
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded data to consolidated_data.csv in 59.01 seconds\n"
]
}
],
"source": [
"# Optional: Set environment variables\n",
"os.environ['AWS_REGION'] = 'eu-west-3'\n",
"os.environ['RSS_BUCKET_NAME'] = 'your-bucket'\n",
"os.environ['RSS_PREFIX'] = 'articles/'\n",
"start = time()\n",
"output_path = \"consolidated_data.csv\" # or \"consolidated_data.json\"\n",
"\n",
"# Initialize downloader\n",
"downloader = S3BatchDownloader()"
"# Define date range\n",
"start_date = \"2024-11-17\"\n",
"end_date = \"2024-11-20\"\n",
"\n",
"# Start downloading\n",
"downloader.download_to_file(\n",
" output_path=output_path,\n",
" file_format=\"csv\", # or \"json\"\n",
" start_date=start_date,\n",
" end_date=end_date\n",
")\n",
"\n",
"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
]
},
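
The unixTime column visible in df.head() below looks like epoch seconds, so a quick sanity check on the downloaded window — a sketch assuming that interpretation — could be:

import pandas as pd

df = pd.read_csv("consolidated_data.csv")

# Assumes unixTime holds epoch seconds (values like 1731883654 suggest so).
timestamps = pd.to_datetime(df["unixTime"], unit="s")
print(f"{len(df):,} rows spanning {timestamps.min()} to {timestamps.max()}")
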
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(output_path)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link</th>\n",
" <th>rss</th>\n",
" <th>title</th>\n",
" <th>content</th>\n",
" <th>unixTime</th>\n",
" <th>rss_id</th>\n",
" <th>article_id</th>\n",
" <th>llm_summary</th>\n",
" <th>embedding</th>\n",
" <th>unixtime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>https://www.cnet.com/home/internet/comporium-i...</td>\n",
" <td>https://www.cnet.com/rss/news/</td>\n",
" <td>Comporium Home Internet: Pricing, Speeds and A...</td>\n",
" <td>Unavailable in Provider unavailable in 90001 E...</td>\n",
" <td>1731883654</td>\n",
" <td>f5e6f52c79</td>\n",
" <td>00089f7505</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883654</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>https://kotaku.com/lady-gaga-wednesday-season-...</td>\n",
" <td>https://kotaku.com/rss</td>\n",
" <td>Lady Gaga Appearing In Wednesday Season 2 Is A...</td>\n",
" <td>The next stop on Lady Gagas years-long tour t...</td>\n",
" <td>1731883665</td>\n",
" <td>a0840ab3b4</td>\n",
" <td>009f880a86</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>https://expeditionportal.com/classifieds-honda...</td>\n",
" <td>https://www.expeditionportal.com/feed/</td>\n",
" <td>2008 Honda Element EX ECamper :: Classifieds</td>\n",
" <td>Overlanding is about experiences first and for...</td>\n",
" <td>1731883665</td>\n",
" <td>ecdc66bb02</td>\n",
" <td>00ac77e95f</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>http://www.influencerupdate.biz/news/68624/twi...</td>\n",
" <td>https://www.influencerupdate.biz/rss/news/</td>\n",
" <td>Twitch is testing mid-roll ads on creator broa...</td>\n",
" <td>Streaming giant Twitch is planning to test mid...</td>\n",
" <td>1731883669</td>\n",
" <td>e34caba76d</td>\n",
" <td>0041bc4abf</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>https://thejournal.com/Articles/2024/10/29/Rep...</td>\n",
" <td>https://thejournal.com/rss-feeds/news.aspx</td>\n",
" <td>Report: 90% of Schools Depend on E-rate Fundin...</td>\n",
" <td>Report: 90% of Schools Depend on E-rate Fundin...</td>\n",
" <td>1731883616</td>\n",
" <td>efd9bb9654</td>\n",
" <td>000a3da3b6</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1731883616</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link \\\n",
"0 https://www.cnet.com/home/internet/comporium-i... \n",
"1 https://kotaku.com/lady-gaga-wednesday-season-... \n",
"2 https://expeditionportal.com/classifieds-honda... \n",
"3 http://www.influencerupdate.biz/news/68624/twi... \n",
"4 https://thejournal.com/Articles/2024/10/29/Rep... \n",
"\n",
" rss \\\n",
"0 https://www.cnet.com/rss/news/ \n",
"1 https://kotaku.com/rss \n",
"2 https://www.expeditionportal.com/feed/ \n",
"3 https://www.influencerupdate.biz/rss/news/ \n",
"4 https://thejournal.com/rss-feeds/news.aspx \n",
"\n",
" title \\\n",
"0 Comporium Home Internet: Pricing, Speeds and A... \n",
"1 Lady Gaga Appearing In Wednesday Season 2 Is A... \n",
"2 2008 Honda Element EX ECamper :: Classifieds \n",
"3 Twitch is testing mid-roll ads on creator broa... \n",
"4 Report: 90% of Schools Depend on E-rate Fundin... \n",
"\n",
" content unixTime rss_id \\\n",
"0 Unavailable in Provider unavailable in 90001 E... 1731883654 f5e6f52c79 \n",
"1 The next stop on Lady Gagas years-long tour t... 1731883665 a0840ab3b4 \n",
"2 Overlanding is about experiences first and for... 1731883665 ecdc66bb02 \n",
"3 Streaming giant Twitch is planning to test mid... 1731883669 e34caba76d \n",
"4 Report: 90% of Schools Depend on E-rate Fundin... 1731883616 efd9bb9654 \n",
"\n",
" article_id llm_summary embedding unixtime \n",
"0 00089f7505 NaN NaN 1731883654 \n",
"1 009f880a86 NaN NaN 1731883665 \n",
"2 00ac77e95f NaN NaN 1731883665 \n",
"3 0041bc4abf NaN NaN 1731883669 \n",
"4 000a3da3b6 NaN NaN 1731883616 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
@@ -51,150 +275,27 @@
"source": [
"## 1. Check Storage Statistics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Get storage stats\n",
"stats = downloader.get_storage_stats()\n",
"\n",
"print(f\"Total objects: {stats['total_objects']:,}\")\n",
"print(f\"Total size: {stats['total_size_mb']:.2f} MB\")\n",
"print(f\"Average object size: {stats['average_size_kb']:.2f} KB\")"
]
},
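
For comparison, roughly equivalent statistics can be computed directly with boto3 — a sketch assuming the open-rss-articles-us-east-1 bucket from the permission check above (the exact aggregation inside get_storage_stats is internal to the library):

import boto3

s3 = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")

total_objects = 0
total_bytes = 0
# Paginate so buckets with more than 1,000 keys are fully counted.
for page in paginator.paginate(Bucket="open-rss-articles-us-east-1"):
    for obj in page.get("Contents", []):
        total_objects += 1
        total_bytes += obj["Size"]

print(f"Total objects: {total_objects:,}")
print(f"Total size: {total_bytes / 1024 / 1024:.2f} MB")
if total_objects:
    print(f"Average object size: {total_bytes / total_objects / 1024:.2f} KB")
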
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Download Examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Download last 7 days of articles\n",
"start_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')\n",
"\n",
"recent_articles_path = downloader.download_to_csv(\n",
" output_path='recent_articles.csv',\n",
" start_date=start_date\n",
")\n",
"\n",
"# Load and display sample\n",
"recent_df = pd.read_csv(recent_articles_path)\n",
"print(f\"\\nDownloaded {len(recent_df)} recent articles\")\n",
"recent_df.head()"
]
},
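
This revision replaces download_to_csv with download_to_file (see the added cell earlier in the diff), so the same last-7-days example would presumably translate as follows — a sketch assuming start_date and end_date accept YYYY-MM-DD strings, as shown above:

from datetime import datetime, timedelta

start_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
end_date = datetime.now().strftime('%Y-%m-%d')

downloader.download_to_file(
    output_path='recent_articles.csv',
    file_format='csv',   # or 'json'
    start_date=start_date,
    end_date=end_date,
)
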
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Download articles from specific source prefix\n",
"tech_articles_path = downloader.download_to_csv(\n",
" output_path='tech_articles.csv',\n",
" prefix='articles/tech/'\n",
")\n",
"\n",
"tech_df = pd.read_csv(tech_articles_path)\n",
"print(f\"\\nDownloaded {len(tech_df)} tech articles\")\n",
"tech_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Analysis Examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Analyze downloaded data\n",
"if 'source' in tech_df.columns:\n",
" source_counts = tech_df['source'].value_counts()\n",
" \n",
" print(\"\\nArticles per Source:\")\n",
" print(source_counts)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Time-based analysis\n",
"if 'published_date' in tech_df.columns:\n",
" tech_df['published_date'] = pd.to_datetime(tech_df['published_date'])\n",
" daily_counts = tech_df.resample('D', on='published_date').size()\n",
" \n",
" # Plot\n",
" import matplotlib.pyplot as plt\n",
" \n",
" plt.figure(figsize=(15, 6))\n",
" daily_counts.plot(kind='bar')\n",
" plt.title('Articles per Day')\n",
" plt.xlabel('Date')\n",
" plt.ylabel('Number of Articles')\n",
" plt.xticks(rotation=45)\n",
" plt.tight_layout()\n",
" plt.show()"
]
},
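
The consolidated CSV in this revision carries unixTime rather than published_date (per df.head() above), so the same daily breakdown — a sketch under that assumption — would be:

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('consolidated_data.csv')

# Assumes unixTime holds epoch seconds.
df['published_date'] = pd.to_datetime(df['unixTime'], unit='s')
daily_counts = df.resample('D', on='published_date').size()

daily_counts.plot(kind='bar', figsize=(15, 6))
plt.title('Articles per Day')
plt.xlabel('Date')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
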
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Export Filtered Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Filter and export specific data\n",
"if 'title' in tech_df.columns and 'content' in tech_df.columns:\n",
" # Filter articles containing specific keywords\n",
" ai_articles = tech_df[\n",
" tech_df['title'].str.contains('AI|artificial intelligence', case=False, na=False) |\n",
" tech_df['content'].str.contains('AI|artificial intelligence', case=False, na=False)\n",
" ]\n",
" \n",
" # Export filtered data\n",
" ai_articles.to_csv('ai_articles.csv', index=False)\n",
" print(f\"\\nExported {len(ai_articles)} AI-related articles to ai_articles.csv\")"
]
},
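
One caveat on the removed cell above: with case=False, the pattern 'AI' also matches substrings such as "said" or "maintain". A stricter variant with word boundaries — a sketch run against the consolidated CSV, whose title and content columns appear in df.head() above — avoids those false positives:

import pandas as pd

df = pd.read_csv('consolidated_data.csv')

# \b keeps 'AI' from matching inside words like 'said' or 'maintain'
# (likely substring hits under the original case-insensitive search).
pattern = r'\bAI\b|artificial intelligence'
mask = (
    df['title'].str.contains(pattern, case=False, na=False, regex=True)
    | df['content'].str.contains(pattern, case=False, na=False, regex=True)
)
df[mask].to_csv('ai_articles.csv', index=False)
print(f"Exported {mask.sum()} AI-related articles to ai_articles.csv")
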
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Cleanup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Optional: Remove downloaded CSV files\n",
"import os\n",
"\n",
"for file in ['recent_articles.csv', 'tech_articles.csv', 'ai_articles.csv']:\n",
" if os.path.exists(file):\n",
" os.remove(file)\n",
" print(f\"Removed {file}\")"
]
}
]
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}