Added logging for pinecone.

2026-02-09 00:04:27 +01:00 · 2024-12-15 11:47:26 -05:00
parent b3c9832fdb
commit 0e8c701ae5
6 changed files with 505 additions and 50 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,5 @@ repo_structure.txt
 *__pycache__*
 *feeds.json
 todo.md
-*.csv
+*.csv
+delete*
--- a/delete.py
+++ b/delete.py
@@ -1,33 +0,0 @@
-import requests
-import openai
-
-def mock_bank_api_call(data):
-    # Mock response from the bank API
-    bank_response = {
-        "status": "success",
-        "account_balance": 1000,
-        "currency": "USD"
-    }
-    return bank_response
-
-def process_data_with_openai(data):
-    # Call the mock bank API
-    bank_data = mock_bank_api_call(data)
-    
-    # Prepare the prompt for OpenAI API
-    prompt = f"Bank API returned the following data: {bank_data}. Process this data."
-
-    # Call the OpenAI API
-    openai.api_key = 'your-openai-api-key'
-    response = openai.Completion.create(
-        engine="davinci",
-        prompt=prompt,
-        max_tokens=50
-    )
-    
-    return response.choices[0].text.strip()
-
-# Example usage
-data = {"account_id": "12345"}
-result = process_data_with_openai(data)
-print(result)
--- a/monthly_ingestion.ipynb
+++ b/monthly_ingestion.ipynb
@@ -11,7 +11,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -22,7 +22,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -45,7 +45,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@@ -295,6 +295,470 @@
    "aggregated_df.to_csv(\"Aggregated_November.csv\", index=False)"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"Aggregated_November.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop columns which are all null values. \n",
+    "df = df.dropna(axis=1, how='all')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get null value percentage for each column\n",
+    "null_percent = df.isnull().mean().round(4) * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop rows where the content of the column is null\n",
+    "df = df.dropna(subset=['content'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "link          0.0\n",
+       "rss           0.0\n",
+       "title         0.2\n",
+       "content       0.0\n",
+       "unixTime      0.0\n",
+       "rss_id        0.0\n",
+       "article_id    0.0\n",
+       "unixtime      0.0\n",
+       "dtype: float64"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Get null value percentage for each column\n",
+    "null_percent = df.isnull().mean().round(4) * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.dropna(subset=['title'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get summary statistics for the dataframe\n",
+    "summary = df.describe(include='all')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# save df \n",
+    "df.to_csv(\"Aggregated_November.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>link</th>\n",
+       "      <th>rss</th>\n",
+       "      <th>title</th>\n",
+       "      <th>content</th>\n",
+       "      <th>unixTime</th>\n",
+       "      <th>rss_id</th>\n",
+       "      <th>article_id</th>\n",
+       "      <th>unixtime</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>130388</td>\n",
+       "      <td>130388</td>\n",
+       "      <td>130388</td>\n",
+       "      <td>130388</td>\n",
+       "      <td>1.303880e+05</td>\n",
+       "      <td>130388</td>\n",
+       "      <td>130388</td>\n",
+       "      <td>1.303880e+05</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>unique</th>\n",
+       "      <td>37817</td>\n",
+       "      <td>688</td>\n",
+       "      <td>37383</td>\n",
+       "      <td>37658</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>688</td>\n",
+       "      <td>37817</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>top</th>\n",
+       "      <td>https://www.npr.org/2022/11/04/1134434712/plan...</td>\n",
+       "      <td>https://blogs.unity3d.com/feed/</td>\n",
+       "      <td>TechPipeline</td>\n",
+       "      <td>\"Unity\", Unity logos, and other Unity trademar...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>b78e974545</td>\n",
+       "      <td>05dfb6bb11</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>freq</th>\n",
+       "      <td>11</td>\n",
+       "      <td>3918</td>\n",
+       "      <td>330</td>\n",
+       "      <td>3912</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>3918</td>\n",
+       "      <td>11</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.732375e+09</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.732375e+09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>3.214484e+05</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>3.214484e+05</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.731884e+09</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.731884e+09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.732085e+09</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.732085e+09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.732402e+09</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.732402e+09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.732632e+09</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.732632e+09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.732922e+09</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.732922e+09</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                     link  \\\n",
+       "count                                              130388   \n",
+       "unique                                              37817   \n",
+       "top     https://www.npr.org/2022/11/04/1134434712/plan...   \n",
+       "freq                                                   11   \n",
+       "mean                                                  NaN   \n",
+       "std                                                   NaN   \n",
+       "min                                                   NaN   \n",
+       "25%                                                   NaN   \n",
+       "50%                                                   NaN   \n",
+       "75%                                                   NaN   \n",
+       "max                                                   NaN   \n",
+       "\n",
+       "                                    rss         title  \\\n",
+       "count                            130388        130388   \n",
+       "unique                              688         37383   \n",
+       "top     https://blogs.unity3d.com/feed/  TechPipeline   \n",
+       "freq                               3918           330   \n",
+       "mean                                NaN           NaN   \n",
+       "std                                 NaN           NaN   \n",
+       "min                                 NaN           NaN   \n",
+       "25%                                 NaN           NaN   \n",
+       "50%                                 NaN           NaN   \n",
+       "75%                                 NaN           NaN   \n",
+       "max                                 NaN           NaN   \n",
+       "\n",
+       "                                                  content      unixTime  \\\n",
+       "count                                              130388  1.303880e+05   \n",
+       "unique                                              37658           NaN   \n",
+       "top     \"Unity\", Unity logos, and other Unity trademar...           NaN   \n",
+       "freq                                                 3912           NaN   \n",
+       "mean                                                  NaN  1.732375e+09   \n",
+       "std                                                   NaN  3.214484e+05   \n",
+       "min                                                   NaN  1.731884e+09   \n",
+       "25%                                                   NaN  1.732085e+09   \n",
+       "50%                                                   NaN  1.732402e+09   \n",
+       "75%                                                   NaN  1.732632e+09   \n",
+       "max                                                   NaN  1.732922e+09   \n",
+       "\n",
+       "            rss_id  article_id      unixtime  \n",
+       "count       130388      130388  1.303880e+05  \n",
+       "unique         688       37817           NaN  \n",
+       "top     b78e974545  05dfb6bb11           NaN  \n",
+       "freq          3918          11           NaN  \n",
+       "mean           NaN         NaN  1.732375e+09  \n",
+       "std            NaN         NaN  3.214484e+05  \n",
+       "min            NaN         NaN  1.731884e+09  \n",
+       "25%            NaN         NaN  1.732085e+09  \n",
+       "50%            NaN         NaN  1.732402e+09  \n",
+       "75%            NaN         NaN  1.732632e+09  \n",
+       "max            NaN         NaN  1.732922e+09  "
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>link</th>\n",
+       "      <th>rss</th>\n",
+       "      <th>title</th>\n",
+       "      <th>content</th>\n",
+       "      <th>unixTime</th>\n",
+       "      <th>rss_id</th>\n",
+       "      <th>article_id</th>\n",
+       "      <th>unixtime</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>https://www.npr.org/2022/11/04/1134434712/plan...</td>\n",
+       "      <td>https://www.npr.org/rss/podcast.php?id=510289</td>\n",
+       "      <td>The crisis pollsters face ahead of the midterm...</td>\n",
+       "      <td>Planet Money tries election polling\\n\\nEnlarge...</td>\n",
+       "      <td>1731884660</td>\n",
+       "      <td>8181d7a585</td>\n",
+       "      <td>05dfb6bb11</td>\n",
+       "      <td>1731884660</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>https://www.communitysignal.com/embodying-work...</td>\n",
+       "      <td>https://www.communitysignal.com/feed/</td>\n",
+       "      <td>Embodying Work-Life Balance as a Community Pro...</td>\n",
+       "      <td>Are you able to step away from your community ...</td>\n",
+       "      <td>1731883672</td>\n",
+       "      <td>b6cef58d91</td>\n",
+       "      <td>0418489c55</td>\n",
+       "      <td>1731883672</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>https://identityweek.net/tech5-awarded-a-15-ye...</td>\n",
+       "      <td>https://www.planetbiometrics.com/rss/</td>\n",
+       "      <td>TECH5 awarded a 15-year contract with Virginia...</td>\n",
+       "      <td>TECH5 has announced a major contract win with ...</td>\n",
+       "      <td>1731883632</td>\n",
+       "      <td>084b136c50</td>\n",
+       "      <td>0bb2ec0554</td>\n",
+       "      <td>1731883632</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>https://edtechmagazine.com/k12/article/2024/11...</td>\n",
+       "      <td>https://www.edtechmagazine.com/k12/rss.xml?tax...</td>\n",
+       "      <td>How Can You Prepare Your Cloud to Safely Imple...</td>\n",
+       "      <td>Many K–12 schools that operate in the cloud ha...</td>\n",
+       "      <td>1731883653</td>\n",
+       "      <td>7827152faf</td>\n",
+       "      <td>07d39400e9</td>\n",
+       "      <td>1731883653</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>https://www.ufc.com/news/updates-ufc-309-jones...</td>\n",
+       "      <td>https://www.ufc.com/rss/news</td>\n",
+       "      <td>Updates To UFC 309: Jones vs Miocic</td>\n",
+       "      <td>UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me...</td>\n",
+       "      <td>1731883662</td>\n",
+       "      <td>2c774cd014</td>\n",
+       "      <td>03f31af6e2</td>\n",
+       "      <td>1731883662</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                link  \\\n",
+       "0  https://www.npr.org/2022/11/04/1134434712/plan...   \n",
+       "1  https://www.communitysignal.com/embodying-work...   \n",
+       "2  https://identityweek.net/tech5-awarded-a-15-ye...   \n",
+       "3  https://edtechmagazine.com/k12/article/2024/11...   \n",
+       "4  https://www.ufc.com/news/updates-ufc-309-jones...   \n",
+       "\n",
+       "                                                 rss  \\\n",
+       "0      https://www.npr.org/rss/podcast.php?id=510289   \n",
+       "1              https://www.communitysignal.com/feed/   \n",
+       "2              https://www.planetbiometrics.com/rss/   \n",
+       "3  https://www.edtechmagazine.com/k12/rss.xml?tax...   \n",
+       "4                       https://www.ufc.com/rss/news   \n",
+       "\n",
+       "                                               title  \\\n",
+       "0  The crisis pollsters face ahead of the midterm...   \n",
+       "1  Embodying Work-Life Balance as a Community Pro...   \n",
+       "2  TECH5 awarded a 15-year contract with Virginia...   \n",
+       "3  How Can You Prepare Your Cloud to Safely Imple...   \n",
+       "4                Updates To UFC 309: Jones vs Miocic   \n",
+       "\n",
+       "                                             content    unixTime      rss_id  \\\n",
+       "0  Planet Money tries election polling\\n\\nEnlarge...  1731884660  8181d7a585   \n",
+       "1  Are you able to step away from your community ...  1731883672  b6cef58d91   \n",
+       "2  TECH5 has announced a major contract win with ...  1731883632  084b136c50   \n",
+       "3  Many K–12 schools that operate in the cloud ha...  1731883653  7827152faf   \n",
+       "4  UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me...  1731883662  2c774cd014   \n",
+       "\n",
+       "   article_id    unixtime  \n",
+       "0  05dfb6bb11  1731884660  \n",
+       "1  0418489c55  1731883672  \n",
+       "2  0bb2ec0554  1731883632  \n",
+       "3  07d39400e9  1731883653  \n",
+       "4  03f31af6e2  1731883662  "
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/vector_db.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/vector_db.py
@@ -1,9 +1,13 @@
 import os

 from pinecone import Pinecone
-
 from openai import OpenAI

+from utils import setup_logging
+
+logger = setup_logging()
+
+
 # Set up Pinecone client
 api_key = os.getenv("PINCEONE_API_KEY")
 shards = os.getenv("PINECONE_SHARDS")
@@ -32,10 +36,11 @@ def vectorize(article:str) -> list[float]:


 def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str): # [ ] Check if the data is being upserted. 
-    index.upsert(
+    response = index.upsert(
        vectors=vectors,
        namespace=namespace
    )
+    logger.info(f"Upserted vectors Response : {response}")

 def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None): # [ ]: Make sure this is working. 
    
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
@@ -38,11 +38,13 @@ def pinecone_save_article(article:dict):
    logging.info(f"Article content into Pinecone")
    data["values"] = vectorize(article=article["content"])
    
+    data = list(data)
+    
    
    namespace = os.getenv('PINECONE_NAMESPACE')
    
    logger.info("Upserting article to Pinecone")
-    upsert_vectors(index, [data], namespace) 
+    upsert_vectors(index, data, namespace) 
    logger.info(f"Successfully upserted article w/ article-id: {article["article_id"]} to Pinecone with namespace {namespace}")

 def dynamodb_save_article(article:dict):
--- a/todo.md
+++ b/todo.md
@@ -3,22 +3,38 @@

 * Vector Database Initialization at earlier phase. [ Done ]
 * Test out Vector Databases at Small Scale. 
-    * [ ] Testing 
+    * [x] Testing 
        * [x] Fix OpenAI Error.
        * [x] Fix Pinecone Error
-        * [ ] Fix input error.
+        * [x] Fix input error.
    * [ ] Let it run for a day
-* Test out Vector Databases at Scale. 
+        * [x] Check Open AI Bill
+        * [x] Check Vector Database Bill
+        * [ ] Figure out Vector Database Bug. 
+    * [ ] Figure out the best way to store articles: as metadata or in S3. 
+* [ ] Decrease the cost of CloudWatch Logs
+* [ ] Test out Vector Databases at Scale. 
+* [ ] Add in text cleaning after ingesting the article but before storage.
+* [ ] Automate the monthly data ingestion job
+* [ ] Lambda Optimization


 * Monthly ingestion job
 * Protocol for annotating data. 
-    * Promotional Material 
-    * Stuff that is not news
-    * Stuff that is news but like a list of news topics. 
-    * Annotating Keywwords or topics of interest, or categorization.
-    * Stuff that people wouldn't find genuinely interesting ( no interesting factor or novelty or news worthiness ) 
-    * Annotating Entity Type Data. 
-    * Annotating Podcast Scripts or Video Scripts
+    * [ ] DSPy Integration
+    * [ ] LLMRouter integration
+    * [ ] Main topic/Category ( list )
+    * [ ] Writing Style ( e.g. Informal, professional, etc...)
+    * [ ] Promotional Material ( 0=Not Promotional, 1=Promotional)
+    * [ ] Stuff that is news ( 0= Not News, 1=News)
+    * [ ] Stuff that is news but like a list of news topics. ( 0=Opposite,  1=News Topic Lists)
+    * [ ] Annotating Entities ( List of Key entities with entity specific sentiment )
+    * [ ] List of Major Events ( e.g. Ukraine War, Israel Palestine, etc... )
+    * [ ] List of Minor Events ( e.g. Specific Battle, Court Case step, etc..)
+    * [ ] Novelty Factor ( Scale from 0(Not Interesting) -> 100(Interesting))
+    * [ ] Annotating Podcast Scripts or Video Scripts ( 0=is not a script, 1=Is a script)
+    * [ ] Political Quadrant ( Or that eight dimensional thing that guy had. )
+Also we need to specify a consistent output format
+
 * Estimation Algorithm for annotation cost. 
 * Open Source Protocol for running this.