diff --git a/.gitignore b/.gitignore
index deb8ad3..ca1059d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,5 @@ repo_structure.txt
*__pycache__*
*feeds.json
todo.md
-*.csv
\ No newline at end of file
+*.csv
+delete*
\ No newline at end of file
diff --git a/delete.py b/delete.py
deleted file mode 100644
index 2e699a3..0000000
--- a/delete.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import requests
-import openai
-
-def mock_bank_api_call(data):
- # Mock response from the bank API
- bank_response = {
- "status": "success",
- "account_balance": 1000,
- "currency": "USD"
- }
- return bank_response
-
-def process_data_with_openai(data):
- # Call the mock bank API
- bank_data = mock_bank_api_call(data)
-
- # Prepare the prompt for OpenAI API
- prompt = f"Bank API returned the following data: {bank_data}. Process this data."
-
- # Call the OpenAI API
- openai.api_key = 'your-openai-api-key'
- response = openai.Completion.create(
- engine="davinci",
- prompt=prompt,
- max_tokens=50
- )
-
- return response.choices[0].text.strip()
-
-# Example usage
-data = {"account_id": "12345"}
-result = process_data_with_openai(data)
-print(result)
\ No newline at end of file
diff --git a/monthly_ingestion.ipynb b/monthly_ingestion.ipynb
index 2a3d187..2cdd10b 100644
--- a/monthly_ingestion.ipynb
+++ b/monthly_ingestion.ipynb
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -22,7 +22,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -45,7 +45,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -295,6 +295,470 @@
"aggregated_df.to_csv(\"Aggregated_November.csv\", index=False)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv(\"Aggregated_November.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop columns which are all null values. \n",
+ "df = df.dropna(axis=1, how='all')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get null value percentage for each column\n",
+ "null_percent = df.isnull().mean().round(4) * 100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop rows where the content of the column is null\n",
+ "df = df.dropna(subset=['content'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "link 0.0\n",
+ "rss 0.0\n",
+ "title 0.2\n",
+ "content 0.0\n",
+ "unixTime 0.0\n",
+ "rss_id 0.0\n",
+ "article_id 0.0\n",
+ "unixtime 0.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Get null value percentage for each column\n",
+ "null_percent = df.isnull().mean().round(4) * 100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df.dropna(subset=['title'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get summary statistics for the dataframe\n",
+ "summary = df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save df \n",
+ "df.to_csv(\"Aggregated_November.csv\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link | \n",
+ " rss | \n",
+ " title | \n",
+ " content | \n",
+ " unixTime | \n",
+ " rss_id | \n",
+ " article_id | \n",
+ " unixtime | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 130388 | \n",
+ " 130388 | \n",
+ " 130388 | \n",
+ " 130388 | \n",
+ " 1.303880e+05 | \n",
+ " 130388 | \n",
+ " 130388 | \n",
+ " 1.303880e+05 | \n",
+ "
\n",
+ " \n",
+ " | unique | \n",
+ " 37817 | \n",
+ " 688 | \n",
+ " 37383 | \n",
+ " 37658 | \n",
+ " NaN | \n",
+ " 688 | \n",
+ " 37817 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | top | \n",
+ " https://www.npr.org/2022/11/04/1134434712/plan... | \n",
+ " https://blogs.unity3d.com/feed/ | \n",
+ " TechPipeline | \n",
+ " \"Unity\", Unity logos, and other Unity trademar... | \n",
+ " NaN | \n",
+ " b78e974545 | \n",
+ " 05dfb6bb11 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | freq | \n",
+ " 11 | \n",
+ " 3918 | \n",
+ " 330 | \n",
+ " 3912 | \n",
+ " NaN | \n",
+ " 3918 | \n",
+ " 11 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.732375e+09 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.732375e+09 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 3.214484e+05 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 3.214484e+05 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.731884e+09 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.731884e+09 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.732085e+09 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.732085e+09 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.732402e+09 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.732402e+09 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.732632e+09 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.732632e+09 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.732922e+09 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.732922e+09 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link \\\n",
+ "count 130388 \n",
+ "unique 37817 \n",
+ "top https://www.npr.org/2022/11/04/1134434712/plan... \n",
+ "freq 11 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " rss title \\\n",
+ "count 130388 130388 \n",
+ "unique 688 37383 \n",
+ "top https://blogs.unity3d.com/feed/ TechPipeline \n",
+ "freq 3918 330 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " content unixTime \\\n",
+ "count 130388 1.303880e+05 \n",
+ "unique 37658 NaN \n",
+ "top \"Unity\", Unity logos, and other Unity trademar... NaN \n",
+ "freq 3912 NaN \n",
+ "mean NaN 1.732375e+09 \n",
+ "std NaN 3.214484e+05 \n",
+ "min NaN 1.731884e+09 \n",
+ "25% NaN 1.732085e+09 \n",
+ "50% NaN 1.732402e+09 \n",
+ "75% NaN 1.732632e+09 \n",
+ "max NaN 1.732922e+09 \n",
+ "\n",
+ " rss_id article_id unixtime \n",
+ "count 130388 130388 1.303880e+05 \n",
+ "unique 688 37817 NaN \n",
+ "top b78e974545 05dfb6bb11 NaN \n",
+ "freq 3918 11 NaN \n",
+ "mean NaN NaN 1.732375e+09 \n",
+ "std NaN NaN 3.214484e+05 \n",
+ "min NaN NaN 1.731884e+09 \n",
+ "25% NaN NaN 1.732085e+09 \n",
+ "50% NaN NaN 1.732402e+09 \n",
+ "75% NaN NaN 1.732632e+09 \n",
+ "max NaN NaN 1.732922e+09 "
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "summary"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link | \n",
+ " rss | \n",
+ " title | \n",
+ " content | \n",
+ " unixTime | \n",
+ " rss_id | \n",
+ " article_id | \n",
+ " unixtime | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " https://www.npr.org/2022/11/04/1134434712/plan... | \n",
+ " https://www.npr.org/rss/podcast.php?id=510289 | \n",
+ " The crisis pollsters face ahead of the midterm... | \n",
+ " Planet Money tries election polling\\n\\nEnlarge... | \n",
+ " 1731884660 | \n",
+ " 8181d7a585 | \n",
+ " 05dfb6bb11 | \n",
+ " 1731884660 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " https://www.communitysignal.com/embodying-work... | \n",
+ " https://www.communitysignal.com/feed/ | \n",
+ " Embodying Work-Life Balance as a Community Pro... | \n",
+ " Are you able to step away from your community ... | \n",
+ " 1731883672 | \n",
+ " b6cef58d91 | \n",
+ " 0418489c55 | \n",
+ " 1731883672 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " https://identityweek.net/tech5-awarded-a-15-ye... | \n",
+ " https://www.planetbiometrics.com/rss/ | \n",
+ " TECH5 awarded a 15-year contract with Virginia... | \n",
+ " TECH5 has announced a major contract win with ... | \n",
+ " 1731883632 | \n",
+ " 084b136c50 | \n",
+ " 0bb2ec0554 | \n",
+ " 1731883632 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " https://edtechmagazine.com/k12/article/2024/11... | \n",
+ " https://www.edtechmagazine.com/k12/rss.xml?tax... | \n",
+ " How Can You Prepare Your Cloud to Safely Imple... | \n",
+ " Many K–12 schools that operate in the cloud ha... | \n",
+ " 1731883653 | \n",
+ " 7827152faf | \n",
+ " 07d39400e9 | \n",
+ " 1731883653 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " https://www.ufc.com/news/updates-ufc-309-jones... | \n",
+ " https://www.ufc.com/rss/news | \n",
+ " Updates To UFC 309: Jones vs Miocic | \n",
+ " UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me... | \n",
+ " 1731883662 | \n",
+ " 2c774cd014 | \n",
+ " 03f31af6e2 | \n",
+ " 1731883662 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link \\\n",
+ "0 https://www.npr.org/2022/11/04/1134434712/plan... \n",
+ "1 https://www.communitysignal.com/embodying-work... \n",
+ "2 https://identityweek.net/tech5-awarded-a-15-ye... \n",
+ "3 https://edtechmagazine.com/k12/article/2024/11... \n",
+ "4 https://www.ufc.com/news/updates-ufc-309-jones... \n",
+ "\n",
+ " rss \\\n",
+ "0 https://www.npr.org/rss/podcast.php?id=510289 \n",
+ "1 https://www.communitysignal.com/feed/ \n",
+ "2 https://www.planetbiometrics.com/rss/ \n",
+ "3 https://www.edtechmagazine.com/k12/rss.xml?tax... \n",
+ "4 https://www.ufc.com/rss/news \n",
+ "\n",
+ " title \\\n",
+ "0 The crisis pollsters face ahead of the midterm... \n",
+ "1 Embodying Work-Life Balance as a Community Pro... \n",
+ "2 TECH5 awarded a 15-year contract with Virginia... \n",
+ "3 How Can You Prepare Your Cloud to Safely Imple... \n",
+ "4 Updates To UFC 309: Jones vs Miocic \n",
+ "\n",
+ " content unixTime rss_id \\\n",
+ "0 Planet Money tries election polling\\n\\nEnlarge... 1731884660 8181d7a585 \n",
+ "1 Are you able to step away from your community ... 1731883672 b6cef58d91 \n",
+ "2 TECH5 has announced a major contract win with ... 1731883632 084b136c50 \n",
+ "3 Many K–12 schools that operate in the cloud ha... 1731883653 7827152faf \n",
+ "4 UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me... 1731883662 2c774cd014 \n",
+ "\n",
+ " article_id unixtime \n",
+ "0 05dfb6bb11 1731884660 \n",
+ "1 0418489c55 1731883672 \n",
+ "2 0bb2ec0554 1731883632 \n",
+ "3 07d39400e9 1731883653 \n",
+ "4 03f31af6e2 1731883662 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/vector_db.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/vector_db.py
index 0281b44..679bc11 100644
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/vector_db.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/vector_db.py
@@ -1,9 +1,13 @@
import os
from pinecone import Pinecone
-
from openai import OpenAI
+from utils import setup_logging
+
+logger = setup_logging()
+
+
# Set up Pinecone client
api_key = os.getenv("PINCEONE_API_KEY")
shards = os.getenv("PINECONE_SHARDS")
@@ -32,10 +36,11 @@ def vectorize(article:str) -> list[float]:
def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str): # [ ] Check if the data is being upserted.
- index.upsert(
+ response = index.upsert(
vectors=vectors,
namespace=namespace
)
+ logger.info(f"Upserted vectors Response : {response}")
def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None): # [ ]: Make sure this is working.
diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
index 18850fc..e741d11 100644
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
@@ -38,11 +38,13 @@ def pinecone_save_article(article:dict):
logging.info(f"Article content into Pinecone")
data["values"] = vectorize(article=article["content"])
+ data = list(data)
+
namespace = os.getenv('PINECONE_NAMESPACE')
logger.info("Upserting article to Pinecone")
- upsert_vectors(index, [data], namespace)
+ upsert_vectors(index, data, namespace)
logger.info(f"Successfully upserted article w/ article-id: {article["article_id"]} to Pinecone with namespace {namespace}")
def dynamodb_save_article(article:dict):
diff --git a/todo.md b/todo.md
index 4d1cd47..94f2d3f 100644
--- a/todo.md
+++ b/todo.md
@@ -3,22 +3,38 @@
* Vector Database Initialization at earlier phase. [ Done ]
* Test out Vector Databases at Small Scale.
- * [ ] Testing
+ * [x] Testing
* [x] Fix OpenAI Error.
* [x] Fix Pinecone Error
- * [ ] Fix input error.
+ * [x] Fix input error.
* [ ] Let it run for a day
-* Test out Vector Databases at Scale.
+ * [x] Check Open AI Bill
+ * [x] Check Vector Database Bill
+ * [ ] Figure out Vector Database Bug.
+    * [ ] Figure out best way to store articles: as metadata or in S3.
+* [ ] Decrease the cost of CloudWatch Logs
+* [ ] Test out Vector Databases at Scale.
+* [ ] Add in text cleaning after ingesting an article but before storage.
+* [ ] Automate the monthly data ingestion job
+* [ ] Lambda Optimization
* Monthly ingestion job
* Protocol for annotating data.
- * Promotional Material
- * Stuff that is not news
- * Stuff that is news but like a list of news topics.
- * Annotating Keywwords or topics of interest, or categorization.
- * Stuff that people wouldn't find genuinely interesting ( no interesting factor or novelty or news worthiness )
- * Annotating Entity Type Data.
- * Annotating Podcast Scripts or Video Scripts
+ * [ ] DSPy Integration
+ * [ ] LLMRouter integration
+ * [ ] Main topic/Category ( list )
+    * [ ] Writing Style ( e.g. Informal, professional, etc...)
+ * [ ] Promotional Material ( 0=Not Promotional, 1=Promotional)
+ * [ ] Stuff that is news ( 0= Not News, 1=News)
+ * [ ] Stuff that is news but like a list of news topics. ( 0=Opposite, 1=News Topic Lists)
+ * [ ] Annotating Entities ( List of Key entities with entity specific sentiment )
+ * [ ] List of Major Events ( e.g. Ukraine War, Israel Palestine, etc... )
+ * [ ] List of Minor Event ( e.g. Specific Battle, Court Case step, etc..)
+ * [ ] Novelty Factor ( Scale from 0(Not Interesting) -> 100(Interesting))
+ * [ ] Annotating Podcast Scripts or Video Scripts ( 0=is not a script, 1=Is a script)
+ * [ ] Political Quadrant ( Or that eight dimensional thing that guy had. )
+Also, we need to specify a consistent output format.
+
* Estimation Algorithm for annotation cost.
* Open Source Protocol for running this.