diff --git a/.gitignore b/.gitignore index deb8ad3..ca1059d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ repo_structure.txt *__pycache__* *feeds.json todo.md -*.csv \ No newline at end of file +*.csv +delete* \ No newline at end of file diff --git a/delete.py b/delete.py deleted file mode 100644 index 2e699a3..0000000 --- a/delete.py +++ /dev/null @@ -1,33 +0,0 @@ -import requests -import openai - -def mock_bank_api_call(data): - # Mock response from the bank API - bank_response = { - "status": "success", - "account_balance": 1000, - "currency": "USD" - } - return bank_response - -def process_data_with_openai(data): - # Call the mock bank API - bank_data = mock_bank_api_call(data) - - # Prepare the prompt for OpenAI API - prompt = f"Bank API returned the following data: {bank_data}. Process this data." - - # Call the OpenAI API - openai.api_key = 'your-openai-api-key' - response = openai.Completion.create( - engine="davinci", - prompt=prompt, - max_tokens=50 - ) - - return response.choices[0].text.strip() - -# Example usage -data = {"account_id": "12345"} -result = process_data_with_openai(data) -print(result) \ No newline at end of file diff --git a/monthly_ingestion.ipynb b/monthly_ingestion.ipynb index 2a3d187..2cdd10b 100644 --- a/monthly_ingestion.ipynb +++ b/monthly_ingestion.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -295,6 +295,470 @@ "aggregated_df.to_csv(\"Aggregated_November.csv\", index=False)" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"Aggregated_November.csv\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop columns which are all null values. \n", + "df = df.dropna(axis=1, how='all')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Get null value percentage for each column\n", + "null_percent = df.isnull().mean().round(4) * 100" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop rows where the content of the column is null\n", + "df = df.dropna(subset=['content'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "link 0.0\n", + "rss 0.0\n", + "title 0.2\n", + "content 0.0\n", + "unixTime 0.0\n", + "rss_id 0.0\n", + "article_id 0.0\n", + "unixtime 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get null value percentage for each column\n", + "null_percent = df.isnull().mean().round(4) * 100" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.dropna(subset=['title'])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Get summary statistics for the dataframe\n", + "summary = df.describe(include='all')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# save df \n", + "df.to_csv(\"Aggregated_November.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
linkrsstitlecontentunixTimerss_idarticle_idunixtime
count1303881303881303881303881.303880e+051303881303881.303880e+05
unique378176883738337658NaN68837817NaN
tophttps://www.npr.org/2022/11/04/1134434712/plan...https://blogs.unity3d.com/feed/TechPipeline\"Unity\", Unity logos, and other Unity trademar...NaNb78e97454505dfb6bb11NaN
freq1139183303912NaN391811NaN
meanNaNNaNNaNNaN1.732375e+09NaNNaN1.732375e+09
stdNaNNaNNaNNaN3.214484e+05NaNNaN3.214484e+05
minNaNNaNNaNNaN1.731884e+09NaNNaN1.731884e+09
25%NaNNaNNaNNaN1.732085e+09NaNNaN1.732085e+09
50%NaNNaNNaNNaN1.732402e+09NaNNaN1.732402e+09
75%NaNNaNNaNNaN1.732632e+09NaNNaN1.732632e+09
maxNaNNaNNaNNaN1.732922e+09NaNNaN1.732922e+09
\n", + "
" + ], + "text/plain": [ + " link \\\n", + "count 130388 \n", + "unique 37817 \n", + "top https://www.npr.org/2022/11/04/1134434712/plan... \n", + "freq 11 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " rss title \\\n", + "count 130388 130388 \n", + "unique 688 37383 \n", + "top https://blogs.unity3d.com/feed/ TechPipeline \n", + "freq 3918 330 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " content unixTime \\\n", + "count 130388 1.303880e+05 \n", + "unique 37658 NaN \n", + "top \"Unity\", Unity logos, and other Unity trademar... NaN \n", + "freq 3912 NaN \n", + "mean NaN 1.732375e+09 \n", + "std NaN 3.214484e+05 \n", + "min NaN 1.731884e+09 \n", + "25% NaN 1.732085e+09 \n", + "50% NaN 1.732402e+09 \n", + "75% NaN 1.732632e+09 \n", + "max NaN 1.732922e+09 \n", + "\n", + " rss_id article_id unixtime \n", + "count 130388 130388 1.303880e+05 \n", + "unique 688 37817 NaN \n", + "top b78e974545 05dfb6bb11 NaN \n", + "freq 3918 11 NaN \n", + "mean NaN NaN 1.732375e+09 \n", + "std NaN NaN 3.214484e+05 \n", + "min NaN NaN 1.731884e+09 \n", + "25% NaN NaN 1.732085e+09 \n", + "50% NaN NaN 1.732402e+09 \n", + "75% NaN NaN 1.732632e+09 \n", + "max NaN NaN 1.732922e+09 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summary" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
linkrsstitlecontentunixTimerss_idarticle_idunixtime
0https://www.npr.org/2022/11/04/1134434712/plan...https://www.npr.org/rss/podcast.php?id=510289The crisis pollsters face ahead of the midterm...Planet Money tries election polling\\n\\nEnlarge...17318846608181d7a58505dfb6bb111731884660
1https://www.communitysignal.com/embodying-work...https://www.communitysignal.com/feed/Embodying Work-Life Balance as a Community Pro...Are you able to step away from your community ...1731883672b6cef58d910418489c551731883672
2https://identityweek.net/tech5-awarded-a-15-ye...https://www.planetbiometrics.com/rss/TECH5 awarded a 15-year contract with Virginia...TECH5 has announced a major contract win with ...1731883632084b136c500bb2ec05541731883632
3https://edtechmagazine.com/k12/article/2024/11...https://www.edtechmagazine.com/k12/rss.xml?tax...How Can You Prepare Your Cloud to Safely Imple...Many K–12 schools that operate in the cloud ha...17318836537827152faf07d39400e91731883653
4https://www.ufc.com/news/updates-ufc-309-jones...https://www.ufc.com/rss/newsUpdates To UFC 309: Jones vs MiocicUFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me...17318836622c774cd01403f31af6e21731883662
\n", + "
" + ], + "text/plain": [ + " link \\\n", + "0 https://www.npr.org/2022/11/04/1134434712/plan... \n", + "1 https://www.communitysignal.com/embodying-work... \n", + "2 https://identityweek.net/tech5-awarded-a-15-ye... \n", + "3 https://edtechmagazine.com/k12/article/2024/11... \n", + "4 https://www.ufc.com/news/updates-ufc-309-jones... \n", + "\n", + " rss \\\n", + "0 https://www.npr.org/rss/podcast.php?id=510289 \n", + "1 https://www.communitysignal.com/feed/ \n", + "2 https://www.planetbiometrics.com/rss/ \n", + "3 https://www.edtechmagazine.com/k12/rss.xml?tax... \n", + "4 https://www.ufc.com/rss/news \n", + "\n", + " title \\\n", + "0 The crisis pollsters face ahead of the midterm... \n", + "1 Embodying Work-Life Balance as a Community Pro... \n", + "2 TECH5 awarded a 15-year contract with Virginia... \n", + "3 How Can You Prepare Your Cloud to Safely Imple... \n", + "4 Updates To UFC 309: Jones vs Miocic \n", + "\n", + " content unixTime rss_id \\\n", + "0 Planet Money tries election polling\\n\\nEnlarge... 1731884660 8181d7a585 \n", + "1 Are you able to step away from your community ... 1731883672 b6cef58d91 \n", + "2 TECH5 has announced a major contract win with ... 1731883632 084b136c50 \n", + "3 Many K–12 schools that operate in the cloud ha... 1731883653 7827152faf \n", + "4 UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me... 
1731883662 2c774cd014 \n", + "\n", + " article_id unixtime \n", + "0 05dfb6bb11 1731884660 \n", + "1 0418489c55 1731883672 \n", + "2 0bb2ec0554 1731883632 \n", + "3 07d39400e9 1731883653 \n", + "4 03f31af6e2 1731883662 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/vector_db.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/vector_db.py index 0281b44..679bc11 100644 --- a/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/vector_db.py +++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/analytics/embeddings/vector_db.py @@ -1,9 +1,13 @@ import os from pinecone import Pinecone - from openai import OpenAI +from utils import setup_logging + +logger = setup_logging() + + # Set up Pinecone client api_key = os.getenv("PINCEONE_API_KEY") shards = os.getenv("PINECONE_SHARDS") @@ -32,10 +36,11 @@ def vectorize(article:str) -> list[float]: def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str): # [ ] Check if the data is being upserted. - index.upsert( + response = index.upsert( vectors=vectors, namespace=namespace ) + logger.info(f"Upserted vectors Response : {response}") def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None): # [ ]: Make sure this is working. 
diff --git a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py index 18850fc..e741d11 100644 --- a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py +++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py @@ -38,11 +38,13 @@ def pinecone_save_article(article:dict): logging.info(f"Article content into Pinecone") data["values"] = vectorize(article=article["content"]) + data = list(data) + namespace = os.getenv('PINECONE_NAMESPACE') logger.info("Upserting article to Pinecone") - upsert_vectors(index, [data], namespace) + upsert_vectors(index, data, namespace) logger.info(f"Successfully upserted article w/ article-id: {article["article_id"]} to Pinecone with namespace {namespace}") def dynamodb_save_article(article:dict): diff --git a/todo.md b/todo.md index 4d1cd47..94f2d3f 100644 --- a/todo.md +++ b/todo.md @@ -3,22 +3,38 @@ * Vector Database Initialization at earlier phase. [ Done ] * Test out Vector Databases at Small Scale. - * [ ] Testing + * [x] Testing * [x] Fix OpenAI Error. * [x] Fix Pinecone Error - * [ ] Fix input error. + * [x] Fix input error. * [ ] Let it run for a day -* Test out Vector Databases at Scale. + * [x] Check Open AI Bill + * [x] Check Vector Database Bill + * [ ] Figure out Vector Database Bug. + * [ ] Figure out best way to store articles as metadata or in S3. +* [ ] Decrease the cost of CloudWatch Logs +* [ ] Test out Vector Databases at Scale. +* [ ] Add in text cleaning after ingesting article but before storage. +* [ ] Automate the monthly data ingestion job +* [ ] Lambda Optimization * Monthly ingestion job * Protocol for annotating data. - * Promotional Material - * Stuff that is not news - * Stuff that is news but like a list of news topics. - * Annotating Keywwords or topics of interest, or categorization. 
- * Stuff that people wouldn't find genuinely interesting ( no interesting factor or novelty or news worthiness ) - * Annotating Entity Type Data. - * Annotating Podcast Scripts or Video Scripts + * [ ] DSPy Integration + * [ ] LLMRouter integration + * [ ] Main topic/Category ( list ) + * [ ] Writing Style ( e.g. Informal, professional, etc...) + * [ ] Promotional Material ( 0=Not Promotional, 1=Promotional) + * [ ] Stuff that is news ( 0= Not News, 1=News) + * [ ] Stuff that is news but like a list of news topics. ( 0=Opposite, 1=News Topic Lists) + * [ ] Annotating Entities ( List of Key entities with entity specific sentiment ) + * [ ] List of Major Events ( e.g. Ukraine War, Israel Palestine, etc... ) + * [ ] List of Minor Events ( e.g. Specific Battle, Court Case step, etc..) + * [ ] Novelty Factor ( Scale from 0(Not Interesting) -> 100(Interesting)) + * [ ] Annotating Podcast Scripts or Video Scripts ( 0=is not a script, 1=Is a script) + * [ ] Political Quadrant ( Or that eight dimensional thing that guy had. ) +Also we need to specify a consistent output format + * Estimation Algorithm for annotation cost. * Open Source Protocol for running this.