Added logging for pinecone.

This commit is contained in:
Charles-Gormley
2024-12-15 11:47:26 -05:00
parent b3c9832fdb
commit 0e8c701ae5
6 changed files with 505 additions and 50 deletions

3
.gitignore vendored
View File

@@ -4,4 +4,5 @@ repo_structure.txt
*__pycache__*
*feeds.json
todo.md
*.csv
*.csv
delete*

View File

@@ -1,33 +0,0 @@
import requests
import openai
def mock_bank_api_call(data):
    """Return a canned bank-API payload.

    ``data`` is accepted so the call shape matches a real request, but it is
    not read; every call builds a new dict with the same three fixed fields.
    """
    # Keyword order matches the original literal so the dict's insertion
    # order (and therefore its repr) is unchanged.
    return dict(status="success", account_balance=1000, currency="USD")
def process_data_with_openai(data):
    """Send the mock bank payload to the OpenAI completion endpoint.

    Fetches the canned response from ``mock_bank_api_call(data)``, embeds it
    in a prompt, and requests a completion of at most 50 tokens from the
    ``davinci`` engine.

    Args:
        data: Passed through unchanged to ``mock_bank_api_call``.

    Returns:
        The text of the first completion choice with surrounding whitespace
        stripped.
    """
    # Function-scope import so this block does not rely on the file header.
    import os

    # Call the mock bank API
    bank_data = mock_bank_api_call(data)

    # Prepare the prompt for OpenAI API
    prompt = f"Bank API returned the following data: {bank_data}. Process this data."

    # Take the key from the environment instead of a literal in source.
    # Assigning a placeholder string on every call overwrote any real key
    # that had already been configured on the openai module.
    env_key = os.environ.get("OPENAI_API_KEY")
    if env_key:
        openai.api_key = env_key

    # NOTE(review): openai.Completion looks like the pre-1.0 SDK interface;
    # confirm it matches the installed openai version.
    response = openai.Completion.create(
        engine="davinci",
        prompt=prompt,
        max_tokens=50,
    )
    return response.choices[0].text.strip()
# Demo invocation: executed when the module is loaded; prints the completion text.
data = dict(account_id="12345")
result = process_data_with_openai(data)
print(result)

View File

@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -22,7 +22,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -45,7 +45,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -295,6 +295,470 @@
"aggregated_df.to_csv(\"Aggregated_November.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"Aggregated_November.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Drop columns which are all null values. \n",
"df = df.dropna(axis=1, how='all')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Get null value percentage for each column\n",
"null_percent = df.isnull().mean().round(4) * 100"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Drop rows where the content of the column is null\n",
"df = df.dropna(subset=['content'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"link 0.0\n",
"rss 0.0\n",
"title 0.2\n",
"content 0.0\n",
"unixTime 0.0\n",
"rss_id 0.0\n",
"article_id 0.0\n",
"unixtime 0.0\n",
"dtype: float64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get null value percentage for each column\n",
"null_percent = df.isnull().mean().round(4) * 100"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"df = df.dropna(subset=['title'])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Get summary statistics for the dataframe\n",
"summary = df.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# save df \n",
"df.to_csv(\"Aggregated_November.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link</th>\n",
" <th>rss</th>\n",
" <th>title</th>\n",
" <th>content</th>\n",
" <th>unixTime</th>\n",
" <th>rss_id</th>\n",
" <th>article_id</th>\n",
" <th>unixtime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>130388</td>\n",
" <td>130388</td>\n",
" <td>130388</td>\n",
" <td>130388</td>\n",
" <td>1.303880e+05</td>\n",
" <td>130388</td>\n",
" <td>130388</td>\n",
" <td>1.303880e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>37817</td>\n",
" <td>688</td>\n",
" <td>37383</td>\n",
" <td>37658</td>\n",
" <td>NaN</td>\n",
" <td>688</td>\n",
" <td>37817</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>https://www.npr.org/2022/11/04/1134434712/plan...</td>\n",
" <td>https://blogs.unity3d.com/feed/</td>\n",
" <td>TechPipeline</td>\n",
" <td>\"Unity\", Unity logos, and other Unity trademar...</td>\n",
" <td>NaN</td>\n",
" <td>b78e974545</td>\n",
" <td>05dfb6bb11</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>11</td>\n",
" <td>3918</td>\n",
" <td>330</td>\n",
" <td>3912</td>\n",
" <td>NaN</td>\n",
" <td>3918</td>\n",
" <td>11</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.732375e+09</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.732375e+09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3.214484e+05</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3.214484e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.731884e+09</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.731884e+09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.732085e+09</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.732085e+09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.732402e+09</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.732402e+09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.732632e+09</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.732632e+09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.732922e+09</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.732922e+09</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link \\\n",
"count 130388 \n",
"unique 37817 \n",
"top https://www.npr.org/2022/11/04/1134434712/plan... \n",
"freq 11 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" rss title \\\n",
"count 130388 130388 \n",
"unique 688 37383 \n",
"top https://blogs.unity3d.com/feed/ TechPipeline \n",
"freq 3918 330 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" content unixTime \\\n",
"count 130388 1.303880e+05 \n",
"unique 37658 NaN \n",
"top \"Unity\", Unity logos, and other Unity trademar... NaN \n",
"freq 3912 NaN \n",
"mean NaN 1.732375e+09 \n",
"std NaN 3.214484e+05 \n",
"min NaN 1.731884e+09 \n",
"25% NaN 1.732085e+09 \n",
"50% NaN 1.732402e+09 \n",
"75% NaN 1.732632e+09 \n",
"max NaN 1.732922e+09 \n",
"\n",
" rss_id article_id unixtime \n",
"count 130388 130388 1.303880e+05 \n",
"unique 688 37817 NaN \n",
"top b78e974545 05dfb6bb11 NaN \n",
"freq 3918 11 NaN \n",
"mean NaN NaN 1.732375e+09 \n",
"std NaN NaN 3.214484e+05 \n",
"min NaN NaN 1.731884e+09 \n",
"25% NaN NaN 1.732085e+09 \n",
"50% NaN NaN 1.732402e+09 \n",
"75% NaN NaN 1.732632e+09 \n",
"max NaN NaN 1.732922e+09 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"summary"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link</th>\n",
" <th>rss</th>\n",
" <th>title</th>\n",
" <th>content</th>\n",
" <th>unixTime</th>\n",
" <th>rss_id</th>\n",
" <th>article_id</th>\n",
" <th>unixtime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>https://www.npr.org/2022/11/04/1134434712/plan...</td>\n",
" <td>https://www.npr.org/rss/podcast.php?id=510289</td>\n",
" <td>The crisis pollsters face ahead of the midterm...</td>\n",
" <td>Planet Money tries election polling\\n\\nEnlarge...</td>\n",
" <td>1731884660</td>\n",
" <td>8181d7a585</td>\n",
" <td>05dfb6bb11</td>\n",
" <td>1731884660</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>https://www.communitysignal.com/embodying-work...</td>\n",
" <td>https://www.communitysignal.com/feed/</td>\n",
" <td>Embodying Work-Life Balance as a Community Pro...</td>\n",
" <td>Are you able to step away from your community ...</td>\n",
" <td>1731883672</td>\n",
" <td>b6cef58d91</td>\n",
" <td>0418489c55</td>\n",
" <td>1731883672</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>https://identityweek.net/tech5-awarded-a-15-ye...</td>\n",
" <td>https://www.planetbiometrics.com/rss/</td>\n",
" <td>TECH5 awarded a 15-year contract with Virginia...</td>\n",
" <td>TECH5 has announced a major contract win with ...</td>\n",
" <td>1731883632</td>\n",
" <td>084b136c50</td>\n",
" <td>0bb2ec0554</td>\n",
" <td>1731883632</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>https://edtechmagazine.com/k12/article/2024/11...</td>\n",
" <td>https://www.edtechmagazine.com/k12/rss.xml?tax...</td>\n",
" <td>How Can You Prepare Your Cloud to Safely Imple...</td>\n",
" <td>Many K12 schools that operate in the cloud ha...</td>\n",
" <td>1731883653</td>\n",
" <td>7827152faf</td>\n",
" <td>07d39400e9</td>\n",
" <td>1731883653</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>https://www.ufc.com/news/updates-ufc-309-jones...</td>\n",
" <td>https://www.ufc.com/rss/news</td>\n",
" <td>Updates To UFC 309: Jones vs Miocic</td>\n",
" <td>UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me...</td>\n",
" <td>1731883662</td>\n",
" <td>2c774cd014</td>\n",
" <td>03f31af6e2</td>\n",
" <td>1731883662</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link \\\n",
"0 https://www.npr.org/2022/11/04/1134434712/plan... \n",
"1 https://www.communitysignal.com/embodying-work... \n",
"2 https://identityweek.net/tech5-awarded-a-15-ye... \n",
"3 https://edtechmagazine.com/k12/article/2024/11... \n",
"4 https://www.ufc.com/news/updates-ufc-309-jones... \n",
"\n",
" rss \\\n",
"0 https://www.npr.org/rss/podcast.php?id=510289 \n",
"1 https://www.communitysignal.com/feed/ \n",
"2 https://www.planetbiometrics.com/rss/ \n",
"3 https://www.edtechmagazine.com/k12/rss.xml?tax... \n",
"4 https://www.ufc.com/rss/news \n",
"\n",
" title \\\n",
"0 The crisis pollsters face ahead of the midterm... \n",
"1 Embodying Work-Life Balance as a Community Pro... \n",
"2 TECH5 awarded a 15-year contract with Virginia... \n",
"3 How Can You Prepare Your Cloud to Safely Imple... \n",
"4 Updates To UFC 309: Jones vs Miocic \n",
"\n",
" content unixTime rss_id \\\n",
"0 Planet Money tries election polling\\n\\nEnlarge... 1731884660 8181d7a585 \n",
"1 Are you able to step away from your community ... 1731883672 b6cef58d91 \n",
"2 TECH5 has announced a major contract win with ... 1731883632 084b136c50 \n",
"3 Many K12 schools that operate in the cloud ha... 1731883653 7827152faf \n",
"4 UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me... 1731883662 2c774cd014 \n",
"\n",
" article_id unixtime \n",
"0 05dfb6bb11 1731884660 \n",
"1 0418489c55 1731883672 \n",
"2 0bb2ec0554 1731883632 \n",
"3 07d39400e9 1731883653 \n",
"4 03f31af6e2 1731883662 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,

View File

@@ -1,9 +1,13 @@
import os
from pinecone import Pinecone
from openai import OpenAI
from utils import setup_logging
# Module-level logger, built by the setup_logging helper imported from utils.
logger = setup_logging()
# Set up Pinecone client
# The key is read from the correctly spelled PINECONE_API_KEY first. The
# historical misspelling PINCEONE_API_KEY is still honored as a fallback so
# environments that only define the old name keep working.
api_key = os.getenv("PINECONE_API_KEY") or os.getenv("PINCEONE_API_KEY")
shards = os.getenv("PINECONE_SHARDS")
@@ -32,10 +36,11 @@ def vectorize(article:str) -> list[float]:
def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str): # [ ] Check if the data is being upserted.
    """Upsert ``vectors`` into ``namespace`` of ``index`` and log the response.

    Args:
        index: Index handle; its ``upsert`` method is called exactly once.
        vectors: Records forwarded unchanged as the ``vectors`` argument.
        namespace: Forwarded unchanged as the ``namespace`` argument.
    """
    # Single upsert call; the response is captured only so it can be logged.
    response = index.upsert(
        vectors=vectors,
        namespace=namespace
    )
    logger.info(f"Upserted vectors Response : {response}")
def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None): # [ ]: Make sure this is working.

View File

@@ -38,11 +38,13 @@ def pinecone_save_article(article:dict):
logging.info(f"Article content into Pinecone")
data["values"] = vectorize(article=article["content"])
data = list(data)
namespace = os.getenv('PINECONE_NAMESPACE')
logger.info("Upserting article to Pinecone")
upsert_vectors(index, [data], namespace)
upsert_vectors(index, data, namespace)
logger.info(f"Successfully upserted article w/ article-id: {article["article_id"]} to Pinecone with namespace {namespace}")
def dynamodb_save_article(article:dict):

36
todo.md
View File

@@ -3,22 +3,38 @@
* Vector Database Initialization at earlier phase. [ Done ]
* Test out Vector Databases at Small Scale.
* [ ] Testing
* [x] Testing
* [x] Fix OpenAI Error.
* [x] Fix Pinecone Error
* [ ] Fix input error.
* [x] Fix input error.
* [ ] Let it run for a day
* Test out Vector Databases at Scale.
* [x] Check Open AI Bill
* [x] Check Vector Database Bill
* [ ] Figure out Vector Database Bug.
* [ ] Figure out the best way to store articles: as metadata or in S3.
* [ ] Decrease the cost of CloudWatch Logs
* [ ] Test out Vector Databases at Scale.
* [ ] Add in text cleaning after ingesting an article but before storage.
* [ ] Automate the monthly data ingestion job
* [ ] Lambda Optimization
* Monthly ingestion job
* Protocol for annotating data.
* Promotional Material
* Stuff that is not news
* Stuff that is news but like a list of news topics.
* Annotating Keywords or topics of interest, or categorization.
* Stuff that people wouldn't find genuinely interesting ( no interesting factor or novelty or news worthiness )
* Annotating Entity Type Data.
* Annotating Podcast Scripts or Video Scripts
* [ ] DSPy Integration
* [ ] LLMRouter integration
* [ ] Main topic/Category ( list )
* [ ] Writing Style ( e.g. Informal, professional, etc...)
* [ ] Promotional Material ( 0=Not Promotional, 1=Promotional)
* [ ] Stuff that is news ( 0= Not News, 1=News)
* [ ] Stuff that is news but like a list of news topics. ( 0=Opposite, 1=News Topic Lists)
* [ ] Annotating Entities ( List of Key entities with entity specific sentiment )
* [ ] List of Major Events ( e.g. Ukraine War, Israel Palestine, etc... )
* [ ] List of Minor Events ( e.g. Specific Battle, Court Case step, etc..)
* [ ] Novelty Factor ( Scale from 0(Not Interesting) -> 100(Interesting))
* [ ] Annotating Podcast Scripts or Video Scripts ( 0=is not a script, 1=Is a script)
* [ ] Political Quadrant ( Or that eight dimensional thing that guy had. )
Also we need to specify a consistent output format
* Estimation Algorithm for annotation cost.
* Open Source Protocol for running this.