mirror of
https://github.com/aljazceru/IngestRSS.git
synced 2026-02-09 00:04:27 +01:00
Added logging for pinecone.
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -4,4 +4,5 @@ repo_structure.txt
|
||||
*__pycache__*
|
||||
*feeds.json
|
||||
todo.md
|
||||
*.csv
|
||||
*.csv
|
||||
delete*
|
||||
33
delete.py
33
delete.py
@@ -1,33 +0,0 @@
|
||||
import requests
|
||||
import openai
|
||||
|
||||
def mock_bank_api_call(data):
    """Return a canned bank API response.

    Args:
        data: Request payload. Accepted so the signature looks like a real
            API call, but it is not read by this mock.

    Returns:
        dict: A freshly built response with the keys ``status``,
        ``account_balance`` and ``currency``.
    """
    # Mock response from the bank API
    bank_response = {
        "status": "success",
        "account_balance": 1000,
        "currency": "USD",
    }
    return bank_response
|
||||
|
||||
def process_data_with_openai(data):
    """Send the mock bank response to the OpenAI completion endpoint.

    Args:
        data: Payload forwarded unchanged to ``mock_bank_api_call``.

    Returns:
        str: The text of the first completion choice, with surrounding
        whitespace stripped.
    """
    # Call the mock bank API
    bank_data = mock_bank_api_call(data)

    # Prepare the prompt for OpenAI API
    prompt = f"Bank API returned the following data: {bank_data}. Process this data."

    # Call the OpenAI API
    # NOTE(review): the key below is a literal placeholder and is assigned on
    # every call; a real key must be supplied before this can succeed.
    openai.api_key = 'your-openai-api-key'
    response = openai.Completion.create(
        engine="davinci",
        prompt=prompt,
        max_tokens=50
    )

    return response.choices[0].text.strip()
|
||||
|
||||
# Example usage: runs at import time and prints the completion text.
data = {"account_id": "12345"}
result = process_data_with_openai(data)
print(result)
|
||||
@@ -11,7 +11,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -22,7 +22,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -45,7 +45,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -295,6 +295,470 @@
|
||||
"aggregated_df.to_csv(\"Aggregated_November.csv\", index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = pd.read_csv(\"Aggregated_November.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Drop columns which are all null values. \n",
|
||||
"df = df.dropna(axis=1, how='all')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get null value percentage for each column\n",
|
||||
"null_percent = df.isnull().mean().round(4) * 100"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Drop rows where the content of the column is null\n",
|
||||
"df = df.dropna(subset=['content'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"link 0.0\n",
|
||||
"rss 0.0\n",
|
||||
"title 0.2\n",
|
||||
"content 0.0\n",
|
||||
"unixTime 0.0\n",
|
||||
"rss_id 0.0\n",
|
||||
"article_id 0.0\n",
|
||||
"unixtime 0.0\n",
|
||||
"dtype: float64"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Get null value percentage for each column\n",
|
||||
"null_percent = df.isnull().mean().round(4) * 100"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = df.dropna(subset=['title'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get summary statistics for the dataframe\n",
|
||||
"summary = df.describe(include='all')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# save df \n",
|
||||
"df.to_csv(\"Aggregated_November.csv\", index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>link</th>\n",
|
||||
" <th>rss</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>content</th>\n",
|
||||
" <th>unixTime</th>\n",
|
||||
" <th>rss_id</th>\n",
|
||||
" <th>article_id</th>\n",
|
||||
" <th>unixtime</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>count</th>\n",
|
||||
" <td>130388</td>\n",
|
||||
" <td>130388</td>\n",
|
||||
" <td>130388</td>\n",
|
||||
" <td>130388</td>\n",
|
||||
" <td>1.303880e+05</td>\n",
|
||||
" <td>130388</td>\n",
|
||||
" <td>130388</td>\n",
|
||||
" <td>1.303880e+05</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unique</th>\n",
|
||||
" <td>37817</td>\n",
|
||||
" <td>688</td>\n",
|
||||
" <td>37383</td>\n",
|
||||
" <td>37658</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>688</td>\n",
|
||||
" <td>37817</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>top</th>\n",
|
||||
" <td>https://www.npr.org/2022/11/04/1134434712/plan...</td>\n",
|
||||
" <td>https://blogs.unity3d.com/feed/</td>\n",
|
||||
" <td>TechPipeline</td>\n",
|
||||
" <td>\"Unity\", Unity logos, and other Unity trademar...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>b78e974545</td>\n",
|
||||
" <td>05dfb6bb11</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>freq</th>\n",
|
||||
" <td>11</td>\n",
|
||||
" <td>3918</td>\n",
|
||||
" <td>330</td>\n",
|
||||
" <td>3912</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>3918</td>\n",
|
||||
" <td>11</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.732375e+09</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.732375e+09</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>3.214484e+05</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>3.214484e+05</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.731884e+09</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.731884e+09</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25%</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.732085e+09</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.732085e+09</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>50%</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.732402e+09</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.732402e+09</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>75%</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.732632e+09</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.732632e+09</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>max</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.732922e+09</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.732922e+09</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" link \\\n",
|
||||
"count 130388 \n",
|
||||
"unique 37817 \n",
|
||||
"top https://www.npr.org/2022/11/04/1134434712/plan... \n",
|
||||
"freq 11 \n",
|
||||
"mean NaN \n",
|
||||
"std NaN \n",
|
||||
"min NaN \n",
|
||||
"25% NaN \n",
|
||||
"50% NaN \n",
|
||||
"75% NaN \n",
|
||||
"max NaN \n",
|
||||
"\n",
|
||||
" rss title \\\n",
|
||||
"count 130388 130388 \n",
|
||||
"unique 688 37383 \n",
|
||||
"top https://blogs.unity3d.com/feed/ TechPipeline \n",
|
||||
"freq 3918 330 \n",
|
||||
"mean NaN NaN \n",
|
||||
"std NaN NaN \n",
|
||||
"min NaN NaN \n",
|
||||
"25% NaN NaN \n",
|
||||
"50% NaN NaN \n",
|
||||
"75% NaN NaN \n",
|
||||
"max NaN NaN \n",
|
||||
"\n",
|
||||
" content unixTime \\\n",
|
||||
"count 130388 1.303880e+05 \n",
|
||||
"unique 37658 NaN \n",
|
||||
"top \"Unity\", Unity logos, and other Unity trademar... NaN \n",
|
||||
"freq 3912 NaN \n",
|
||||
"mean NaN 1.732375e+09 \n",
|
||||
"std NaN 3.214484e+05 \n",
|
||||
"min NaN 1.731884e+09 \n",
|
||||
"25% NaN 1.732085e+09 \n",
|
||||
"50% NaN 1.732402e+09 \n",
|
||||
"75% NaN 1.732632e+09 \n",
|
||||
"max NaN 1.732922e+09 \n",
|
||||
"\n",
|
||||
" rss_id article_id unixtime \n",
|
||||
"count 130388 130388 1.303880e+05 \n",
|
||||
"unique 688 37817 NaN \n",
|
||||
"top b78e974545 05dfb6bb11 NaN \n",
|
||||
"freq 3918 11 NaN \n",
|
||||
"mean NaN NaN 1.732375e+09 \n",
|
||||
"std NaN NaN 3.214484e+05 \n",
|
||||
"min NaN NaN 1.731884e+09 \n",
|
||||
"25% NaN NaN 1.732085e+09 \n",
|
||||
"50% NaN NaN 1.732402e+09 \n",
|
||||
"75% NaN NaN 1.732632e+09 \n",
|
||||
"max NaN NaN 1.732922e+09 "
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>link</th>\n",
|
||||
" <th>rss</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>content</th>\n",
|
||||
" <th>unixTime</th>\n",
|
||||
" <th>rss_id</th>\n",
|
||||
" <th>article_id</th>\n",
|
||||
" <th>unixtime</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>https://www.npr.org/2022/11/04/1134434712/plan...</td>\n",
|
||||
" <td>https://www.npr.org/rss/podcast.php?id=510289</td>\n",
|
||||
" <td>The crisis pollsters face ahead of the midterm...</td>\n",
|
||||
" <td>Planet Money tries election polling\\n\\nEnlarge...</td>\n",
|
||||
" <td>1731884660</td>\n",
|
||||
" <td>8181d7a585</td>\n",
|
||||
" <td>05dfb6bb11</td>\n",
|
||||
" <td>1731884660</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>https://www.communitysignal.com/embodying-work...</td>\n",
|
||||
" <td>https://www.communitysignal.com/feed/</td>\n",
|
||||
" <td>Embodying Work-Life Balance as a Community Pro...</td>\n",
|
||||
" <td>Are you able to step away from your community ...</td>\n",
|
||||
" <td>1731883672</td>\n",
|
||||
" <td>b6cef58d91</td>\n",
|
||||
" <td>0418489c55</td>\n",
|
||||
" <td>1731883672</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>https://identityweek.net/tech5-awarded-a-15-ye...</td>\n",
|
||||
" <td>https://www.planetbiometrics.com/rss/</td>\n",
|
||||
" <td>TECH5 awarded a 15-year contract with Virginia...</td>\n",
|
||||
" <td>TECH5 has announced a major contract win with ...</td>\n",
|
||||
" <td>1731883632</td>\n",
|
||||
" <td>084b136c50</td>\n",
|
||||
" <td>0bb2ec0554</td>\n",
|
||||
" <td>1731883632</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>https://edtechmagazine.com/k12/article/2024/11...</td>\n",
|
||||
" <td>https://www.edtechmagazine.com/k12/rss.xml?tax...</td>\n",
|
||||
" <td>How Can You Prepare Your Cloud to Safely Imple...</td>\n",
|
||||
" <td>Many K–12 schools that operate in the cloud ha...</td>\n",
|
||||
" <td>1731883653</td>\n",
|
||||
" <td>7827152faf</td>\n",
|
||||
" <td>07d39400e9</td>\n",
|
||||
" <td>1731883653</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>https://www.ufc.com/news/updates-ufc-309-jones...</td>\n",
|
||||
" <td>https://www.ufc.com/rss/news</td>\n",
|
||||
" <td>Updates To UFC 309: Jones vs Miocic</td>\n",
|
||||
" <td>UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me...</td>\n",
|
||||
" <td>1731883662</td>\n",
|
||||
" <td>2c774cd014</td>\n",
|
||||
" <td>03f31af6e2</td>\n",
|
||||
" <td>1731883662</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" link \\\n",
|
||||
"0 https://www.npr.org/2022/11/04/1134434712/plan... \n",
|
||||
"1 https://www.communitysignal.com/embodying-work... \n",
|
||||
"2 https://identityweek.net/tech5-awarded-a-15-ye... \n",
|
||||
"3 https://edtechmagazine.com/k12/article/2024/11... \n",
|
||||
"4 https://www.ufc.com/news/updates-ufc-309-jones... \n",
|
||||
"\n",
|
||||
" rss \\\n",
|
||||
"0 https://www.npr.org/rss/podcast.php?id=510289 \n",
|
||||
"1 https://www.communitysignal.com/feed/ \n",
|
||||
"2 https://www.planetbiometrics.com/rss/ \n",
|
||||
"3 https://www.edtechmagazine.com/k12/rss.xml?tax... \n",
|
||||
"4 https://www.ufc.com/rss/news \n",
|
||||
"\n",
|
||||
" title \\\n",
|
||||
"0 The crisis pollsters face ahead of the midterm... \n",
|
||||
"1 Embodying Work-Life Balance as a Community Pro... \n",
|
||||
"2 TECH5 awarded a 15-year contract with Virginia... \n",
|
||||
"3 How Can You Prepare Your Cloud to Safely Imple... \n",
|
||||
"4 Updates To UFC 309: Jones vs Miocic \n",
|
||||
"\n",
|
||||
" content unixTime rss_id \\\n",
|
||||
"0 Planet Money tries election polling\\n\\nEnlarge... 1731884660 8181d7a585 \n",
|
||||
"1 Are you able to step away from your community ... 1731883672 b6cef58d91 \n",
|
||||
"2 TECH5 has announced a major contract win with ... 1731883632 084b136c50 \n",
|
||||
"3 Many K–12 schools that operate in the cloud ha... 1731883653 7827152faf \n",
|
||||
"4 UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me... 1731883662 2c774cd014 \n",
|
||||
"\n",
|
||||
" article_id unixtime \n",
|
||||
"0 05dfb6bb11 1731884660 \n",
|
||||
"1 0418489c55 1731883672 \n",
|
||||
"2 0bb2ec0554 1731883632 \n",
|
||||
"3 07d39400e9 1731883653 \n",
|
||||
"4 03f31af6e2 1731883662 "
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
import os
|
||||
|
||||
from pinecone import Pinecone
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
from utils import setup_logging
|
||||
|
||||
logger = setup_logging()
|
||||
|
||||
|
||||
# Set up Pinecone client
|
||||
api_key = os.getenv("PINCEONE_API_KEY")
|
||||
shards = os.getenv("PINECONE_SHARDS")
|
||||
@@ -32,10 +36,11 @@ def vectorize(article:str) -> list[float]:
|
||||
|
||||
|
||||
def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str): # [ ] Check if the data is being upserted.
    """Upsert ``vectors`` into ``index`` under ``namespace`` and log the response.

    Args:
        index: Index object whose ``upsert`` method is called.
        vectors: Records passed through as the ``vectors`` keyword argument.
        namespace: Passed through as the ``namespace`` keyword argument.

    Returns:
        None. The upsert response is only written to the module logger.
    """
    response = index.upsert(
        vectors=vectors,
        namespace=namespace
    )
    # Log the raw response so the outcome of the upsert can be inspected.
    logger.info(f"Upserted vectors Response : {response}")
|
||||
|
||||
def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None): # [ ]: Make sure this is working.
|
||||
|
||||
|
||||
@@ -38,11 +38,13 @@ def pinecone_save_article(article:dict):
|
||||
logging.info(f"Article content into Pinecone")
|
||||
data["values"] = vectorize(article=article["content"])
|
||||
|
||||
data = list(data)
|
||||
|
||||
|
||||
namespace = os.getenv('PINECONE_NAMESPACE')
|
||||
|
||||
logger.info("Upserting article to Pinecone")
|
||||
upsert_vectors(index, [data], namespace)
|
||||
upsert_vectors(index, data, namespace)
|
||||
logger.info(f"Successfully upserted article w/ article-id: {article["article_id"]} to Pinecone with namespace {namespace}")
|
||||
|
||||
def dynamodb_save_article(article:dict):
|
||||
|
||||
36
todo.md
36
todo.md
@@ -3,22 +3,38 @@
|
||||
|
||||
* Vector Database Initialization at earlier phase. [ Done ]
|
||||
* Test out Vector Databases at Small Scale.
|
||||
* [ ] Testing
|
||||
* [x] Testing
|
||||
* [x] Fix OpenAI Error.
|
||||
* [x] Fix Pinecone Error
|
||||
* [ ] Fix input error.
|
||||
* [x] Fix input error.
|
||||
* [ ] Let it run for a day
|
||||
* Test out Vector Databases at Scale.
|
||||
* [x] Check Open AI Bill
|
||||
* [x] Check Vector Database Bill
|
||||
* [ ] Figure out Vector Database Bug.
|
||||
* [ ] Figure out the best way to store articles: as metadata or in S3.
|
||||
* [ ] Decrease the cost of CloudWatch Logs
|
||||
* [ ] Test out Vector Databases at Scale.
|
||||
* [ ] Add in text cleaning after ingesting the article but before storage.
|
||||
* [ ] Automate the monthly data ingestion job
|
||||
* [ ] Lambda Optimization
|
||||
|
||||
|
||||
* Monthly ingestion job
|
||||
* Protocol for annotating data.
|
||||
* Promotional Material
|
||||
* Stuff that is not news
|
||||
* Stuff that is news but like a list of news topics.
|
||||
* Annotating Keywords or topics of interest, or categorization.
|
||||
* Stuff that people wouldn't find genuinely interesting ( no interesting factor or novelty or news worthiness )
|
||||
* Annotating Entity Type Data.
|
||||
* Annotating Podcast Scripts or Video Scripts
|
||||
* [ ] DSPy Integration
|
||||
* [ ] LLMRouter integration
|
||||
* [ ] Main topic/Category ( list )
|
||||
* [ ] Writing Style ( e.g. Informal, professional, etc...)
|
||||
* [ ] Promotional Material ( 0=Not Promotional, 1=Promotional)
|
||||
* [ ] Stuff that is news ( 0= Not News, 1=News)
|
||||
* [ ] Stuff that is news but like a list of news topics. ( 0=Opposite, 1=News Topic Lists)
|
||||
* [ ] Annotating Entities ( List of Key entities with entity specific sentiment )
|
||||
* [ ] List of Major Events ( e.g. Ukraine War, Israel Palestine, etc... )
|
||||
* [ ] List of Minor Event ( e.g. Specific Battle, Court Case step, etc..)
|
||||
* [ ] Novelty Factor ( Scale from 0(Not Interesting) -> 100(Interesting))
|
||||
* [ ] Annotating Podcast Scripts or Video Scripts ( 0=is not a script, 1=Is a script)
|
||||
* [ ] Political Quadrant ( Or that eight dimensional thing that guy had. )
|
||||
Also we need to specify a consistent output format
|
||||
|
||||
* Estimation Algorithm for annotation cost.
|
||||
* Open Source Protocol for running this.
|
||||
|
||||
Reference in New Issue
Block a user