vector databases are launched

This commit is contained in:
Charles-Gormley
2024-12-01 00:59:15 -05:00
parent 9dda7ff55a
commit b3c9832fdb
7 changed files with 120 additions and 14 deletions

View File

@@ -166,15 +166,39 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 23857 objects to process\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 23857/23857 [02:37<00:00, 151.22object/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded data to November-3.csv in 219.71 seconds\n"
]
}
],
"source": [
"start = time()\n",
"output_path = \"November-3.csv\" # or \"consolidated_data.json\"\n",
"\n",
"\n",
"\n",
"# Define date range\n",
"start_date = \"2024-11-8\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n",
"start_date = \"2024-11-28\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n",
"end_date = \"2024-11-30\"\n",
"\n",
"# Start downloading\n",
@@ -195,6 +219,82 @@
"# Aggregating"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Aggregated DataFrame:\n",
" link \\\n",
"0 https://www.npr.org/2022/11/04/1134434712/plan... \n",
"1 https://www.communitysignal.com/embodying-work... \n",
"2 https://identityweek.net/tech5-awarded-a-15-ye... \n",
"3 https://edtechmagazine.com/k12/article/2024/11... \n",
"4 https://www.ufc.com/news/updates-ufc-309-jones... \n",
"\n",
" rss \\\n",
"0 https://www.npr.org/rss/podcast.php?id=510289 \n",
"1 https://www.communitysignal.com/feed/ \n",
"2 https://www.planetbiometrics.com/rss/ \n",
"3 https://www.edtechmagazine.com/k12/rss.xml?tax... \n",
"4 https://www.ufc.com/rss/news \n",
"\n",
" title \\\n",
"0 The crisis pollsters face ahead of the midterm... \n",
"1 Embodying Work-Life Balance as a Community Pro... \n",
"2 TECH5 awarded a 15-year contract with Virginia... \n",
"3 How Can You Prepare Your Cloud to Safely Imple... \n",
"4 Updates To UFC 309: Jones vs Miocic \n",
"\n",
" content unixTime rss_id \\\n",
"0 Planet Money tries election polling\\n\\nEnlarge... 1731884660 8181d7a585 \n",
"1 Are you able to step away from your community ... 1731883672 b6cef58d91 \n",
"2 TECH5 has announced a major contract win with ... 1731883632 084b136c50 \n",
"3 Many K12 schools that operate in the cloud ha... 1731883653 7827152faf \n",
"4 UFC 309: JONES vs MIOCIC Updates:\\n\\nDue to me... 1731883662 2c774cd014 \n",
"\n",
" article_id llm_summary embedding unixtime \n",
"0 05dfb6bb11 NaN NaN 1731884660 \n",
"1 0418489c55 NaN NaN 1731883672 \n",
"2 0bb2ec0554 NaN NaN 1731883632 \n",
"3 07d39400e9 NaN NaN 1731883653 \n",
"4 03f31af6e2 NaN NaN 1731883662 \n"
]
}
],
"source": [
"import pandas as pd\n",
"import glob\n",
"\n",
"# List to hold dataframes\n",
"dataframes = []\n",
"\n",
"# Loop through the files November-1.csv to November-3.csv\n",
"for i in range(1, 4):\n",
" filename = f\"November-{i}.csv\"\n",
" try:\n",
" # Read each CSV and append to the list\n",
" df = pd.read_csv(filename)\n",
" dataframes.append(df)\n",
" except FileNotFoundError:\n",
"        print(f\"File {filename} not found, skipping.\")\n",
"\n",
"# Aggregate the dataframes\n",
"if dataframes:\n",
" aggregated_df = pd.concat(dataframes, ignore_index=True)\n",
" print(\"Aggregated DataFrame:\")\n",
" print(aggregated_df.head())\n",
"else:\n",
" print(\"No files were aggregated.\")\n",
"\n",
"# If you want to save the result to a new CSV\n",
"aggregated_df.to_csv(\"Aggregated_November.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,

View File

@@ -207,7 +207,7 @@ def deploy_infrastructure():
metric=vector_search_metric,
spec = ServerlessSpec(
cloud="aws",
region=os.getenv("AWS_REGION"),
region="us-east-1",
),
)

View File

@@ -24,8 +24,8 @@ def get_index():
def vectorize(article:str) -> list[float]:
response = client.embeddings.create(
input=article,
model=os.getenv('OPENAI_EMBEDDING_MODEL', 'text')
input=article, # FIXME: This fails when article is something else, find what the 'something else' is and implement fix.
model=os.getenv('OPENAI_EMBEDDING_MODEL', 'text-')
)
return response.data[0].embedding

View File

@@ -29,16 +29,21 @@ def save_article(article:dict, strategy:str):
def pinecone_save_article(article:dict):
logger.info("Saving article to Pinecone")
index = get_index()
# Expected Keys from Pinecone *MUST* include 'id' and 'values'
article["id"] = article["article_id"]
article["values"] = vectorize(article["content"])
data = dict()
logging.info(f"Article ID into Pinecone")
data["id"] = article["article_id"]
logging.info(f"Article content into Pinecone")
data["values"] = vectorize(article=article["content"])
namespace = os.getenv('PINECONE_NAMESPACE')
logger.info("Upserting article to Pinecone")
upsert_vectors(index, [article], namespace)
logger.info(f"Successfully upserted article w/ article-id: {article["article_id"]} to Pinecone index {index.name} with namespace {namespace}")
upsert_vectors(index, [data], namespace)
logger.info(f"Successfully upserted article w/ article-id: {article["article_id"]} to Pinecone with namespace {namespace}")
def dynamodb_save_article(article:dict):
pass

View File

@@ -29,8 +29,6 @@ def lambda_handler(event, context):
feed = event["Records"][0]["body"]
logger.info(f"Received message from SQS: {feed}")
feed = json.loads(feed)
receipt_handle = event["Records"][0]['receiptHandle']

View File

@@ -65,7 +65,8 @@ def update_env_vars(function_name):
'VECTOR_SEARCH_METRIC': os.environ.get('VECTOR_SEARCH_METRIC'),
# OpenAI Configuration
'OPENAI_API_KEY': os.environ.get('OPENAI_API_KEY')
'OPENAI_API_KEY': os.environ.get('OPENAI_API_KEY'),
"OPENAI_EMBEDDING_MODEL": os.environ.get('OPENAI_EMBEDDING_MODEL'),
}
return lambda_client.update_function_configuration(

View File

@@ -4,7 +4,9 @@
* Vector Database Initialization at earlier phase. [ Done ]
* Test out Vector Databases at Small Scale.
* [ ] Testing
* [ ] Fix OpenAI Error.
* [x] Fix OpenAI Error.
* [x] Fix Pinecone Error
* [ ] Fix input error.
* [ ] Let it run for a day
* Test out Vector Databases at Scale.