fixed partitioning strategy

This commit is contained in:
Charles-Gormley
2024-11-13 18:38:43 -05:00
parent 4841eebda4
commit 3ef61aef06
2 changed files with 12 additions and 8 deletions

View File

@@ -3,7 +3,7 @@ import json
import os
import logging
from random import randint
from datetime import datetime
# TODO: Move this article storage logic to a separate module inside of lambda.
# TODO: Get better at handling loading local modules inside of the lambda.
from analytics.embeddings.vector_db import get_index, upsert_vectors, vectorize
@@ -47,15 +47,15 @@ def dynamodb_save_article(article:dict):
def s3_save_article(article:dict):
logger.info("Saving article to S3")
rss_feed_id = article['rss_id']
now = datetime.now()
article_id = article['article_id']
logger.info(f"Content ")
if not rss_feed_id or not article_id:
if not article_id:
logger.error(f"Missing rss_id or article_id in article: {article}")
return
file_path = f"/tmp/{rss_feed_id}-{article_id}-article.json"
file_key = f"articles/{rss_feed_id}/{article_id}/article.json"
file_path = f"/tmp/{article_id}-article.json"
file_key = f"{now.year}/{now.month}/{now.day}/{article_id}.json"
# Save article to /tmp json file
with open(file_path, "w") as f:

10
todo.md
View File

@@ -1,6 +1,8 @@
# Before Public Launch
* Testing from 3rd party AWS account. [Today]
* Make sure that the scraping works from a 3rd Party Account [ Today ]
* Rethink Partitioning Strategy [Done]
* Implement New Partitioning Strategy. [Next]
- Incorporate: All the json data except for content and link.
Partitioning Strategy: {Year}/{Month}/{Day}/{article_id}
* API Tool - to Pull data that you have down.
* Tips on where to gather RSS Feeds.
* Public Launch Posts
@@ -11,4 +13,6 @@
* Test out Vector Databases at Small Scale.
* Test out Vector Databases at Scale.
* Test out LLM Summarization At Small Scale
* Test out LLM Summarization At Scale
* Test out LLM Summarization At Scale
* Text Processing & Text Cleaning of Content during ingestion step.