fixed partitioning strategy

2026-02-23 07:04:32 +01:00 · 2024-11-13 18:38:43 -05:00
parent 4841eebda4
commit 3ef61aef06
2 changed files with 12 additions and 8 deletions
--- a/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
+++ b/src/infra/lambdas/RSSFeedProcessorLambda/src/data_storage.py
@@ -3,7 +3,7 @@ import json
 import os
 import logging
 from random import randint
-
+from datetime import datetime
 # TODO: Move this article storage logic to a separate module inside of lambda. 
 # TODO: Get better at handling loading local moduels insdie of the lambdda. 
 from analytics.embeddings.vector_db import get_index, upsert_vectors, vectorize
@@ -47,15 +47,15 @@ def dynamodb_save_article(article:dict):
 def s3_save_article(article:dict):    
    logger.info("Saving article to S3")

-    rss_feed_id = article['rss_id']
+    now = datetime.now()
    article_id = article['article_id']
    logger.info(f"Content ")
-    if not rss_feed_id or not article_id:
+    if not article_id:
        logger.error(f"Missing rss_id or article_id in article: {article}")
        return

-    file_path = f"/tmp/{rss_feed_id}-{article_id}-article.json"
-    file_key = f"articles/{rss_feed_id}/{article_id}/article.json"
+    file_path = f"/tmp/{article_id}-article.json"
+    file_key = f"{now.year}/{now.month}/{now.day}/{article_id}.json"
    
    # Save article to /tmp json file
    with open(file_path, "w") as f:
--- a/todo.md
+++ b/todo.md
@@ -1,6 +1,8 @@
 # Before Public Launch
-* Testing from 3rd party aws account. [Today]
-* Make sure that the scraping works from a 3rd Party Account [ Today ]
+* Rethink Partitioning Strategy [Done]
+* Implement New Partitioning Strategy. [Next]
+- Incorporate: All the JSON data except for content and link. 
+  Partitioning Strategy: {Year}/{Month}/{Day}/{article_id}
 * API Tool - to Pull data that you have down. 
 * Tips on where to gather RSS Feeds. 
 * Public Launch Posts
@@ -11,4 +13,6 @@
 * Test out Vector Databases at Small Scale.
 * Test out Vector Databases at Scale. 
 * Test out LLM Summarizaiton At Small Scale
-* Test out LLM Summarization At Scall 
+* Test out LLM Summarization At Scale
+
+* Text Processing & Text Cleaning of Content during the ingestion step.