Added text cleaning operations.

Charles-Gormley
2024-11-23 20:16:40 -05:00
parent 28148f1181
commit 71be98e993
7 changed files with 207 additions and 86 deletions

.gitignore
View File

@@ -4,3 +4,4 @@ repo_structure.txt
*__pycache__*
*feeds.json
todo.md
*.csv

File diff suppressed because one or more lines are too long

article_cleaning.py
View File

@@ -0,0 +1,14 @@
import re

def remove_newlines(text: str) -> str:
    return text.replace('\n', '')

def remove_urls(text: str) -> str:
    url_pattern = re.compile(r'http\S+|www\S+')
    return url_pattern.sub('', text)

def clean_text(text: str) -> str:
    text = remove_newlines(text)
    text = remove_urls(text)
    return text
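For quick reference, a minimal usage sketch of the new helpers. The module name article_cleaning is inferred from the import added in the feed extractor later in this commit; the sample string is invented:

from article_cleaning import clean_text

raw = "Breaking story\nDetails at https://example.com/story"
print(clean_text(raw))
# -> "Breaking storyDetails at " (replacing '\n' with '' joins adjacent words)

Note that remove_newlines substitutes an empty string, so words split across lines get concatenated; substituting a space instead would preserve word boundaries.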

article_extractor.py
View File

@@ -1,6 +1,8 @@
import newspaper
import logging
logger = logging.getLogger()
def extract_article(url):
@@ -21,7 +23,8 @@ def extract_article(url):
        article.download()
        logger.debug(f"Downloaded Article {url}")
        article.parse()
        logger.debug(f"Parsed Article {url}")
        return article.title, article.text
    except Exception as e:
        logger.error(f"Failed to extract article {url}: {str(e)}")
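A minimal calling sketch for the extractor. The full module body is not shown in this hunk, and the idea that a failed extraction falls through and returns None is an assumption based on the except branch above:

from article_extractor import extract_article

result = extract_article("https://example.com/some-article")  # placeholder URL
if result is not None:  # assumption: the except branch only logs, implicitly returning None
    title, text = result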

View File

@@ -6,6 +6,7 @@ import threading
import logging
from utils import generate_key
from article_extractor import extract_article
from article_cleaning import clean_text
logger = logging.getLogger()
@@ -47,7 +48,8 @@ def extract_feed_threading(rss: dict, output_queue, stop_thread):
    pub_date = parse_pub_date(entry['published'])
    if pub_date > last_date:
        title, text = extract_article(entry.link)
        title, text = clean_text(title), clean_text(text)
        article = {
            'link': entry.link,
            'rss': feed_url,
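For context, a hypothetical wiring of extract_feed_threading based only on the signature shown in this hunk; the real caller is not part of this diff, and the rss dict contents and the type of stop_thread are assumptions:

from queue import Queue
import threading

# extract_feed_threading comes from the module diffed above (its name is not shown)
output_queue = Queue()           # collects extracted articles from the worker
stop_thread = threading.Event()  # assumption: a stop flag; the actual type is not shown
worker = threading.Thread(
    target=extract_feed_threading,
    args=({'url': 'https://example.com/feed.xml'}, output_queue, stop_thread),
)
worker.start()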

__init__.py
View File

@@ -1,4 +0,0 @@
from .query.querier import ArticleQuerier
from .batch.downloader import S3BatchDownloader
__all__ = ['ArticleQuerier', 'S3BatchDownloader']

todo.md
View File

@@ -3,12 +3,11 @@
[x] Faster Batch Responses ( Parallelization of requests. )
[x] Query API ( Technical Feasibility / is S3 even good for this. )
* Research on Text Processing & Text Cleaning of Content during ingestion step. [ Done ]
* Do this inside the lambda code. [ Today ]
* Test out Vector Databases at Small Scale.
* Test out Vector Databases at Scale.
* Test out LLM Summarization At Small Scale.
* Test out LLM Summarization At Scale.
* Text Processing & Text Cleaning of Content during ingestion step.
* Fix up ReadMe
* Publish RoadMap.
@@ -18,3 +17,13 @@
* Twitter
* Kaggle
* Protocol for annotating data.
* Promotional Material
* Stuff that is not news
* Stuff that is news but like a list of news topics.
* Annotating Keywords or topics of interest, or categorization.
* Stuff that people wouldn't find genuinely interesting ( no interest factor, novelty, or newsworthiness )
* Annotating Entity Type Data.
* Annotating Podcast Scripts or Video Scripts
* Estimation Algorithm for annotation cost.
* Open Source Protocol for running this.