Added text cleaning operations.

Charles-Gormley
2024-11-23 20:16:40 -05:00
parent 28148f1181
commit 71be98e993
7 changed files with 207 additions and 86 deletions

View File

@@ -0,0 +1,14 @@
import re

def remove_newlines(text: str) -> str:
    # Drop newline characters so each article collapses to a single line.
    return text.replace('\n', '')

def remove_urls(text: str) -> str:
    # Strip http(s) links and bare www addresses up to the next whitespace.
    url_pattern = re.compile(r'http\S+|www\S+')
    return url_pattern.sub('', text)

def clean_text(text: str) -> str:
    # Apply all cleaning steps in order.
    text = remove_newlines(text)
    text = remove_urls(text)
    return text
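
A quick sanity check of the new helpers (the sample string is illustrative, not part of the commit):

from article_cleaning import clean_text

raw = "Breaking:\nfull story at https://example.com/story"
print(clean_text(raw))
# -> 'Breaking:full story at '

Note that remove_newlines deletes the character rather than replacing it with a space, so words on adjacent lines get joined ("Breaking:full"); substituting ' ' for '\n' would avoid that if it matters downstream.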

View File

@@ -1,6 +1,8 @@
import newspaper
import logging
logger = logging.getLogger()
def extract_article(url):
@@ -21,7 +23,8 @@ def extract_article(url):
        article.download()
        logger.debug(f"Downloaded Article {url}")
        article.parse()
        logger.debug(f"Parsed Article {url}")
        return article.title, article.text
    except Exception as e:
        logger.error(f"Failed to extract article {url}: {str(e)}")
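
As committed, the except branch only logs, so extract_article implicitly returns None on a failed download or parse (assuming nothing follows the handler in the full file). A caller-side guard is one way to cope; a minimal sketch with a hypothetical URL:

from article_extractor import extract_article

result = extract_article("https://example.com/story")  # hypothetical URL
if result is None:
    # Extraction failed; the error was already logged inside the function.
    pass
else:
    title, text = result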

View File

@@ -6,6 +6,7 @@ import threading
import logging
from utils import generate_key
from article_extractor import extract_article
from article_cleaning import clean_text
logger = logging.getLogger()
@@ -47,7 +48,8 @@ def extract_feed_threading(rss: dict, output_queue, stop_thread):
        pub_date = parse_pub_date(entry['published'])
        if pub_date > last_date:
            title, text = extract_article(entry.link)
            # Run the new cleaning helpers over both title and body.
            title, text = clean_text(title), clean_text(text)
            article = {
                'link': entry.link,
                'rss': feed_url,
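
One caveat: clean_text expects a str, so if extract_article returns None after a failure (see the previous file), the tuple unpacking above raises a TypeError inside the worker thread. A guarded variant of that step, as a sketch rather than what the commit does:

result = extract_article(entry.link)
if result is None:
    continue  # failure was already logged inside extract_article; skip entry
title, text = clean_text(result[0]), clean_text(result[1])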

View File

@@ -1,4 +0,0 @@
from .query.querier import ArticleQuerier
from .batch.downloader import S3BatchDownloader
__all__ = ['ArticleQuerier', 'S3BatchDownloader']
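
With this __init__.py deleted, the package-level re-exports are gone, so consumers would import from the submodules directly. A sketch, using 'articles' as a stand-in for the real package name, which the diff does not show:

# 'articles' is a hypothetical package name; substitute the real one.
from articles.query.querier import ArticleQuerier
from articles.batch.downloader import S3BatchDownloader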