Added text cleaning operations.

Charles-Gormley
2024-11-23 20:16:40 -05:00
parent 28148f1181
commit 71be98e993
7 changed files with 207 additions and 86 deletions

View File

@@ -0,0 +1,14 @@
import re

def remove_newlines(text: str) -> str:
    # Drop newline characters so each article collapses to a single line.
    return text.replace('\n', '')

def remove_urls(text: str) -> str:
    # Strip http(s) links and bare www addresses up to the next whitespace.
    url_pattern = re.compile(r'http\S+|www\S+')
    return url_pattern.sub('', text)

def clean_text(text: str) -> str:
    # Apply all cleaning steps in order.
    text = remove_newlines(text)
    text = remove_urls(text)
    return text
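
A quick sanity check of the new helpers (the sample string is illustrative, not part of the commit):

from article_cleaning import clean_text

raw = "Breaking:\nfull story at https://example.com/story"
print(clean_text(raw))
# -> 'Breaking:full story at '

Note that remove_newlines deletes the character rather than replacing it with a space, so words on adjacent lines get joined ("Breaking:full"); substituting ' ' for '\n' would avoid that if it matters downstream.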

View File

@@ -1,6 +1,8 @@
import newspaper
import logging
logger = logging.getLogger()
def extract_article(url):
@@ -21,7 +23,8 @@ def extract_article(url):
        article.download()
        logger.debug(f"Downloaded Article {url}")
        article.parse()
        logger.debug(f"Parsed Article {url}")
        return article.title, article.text
    except Exception as e:
        logger.error(f"Failed to extract article {url}: {str(e)}")
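
As committed, the except branch only logs, so extract_article implicitly returns None on a failed download or parse (assuming nothing follows the handler in the full file). A caller-side guard is one way to cope; a minimal sketch with a hypothetical URL:

from article_extractor import extract_article

result = extract_article("https://example.com/story")  # hypothetical URL
if result is None:
    # Extraction failed; the error was already logged inside the function.
    pass
else:
    title, text = result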

View File

@@ -6,6 +6,7 @@ import threading
import logging
from utils import generate_key
from article_extractor import extract_article
from article_cleaning import clean_text
logger = logging.getLogger()
@@ -47,7 +48,8 @@ def extract_feed_threading(rss: dict, output_queue, stop_thread):
        pub_date = parse_pub_date(entry['published'])
        if pub_date > last_date:
            title, text = extract_article(entry.link)
            # Run the new cleaning helpers over both title and body.
            title, text = clean_text(title), clean_text(text)
            article = {
                'link': entry.link,
                'rss': feed_url,
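
One caveat: clean_text expects a str, so if extract_article returns None after a failure (see the previous file), the tuple unpacking above raises a TypeError inside the worker thread. A guarded variant of that step, as a sketch rather than what the commit does:

result = extract_article(entry.link)
if result is None:
    continue  # failure was already logged inside extract_article; skip entry
title, text = clean_text(result[0]), clean_text(result[1])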

View File

@@ -1,4 +0,0 @@
from .query.querier import ArticleQuerier
from .batch.downloader import S3BatchDownloader
__all__ = ['ArticleQuerier', 'S3BatchDownloader']
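
With this __init__.py deleted, the package-level re-exports are gone, so consumers would import from the submodules directly. A sketch, using 'articles' as a stand-in for the real package name, which the diff does not show:

# 'articles' is a hypothetical package name; substitute the real one.
from articles.query.querier import ArticleQuerier
from articles.batch.downloader import S3BatchDownloader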