Mirror of https://github.com/aljazceru/IngestRSS.git, synced 2026-02-19 13:14:21 +01:00
Added text cleaning operations.
@@ -0,0 +1,14 @@
import re


def remove_newlines(text: str) -> str:
    return text.replace('\n', '')


def remove_urls(text: str) -> str:
    url_pattern = re.compile(r'http\S+|www\S+')
    return url_pattern.sub('', text)


def clean_text(text: str) -> str:
    text = remove_newlines(text)
    text = remove_urls(text)
    return text
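A quick check of the new helpers (a minimal sketch: the module name article_cleaning is taken from the import added later in this commit, and the sample string is made up):

# sketch: exercising the new cleaning helpers on a made-up string
from article_cleaning import clean_text, remove_newlines, remove_urls

raw = "Breaking news\nRead more at https://example.com/story or www.example.org"
print(repr(remove_newlines(raw)))
# 'Breaking newsRead more at https://example.com/story or www.example.org'
print(repr(remove_urls(raw)))
# 'Breaking news\nRead more at  or '
print(repr(clean_text(raw)))
# 'Breaking newsRead more at  or '

As the first output shows, replacing '\n' with the empty string glues adjacent words together; whether that matters depends on how the cleaned text is consumed downstream.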
@@ -1,6 +1,8 @@
import newspaper
import logging


logger = logging.getLogger()


def extract_article(url):
@@ -21,7 +23,8 @@ def extract_article(url):
        article.download()
        logger.debug(f"Downloaded Article {url}")
        article.parse()
        logger.debug(f"Parsed Article {url}")

        return article.title, article.text
    except Exception as e:
        logger.error(f"Failed to extract article {url}: {str(e)}")
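For reference, a hedged sketch of calling the extractor (the URL is made up; the except branch visible in the hunk only logs, so if nothing follows it in the full function, extract_article returns None on failure):

from article_extractor import extract_article

# illustrative call; the URL is made up
result = extract_article("https://example.com/sample-post")
if result is not None:
    title, text = result
else:
    # extraction failed; the error was already logged inside extract_article
    title, text = "", ""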
@@ -6,6 +6,7 @@ import threading
import logging
from utils import generate_key
from article_extractor import extract_article
from article_cleaning import clean_text

logger = logging.getLogger()
@@ -47,7 +48,8 @@ def extract_feed_threading(rss: dict, output_queue, stop_thread):
        pub_date = parse_pub_date(entry['published'])

        if pub_date > last_date:
            title, text = extract_article(entry.link)
            title, text = clean_text(title), clean_text(text)
            article = {
                'link': entry.link,
                'rss': feed_url,
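The extract-then-clean pair added in this hunk could also be read as a single step; a hypothetical wrapper capturing it (the helper name is illustrative and not part of the commit):

from article_extractor import extract_article
from article_cleaning import clean_text

# hypothetical wrapper for the extract-then-clean step shown above
def extract_clean_article(url: str):
    result = extract_article(url)
    if not result:  # failures are only logged by the extractor
        return None, None
    title, text = result
    return clean_text(title), clean_text(text)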
@@ -1,4 +0,0 @@
from .query.querier import ArticleQuerier
from .batch.downloader import S3BatchDownloader

__all__ = ['ArticleQuerier', 'S3BatchDownloader']