mirror of
https://github.com/aljazceru/IngestRSS.git
synced 2026-02-01 12:24:28 +01:00
added in text cleaning operations.
.gitignore (vendored): 1 addition
@@ -4,3 +4,4 @@ repo_structure.txt
*__pycache__*
*feeds.json
todo.md
*.csv
File diff suppressed because one or more lines are too long
article_cleaning.py (new file)
@@ -0,0 +1,14 @@
import re

def remove_newlines(text: str) -> str:
    return text.replace('\n', '')

def remove_urls(text: str) -> str:
    url_pattern = re.compile(r'http\S+|www\S+')
    return url_pattern.sub('', text)


def clean_text(text: str) -> str:
    text = remove_newlines(text)
    text = remove_urls(text)
    return text
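A quick usage sketch of the new helpers, assuming the module is importable as article_cleaning (the feed-processing hunk further down imports it under that name):

from article_cleaning import clean_text, remove_urls

raw = "Breaking story\nRead more at https://example.com/article today"
print(remove_urls(raw))   # URL stripped, newline kept: 'Breaking story\nRead more at  today'
print(clean_text(raw))    # newline and URL both stripped: 'Breaking storyRead more at  today'

Note that remove_newlines joins lines with no separator, so words on either side of a line break end up concatenated ('storyRead' above), and the \S+ in the URL pattern also swallows any punctuation glued to the end of a link.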
article_extractor.py
@@ -1,6 +1,8 @@
import newspaper
import logging



logger = logging.getLogger()

def extract_article(url):
@@ -21,7 +23,8 @@ def extract_article(url):
        article.download()
        logger.debug(f"Downloaded Article {url}")
        article.parse()
        logger.debug(f"Parsed Article {url}")

        return article.title, article.text
    except Exception as e:
        logger.error(f"Failed to extract article {url}: {str(e)}")
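The hunk above only covers the download/parse section; a minimal sketch of how the full function presumably fits together (the newspaper.Article construction and the failure-path return value are assumptions, since the diff does not show them):

import logging

import newspaper

logger = logging.getLogger()


def extract_article(url):
    # Assumption: the lines not shown in the hunk build the Article roughly like this.
    article = newspaper.Article(url)
    try:
        article.download()
        logger.debug(f"Downloaded Article {url}")
        article.parse()
        logger.debug(f"Parsed Article {url}")
        return article.title, article.text
    except Exception as e:
        logger.error(f"Failed to extract article {url}: {str(e)}")
        return None  # assumption: what the real code returns on failure is not shown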
@@ -6,6 +6,7 @@ import threading
import logging
from utils import generate_key
from article_extractor import extract_article
from article_cleaning import clean_text

logger = logging.getLogger()
@@ -47,7 +48,8 @@ def extract_feed_threading(rss: dict, output_queue, stop_thread):
        pub_date = parse_pub_date(entry['published'])

        if pub_date > last_date:
            title, text = extract_article(entry.link)
            title, text = clean_text(title), clean_text(text)
            article = {
                'link': entry.link,
                'rss': feed_url,
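Putting the two pieces together, the per-entry ingestion step now cleans the extracted title and text before building the article record. A sketch under stated assumptions: only the lines shown in the hunk are confirmed, so the iteration over feed.entries, the remaining dict fields, and the output_queue hand-off are hypothetical.

for entry in feed.entries:  # assumption: iteration over the parsed feed's entries
    pub_date = parse_pub_date(entry['published'])

    if pub_date > last_date:
        title, text = extract_article(entry.link)
        title, text = clean_text(title), clean_text(text)
        article = {
            'link': entry.link,
            'rss': feed_url,
            'title': title,    # hypothetical field name
            'content': text,   # hypothetical field name
        }
        output_queue.put(article)  # hypothetical hand-off to the writer thread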
@@ -1,4 +0,0 @@
from .query.querier import ArticleQuerier
from .batch.downloader import S3BatchDownloader

__all__ = ['ArticleQuerier', 'S3BatchDownloader']
todo.md: 17 lines changed
@@ -3,12 +3,11 @@
[x] Faster Batch Responses ( Parallelization of requests. )
[x] Query API ( Technical Feasibility / is S3 even good for this. )

* Text Processing & Text Cleaning of Content during ingestion step Research [ Done ]
* Do this inside of the lambda code. [ Today ]

* Test out Vector Databases at Small Scale.
* Test out Vector Databases at Scale.
* Test out LLM Summarization At Small Scale.
* Test out LLM Summarization At Scale.

* Text Processing & Text Cleaning of Content during ingestion step.

* Fix up README
* Publish Roadmap.
@@ -18,3 +17,13 @@
* Twitter
* Kaggle

* Protocol for annotating data.
* Promotional Material
* Stuff that is not news
* Stuff that is news but like a list of news topics.
* Annotating Keywords or topics of interest, or categorization.
* Stuff that people wouldn't find genuinely interesting ( no interesting factor or novelty or newsworthiness )
* Annotating Entity Type Data.
* Annotating Podcast Scripts or Video Scripts
* Estimation Algorithm for annotation cost.
* Open Source Protocol for running this.