Added text cleaning operations.

Charles-Gormley
2024-11-23 20:16:40 -05:00
parent 28148f1181
commit 71be98e993
7 changed files with 207 additions and 86 deletions

.gitignore
View File

@@ -4,3 +4,4 @@ repo_structure.txt
*__pycache__*
*feeds.json
todo.md
*.csv

File diff suppressed because one or more lines are too long

article_cleaning.py
View File

@@ -0,0 +1,14 @@
import re

def remove_newlines(text: str) -> str:
    return text.replace('\n', '')

def remove_urls(text: str) -> str:
    url_pattern = re.compile(r'http\S+|www\S+')
    return url_pattern.sub('', text)

def clean_text(text: str) -> str:
    text = remove_newlines(text)
    text = remove_urls(text)
    return text
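For quick reference, a minimal usage sketch of the new helpers. The module name article_cleaning is inferred from the import added in the feed extractor later in this commit; the sample string is invented:

from article_cleaning import clean_text

raw = "Breaking story\nDetails at https://example.com/story"
print(clean_text(raw))
# -> "Breaking storyDetails at " (replacing '\n' with '' joins adjacent words)

Note that remove_newlines substitutes an empty string, so words split across lines get concatenated; substituting a space instead would preserve word boundaries.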

article_extractor.py
View File

@@ -1,6 +1,8 @@
import newspaper
import logging
logger = logging.getLogger()
def extract_article(url):
@@ -21,7 +23,8 @@ def extract_article(url):
        article.download()
        logger.debug(f"Downloaded Article {url}")
        article.parse()
        logger.debug(f"Parsed Article {url}")
        return article.title, article.text
    except Exception as e:
        logger.error(f"Failed to extract article {url}: {str(e)}")
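A minimal calling sketch for the extractor. The full module body is not shown in this hunk, and the idea that a failed extraction falls through and returns None is an assumption based on the except branch above:

from article_extractor import extract_article

result = extract_article("https://example.com/some-article")  # placeholder URL
if result is not None:  # assumption: the except branch only logs, implicitly returning None
    title, text = result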

View File

@@ -6,6 +6,7 @@ import threading
import logging
from utils import generate_key
from article_extractor import extract_article
from article_cleaning import clean_text
logger = logging.getLogger()
@@ -47,7 +48,8 @@ def extract_feed_threading(rss: dict, output_queue, stop_thread):
    pub_date = parse_pub_date(entry['published'])
    if pub_date > last_date:
        title, text = extract_article(entry.link)
        title, text = clean_text(title), clean_text(text)
        article = {
            'link': entry.link,
            'rss': feed_url,
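For context, a hypothetical wiring of extract_feed_threading based only on the signature shown in this hunk; the real caller is not part of this diff, and the rss dict contents and the type of stop_thread are assumptions:

from queue import Queue
import threading

# extract_feed_threading comes from the module diffed above (its name is not shown)
output_queue = Queue()           # collects extracted articles from the worker
stop_thread = threading.Event()  # assumption: a stop flag; the actual type is not shown
worker = threading.Thread(
    target=extract_feed_threading,
    args=({'url': 'https://example.com/feed.xml'}, output_queue, stop_thread),
)
worker.start()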

__init__.py
View File

@@ -1,4 +0,0 @@
from .query.querier import ArticleQuerier
from .batch.downloader import S3BatchDownloader
__all__ = ['ArticleQuerier', 'S3BatchDownloader']

todo.md
View File

@@ -3,12 +3,11 @@
[x] Faster Batch Responses ( Parallelization of requests. )
[x] Query API ( Technical Feasibility / is S3 even good for this. )
* Research on Text Processing & Text Cleaning of Content during ingestion step. [ Done ]
* Do this inside the lambda code. [ Today ]
* Test out Vector Databases at Small Scale.
* Test out Vector Databases at Scale.
* Test out LLM Summarization At Small Scale.
* Test out LLM Summarization At Scale.
* Text Processing & Text Cleaning of Content during ingestion step.
* Fix up ReadMe
* Publish RoadMap.
@@ -18,3 +17,13 @@
* Twitter
* Kaggle
* Protocol for annotating data.
* Promotional Material
* Stuff that is not news
* Stuff that is news but like a list of news topics.
* Annotating Keywords or topics of interest, or categorization.
* Stuff that people wouldn't find genuinely interesting ( no interest factor, novelty, or newsworthiness )
* Annotating Entity Type Data.
* Annotating Podcast Scripts or Video Scripts
* Estimation Algorithm for annotation cost.
* Open Source Protocol for running this.