Removes scrape_main_content function.

This commit is contained in:
Torantulino
2023-03-29 09:43:32 +01:00
parent 6d796d222d
commit 693d141c86
2 changed files with 1 addition and 18 deletions

View File

@@ -24,23 +24,6 @@ def scrape_text(url):
return text
def scrape_main_content(url):
    """Download *url* and return the plain text of its main content.

    The response body is passed through ``Document(...).summary()``
    (presumably readability's ``Document`` -- confirm against the file's
    imports) and the resulting markup is flattened to text with
    BeautifulSoup.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text, with text fragments stripped and joined by
        newlines.
    """
    page = requests.get(url)

    # Let Readability pick out the main content of the page.
    main_html = Document(page.text).summary()

    # NOTE: a minimum-length check (50 characters) that would fall back to
    # a custom extractor was sketched here but is currently disabled.
    return BeautifulSoup(main_html, "html.parser").get_text('\n', strip=True)
def split_text(text, max_length=8192):
paragraphs = text.split("\n")
current_length = 0

View File

@@ -60,7 +60,7 @@ def google_search(query, num_results = 3):
return json.dumps(search_results, ensure_ascii=False, indent=4)
def transcribe_summarise(url):
    """Fetch the text of *url* and return a summary of it.

    The page text is obtained with ``browse.scrape_text`` and condensed
    with ``browse.summarize_text``.

    Args:
        url: Address of the page to summarise.

    Returns:
        The summary prefixed with the literal label ``"Result" :``.
    """
    # One fetch is enough: the former browse.scrape_main_content(url) call
    # was dropped because its result was overwritten before ever being used.
    text = browse.scrape_text(url)
    summary = browse.summarize_text(text)
    return """ "Result" : """ + summary