Merge branch 'transcribe_audio_huggingface' of https://github.com/gucky92/Auto-GPT into transcribe_audio_huggingface

This commit is contained in:
gucky92
2023-04-15 23:53:03 +02:00
2 changed files with 34 additions and 6 deletions

View File

@@ -3,6 +3,7 @@ from typing import List, Tuple, Union
from urllib.parse import urljoin, urlparse
import requests
from requests.compat import urljoin
from requests import Response
from bs4 import BeautifulSoup
@@ -134,19 +135,20 @@ def scrape_text(url: str) -> str:
return text
def extract_hyperlinks(soup: BeautifulSoup, base_url: str) -> List[Tuple[str, str]]:
    """Extract hyperlinks from a BeautifulSoup object

    Args:
        soup (BeautifulSoup): The BeautifulSoup object
        base_url (str): The base URL that relative hrefs are resolved against

    Returns:
        List[Tuple[str, str]]: One (link text, resolved URL) pair per
            ``<a>`` element that carries an ``href`` attribute
    """
    # urljoin resolves relative hrefs against base_url; an href that is
    # already absolute is returned as-is.
    return [
        (link.text, urljoin(base_url, link["href"]))
        for link in soup.find_all("a", href=True)
    ]
def format_hyperlinks(hyperlinks: List[Tuple[str, str]]) -> List[str]:
@@ -183,7 +185,7 @@ def scrape_links(url: str) -> Union[str, List[str]]:
for script in soup(["script", "style"]):
script.extract()
hyperlinks = extract_hyperlinks(soup)
hyperlinks = extract_hyperlinks(soup, url)
return format_hyperlinks(hyperlinks)

26
tests/browse_tests.py Normal file
View File

@@ -0,0 +1,26 @@
import unittest
import os
import sys
from bs4 import BeautifulSoup
sys.path.append(os.path.abspath("../scripts"))
from browse import extract_hyperlinks
class TestBrowseLinks(unittest.TestCase):
    """Tests for extract_hyperlinks as imported from the browse module."""

    def test_extract_hyperlinks(self):
        """An absolute href is kept and a relative href is joined to the base URL."""
        # Fixture: one absolute link, one relative link, and a non-anchor
        # element that must not show up in the result.
        body = """
<body>
<a href="https://google.com">Google</a>
<a href="foo.html">Foo</a>
<div>Some other crap</div>
</body>
"""
        expected = [
            ("Google", "https://google.com"),
            ("Foo", "http://example.com/foo.html"),
        ]
        parsed = BeautifulSoup(body, "html.parser")
        links = extract_hyperlinks(parsed, "http://example.com")
        self.assertEqual(links, expected)