mirror of
https://github.com/aljazceru/Auto-GPT.git
synced 2026-01-07 16:24:19 +01:00
Merge branch 'transcribe_audio_huggingface' of https://github.com/gucky92/Auto-GPT into transcribe_audio_huggingface
This commit is contained in:
@@ -3,6 +3,7 @@ from typing import List, Tuple, Union
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import requests
|
||||
from requests.compat import urljoin
|
||||
from requests import Response
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
@@ -134,19 +135,20 @@ def scrape_text(url: str) -> str:
|
||||
return text
|
||||
|
||||
|
||||
def extract_hyperlinks(soup: BeautifulSoup, base_url: str) -> List[Tuple[str, str]]:
    """Extract hyperlinks from a BeautifulSoup object

    Each ``<a>`` tag that carries an ``href`` attribute yields one
    ``(link text, URL)`` pair. The ``href`` is resolved against ``base_url``
    with ``urljoin``, so a relative link such as ``foo.html`` comes back as an
    absolute URL, while a link that is already absolute keeps its own scheme
    and host.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object
        base_url (str): The base URL

    Returns:
        List[Tuple[str, str]]: The extracted hyperlinks
    """
    # href=True restricts the search to anchors that actually have an href,
    # so the link["href"] lookup below cannot raise KeyError.
    return [
        (link.text, urljoin(base_url, link["href"]))
        for link in soup.find_all("a", href=True)
    ]
|
||||
|
||||
|
||||
def format_hyperlinks(hyperlinks: List[Tuple[str, str]]) -> List[str]:
|
||||
@@ -183,7 +185,7 @@ def scrape_links(url: str) -> Union[str, List[str]]:
|
||||
for script in soup(["script", "style"]):
|
||||
script.extract()
|
||||
|
||||
hyperlinks = extract_hyperlinks(soup)
|
||||
hyperlinks = extract_hyperlinks(soup, url)
|
||||
|
||||
return format_hyperlinks(hyperlinks)
|
||||
|
||||
|
||||
26
tests/browse_tests.py
Normal file
26
tests/browse_tests.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import unittest
|
||||
import os
|
||||
import sys
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Locate the scripts directory relative to this test file instead of the
# current working directory, so the path added here is the same no matter
# where the test runner is launched from.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "scripts"))
)
|
||||
|
||||
from browse import extract_hyperlinks
|
||||
|
||||
|
||||
class TestBrowseLinks(unittest.TestCase):
    """Tests for hyperlink extraction from parsed HTML."""

    def test_extract_hyperlinks(self):
        """An absolute href is kept as-is; a relative href is joined to the base URL."""
        # One absolute link, one relative link, and one non-link element.
        markup = """
        <body>
        <a href="https://google.com">Google</a>
        <a href="foo.html">Foo</a>
        <div>Some other crap</div>
        </body>
        """
        expected = [
            ("Google", "https://google.com"),
            ("Foo", "http://example.com/foo.html"),
        ]

        parsed = BeautifulSoup(markup, "html.parser")
        actual = extract_hyperlinks(parsed, "http://example.com")

        self.assertEqual(actual, expected)
|
||||
Reference in New Issue
Block a user