diff --git a/autogpt/commands/web_requests.py b/autogpt/commands/web_requests.py
index 230b1ff0..051cc710 100644
--- a/autogpt/commands/web_requests.py
+++ b/autogpt/commands/web_requests.py
@@ -3,6 +3,7 @@ from typing import List, Tuple, Union
 from urllib.parse import urljoin, urlparse
 
 import requests
+from requests.compat import urljoin
 from requests import Response
 from bs4 import BeautifulSoup
 
@@ -134,19 +135,20 @@ def scrape_text(url: str) -> str:
     return text
 
 
-def extract_hyperlinks(soup: BeautifulSoup) -> List[Tuple[str, str]]:
+def extract_hyperlinks(soup: BeautifulSoup, base_url: str) -> List[Tuple[str, str]]:
     """Extract hyperlinks from a BeautifulSoup object
 
     Args:
         soup (BeautifulSoup): The BeautifulSoup object
+        base_url (str): The base URL
 
     Returns:
         List[Tuple[str, str]]: The extracted hyperlinks
     """
-    hyperlinks = []
-    for link in soup.find_all("a", href=True):
-        hyperlinks.append((link.text, link["href"]))
-    return hyperlinks
+    return [
+        (link.text, urljoin(base_url, link["href"]))
+        for link in soup.find_all("a", href=True)
+    ]
 
 
 def format_hyperlinks(hyperlinks: List[Tuple[str, str]]) -> List[str]:
@@ -183,7 +185,7 @@ def scrape_links(url: str) -> Union[str, List[str]]:
     for script in soup(["script", "style"]):
         script.extract()
 
-    hyperlinks = extract_hyperlinks(soup)
+    hyperlinks = extract_hyperlinks(soup, url)
 
     return format_hyperlinks(hyperlinks)
 
diff --git a/tests/browse_tests.py b/tests/browse_tests.py
new file mode 100644
index 00000000..1ac523ec
--- /dev/null
+++ b/tests/browse_tests.py
@@ -0,0 +1,26 @@
+import unittest
+import os
+import sys
+
+from bs4 import BeautifulSoup
+
+sys.path.append(os.path.abspath("../scripts"))
+
+from browse import extract_hyperlinks
+
+
+class TestBrowseLinks(unittest.TestCase):
+    def test_extract_hyperlinks(self):
+        body = """
+        <body>
+        <a href="https://google.com">Google</a>
+        <a href="foo.html">Foo</a>
+        <div>Some other crap</div>
+        </body>
+        """
+        soup = BeautifulSoup(body, "html.parser")
+        links = extract_hyperlinks(soup, "http://example.com")
+        self.assertEqual(
+            links,
+            [("Google", "https://google.com"), ("Foo", "http://example.com/foo.html")],
+        )