From 4cde35267b30c373159cf3e8d0dd6b72a645cdc5 Mon Sep 17 00:00:00 2001
From: Mike Harris
Date: Mon, 3 Apr 2023 12:51:50 -0400
Subject: [PATCH 1/2] Improve extract_hyperlinks to honor base url

---
 scripts/browse.py     | 12 ++++++------
 tests/browse_tests.py | 26 ++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 6 deletions(-)
 create mode 100644 tests/browse_tests.py

diff --git a/scripts/browse.py b/scripts/browse.py
index f096c5f3..284ce5fc 100644
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -25,11 +25,11 @@ def scrape_text(url):
     return text
 
 
-def extract_hyperlinks(soup):
-    hyperlinks = []
-    for link in soup.find_all('a', href=True):
-        hyperlinks.append((link.text, link['href']))
-    return hyperlinks
+def extract_hyperlinks(soup, base_url):
+    return [
+        (link.text, requests.compat.urljoin(base_url, link["href"]))
+        for link in soup.find_all("a", href=True)
+    ]
 
 
 def format_hyperlinks(hyperlinks):
@@ -51,7 +51,7 @@ def scrape_links(url):
     for script in soup(["script", "style"]):
         script.extract()
 
-    hyperlinks = extract_hyperlinks(soup)
+    hyperlinks = extract_hyperlinks(soup, url)
 
     return format_hyperlinks(hyperlinks)
 
diff --git a/tests/browse_tests.py b/tests/browse_tests.py
new file mode 100644
index 00000000..1ac523ec
--- /dev/null
+++ b/tests/browse_tests.py
@@ -0,0 +1,26 @@
+import unittest
+import os
+import sys
+
+from bs4 import BeautifulSoup
+
+sys.path.append(os.path.abspath("../scripts"))
+
+from browse import extract_hyperlinks
+
+
+class TestBrowseLinks(unittest.TestCase):
+    def test_extract_hyperlinks(self):
+        body = """
+        <body>
+        <a href="https://google.com">Google</a>
+        <a href="foo.html">Foo</a>
+        <div>Some other crap</div>
+        </body>
+        """
+        soup = BeautifulSoup(body, "html.parser")
+        links = extract_hyperlinks(soup, "http://example.com")
+        self.assertEqual(
+            links,
+            [("Google", "https://google.com"), ("Foo", "http://example.com/foo.html")],
+        )

From 52bb22d8d1c7665b4a7341bc450269dd430f1e36 Mon Sep 17 00:00:00 2001
From: BillSchumacher <34168009+BillSchumacher@users.noreply.github.com>
Date: Sat, 15 Apr 2023 16:20:43 -0500
Subject: [PATCH 2/2] Merge

---
 autogpt/commands/web_requests.py | 14 ++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/autogpt/commands/web_requests.py b/autogpt/commands/web_requests.py
index 230b1ff0..051cc710 100644
--- a/autogpt/commands/web_requests.py
+++ b/autogpt/commands/web_requests.py
@@ -3,6 +3,7 @@ from typing import List, Tuple, Union
 from urllib.parse import urljoin, urlparse
 
 import requests
+from requests.compat import urljoin
 from requests import Response
 from bs4 import BeautifulSoup
 
@@ -134,19 +135,20 @@ def scrape_text(url: str) -> str:
     return text
 
 
-def extract_hyperlinks(soup: BeautifulSoup) -> List[Tuple[str, str]]:
+def extract_hyperlinks(soup: BeautifulSoup, base_url: str) -> List[Tuple[str, str]]:
     """Extract hyperlinks from a BeautifulSoup object
 
     Args:
         soup (BeautifulSoup): The BeautifulSoup object
+        base_url (str): The base URL
 
     Returns:
         List[Tuple[str, str]]: The extracted hyperlinks
     """
-    hyperlinks = []
-    for link in soup.find_all("a", href=True):
-        hyperlinks.append((link.text, link["href"]))
-    return hyperlinks
+    return [
+        (link.text, urljoin(base_url, link["href"]))
+        for link in soup.find_all("a", href=True)
+    ]
 
 
 def format_hyperlinks(hyperlinks: List[Tuple[str, str]]) -> List[str]:
@@ -183,7 +185,7 @@ def scrape_links(url: str) -> Union[str, List[str]]:
     for script in soup(["script", "style"]):
         script.extract()
 
-    hyperlinks = extract_hyperlinks(soup)
+    hyperlinks = extract_hyperlinks(soup, url)
 
     return format_hyperlinks(hyperlinks)