From 4cde35267b30c373159cf3e8d0dd6b72a645cdc5 Mon Sep 17 00:00:00 2001
From: Mike Harris
Date: Mon, 3 Apr 2023 12:51:50 -0400
Subject: [PATCH] Improve extract_hyperlinks to honor base url

---
Notes (ignored by git am):
  * extract_hyperlinks() now takes a base_url argument and resolves every
    href against it with requests.compat.urljoin, so relative links are
    returned as absolute URLs while absolute links pass through unchanged.
  * scrape_links() passes the URL it was given as that base.
  * tests/browse_tests.py adds a unit test covering one absolute and one
    relative link.

 scripts/browse.py     | 12 ++++++------
 tests/browse_tests.py | 26 ++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 6 deletions(-)
 create mode 100644 tests/browse_tests.py

diff --git a/scripts/browse.py b/scripts/browse.py
index f096c5f3..284ce5fc 100644
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -25,11 +25,11 @@ def scrape_text(url):
     return text
 
 
-def extract_hyperlinks(soup):
-    hyperlinks = []
-    for link in soup.find_all('a', href=True):
-        hyperlinks.append((link.text, link['href']))
-    return hyperlinks
+def extract_hyperlinks(soup, base_url):
+    return [
+        (link.text, requests.compat.urljoin(base_url, link["href"]))
+        for link in soup.find_all("a", href=True)
+    ]
 
 
 def format_hyperlinks(hyperlinks):
@@ -51,7 +51,7 @@ def scrape_links(url):
     for script in soup(["script", "style"]):
         script.extract()
 
-    hyperlinks = extract_hyperlinks(soup)
+    hyperlinks = extract_hyperlinks(soup, url)
 
     return format_hyperlinks(hyperlinks)
 
diff --git a/tests/browse_tests.py b/tests/browse_tests.py
new file mode 100644
index 00000000..1ac523ec
--- /dev/null
+++ b/tests/browse_tests.py
@@ -0,0 +1,26 @@
+import unittest
+import os
+import sys
+
+from bs4 import BeautifulSoup
+
+sys.path.append(os.path.abspath("../scripts"))
+
+from browse import extract_hyperlinks
+
+
+class TestBrowseLinks(unittest.TestCase):
+    def test_extract_hyperlinks(self):
+        body = """
+        <body>
+        <a href="https://google.com">Google</a>
+        <a href="foo.html">Foo</a>
+        <div>Some other crap</div>
+        </body>
+        """
+        soup = BeautifulSoup(body, "html.parser")
+        links = extract_hyperlinks(soup, "http://example.com")
+        self.assertEqual(
+            links,
+            [("Google", "https://google.com"), ("Foo", "http://example.com/foo.html")],
+        )