Improve extract_hyperlinks to honor the base URL

2026-02-19 13:14:26 +01:00 · 2023-04-03 12:51:50 -04:00
parent 51e293f64c
commit 4cde35267b
2 changed files with 32 additions and 6 deletions
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -25,11 +25,11 @@ def scrape_text(url):
    return text


-def extract_hyperlinks(soup):
-    hyperlinks = []
-    for link in soup.find_all('a', href=True):
-        hyperlinks.append((link.text, link['href']))
-    return hyperlinks
+def extract_hyperlinks(soup, base_url):
+    """Return (text, URL) pairs for soup's <a href> tags, resolved against base_url."""
+    anchors = soup.find_all("a", href=True)
+    resolve = requests.compat.urljoin
+    return [(anchor.text, resolve(base_url, anchor["href"])) for anchor in anchors]


 def format_hyperlinks(hyperlinks):
@@ -51,7 +51,7 @@ def scrape_links(url):
    for script in soup(["script", "style"]):
        script.extract()

-    hyperlinks = extract_hyperlinks(soup)
+    hyperlinks = extract_hyperlinks(soup, url)

    return format_hyperlinks(hyperlinks)

--- a/tests/browse_tests.py
+++ b/tests/browse_tests.py
@@ -0,0 +1,26 @@
+import unittest
+import os
+import sys
+
+from bs4 import BeautifulSoup
+# Resolve relative to this file, not the CWD, so the tests run from any directory.
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../scripts")))
+
+from browse import extract_hyperlinks
+
+
+class TestBrowseLinks(unittest.TestCase):
+    def test_extract_hyperlinks(self):
+        # Absolute hrefs stay as-is; relative hrefs resolve against the base URL.
+        html = """
+        <body>
+        <a href="https://google.com">Google</a>
+        <a href="foo.html">Foo</a>
+        <div>Some other crap</div>
+        </body>
+        """
+        parsed = BeautifulSoup(html, "html.parser")
+        self.assertEqual(
+            extract_hyperlinks(parsed, "http://example.com"),
+            [("Google", "https://google.com"), ("Foo", "http://example.com/foo.html")],
+        )