From 4cde35267b30c373159cf3e8d0dd6b72a645cdc5 Mon Sep 17 00:00:00 2001
From: Mike Harris <mharris717@gmail.com>
Date: Mon, 3 Apr 2023 12:51:50 -0400
Subject: [PATCH] Improve extract_hyperlinks to honor base url

---
 scripts/browse.py     | 12 ++++++------
 tests/browse_tests.py | 26 ++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 6 deletions(-)
 create mode 100644 tests/browse_tests.py
diff --git a/scripts/browse.py b/scripts/browse.py
index f096c5f3..284ce5fc 100644
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -25,11 +25,11 @@ def scrape_text(url):
     return text
 
 
-def extract_hyperlinks(soup):
-    hyperlinks = []
-    for link in soup.find_all('a', href=True):
-        hyperlinks.append((link.text, link['href']))
-    return hyperlinks
+def extract_hyperlinks(soup, base_url):
+    """Return (text, URL) pairs for each <a href>, resolved against base_url."""
+    anchors = soup.find_all("a", href=True)
+    targets = [requests.compat.urljoin(base_url, a["href"]) for a in anchors]
+    return [(a.text, target) for a, target in zip(anchors, targets)]
 
 
 def format_hyperlinks(hyperlinks):
@@ -51,7 +51,7 @@ def scrape_links(url):
     for script in soup(["script", "style"]):
         script.extract()
 
-    hyperlinks = extract_hyperlinks(soup)
+    hyperlinks = extract_hyperlinks(soup, url)
 
     return format_hyperlinks(hyperlinks)
 
diff --git a/tests/browse_tests.py b/tests/browse_tests.py
new file mode 100644
index 00000000..1ac523ec
--- /dev/null
+++ b/tests/browse_tests.py
@@ -0,0 +1,26 @@
+import unittest
+import os
+import sys
+
+from bs4 import BeautifulSoup
+
+# Anchor on this file rather than the CWD so the tests run from any directory.
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "scripts")))
+from browse import extract_hyperlinks
+
+
+class TestBrowseLinks(unittest.TestCase):
+    def test_extract_hyperlinks(self):
+        """Absolute hrefs are kept as-is; relative hrefs are joined to the base URL."""
+        page = """
+        <body>
+        <a href="https://google.com">Google</a>
+        <a href="foo.html">Foo</a>
+        <div>Some other crap</div>
+        </body>
+        """
+        absolute = ("Google", "https://google.com")
+        relative = ("Foo", "http://example.com/foo.html")
+        parsed = BeautifulSoup(page, "html.parser")
+        found = extract_hyperlinks(parsed, "http://example.com")
+        self.assertEqual(found, [absolute, relative])