Improve extract_hyperlinks to resolve relative links against the base URL

This commit is contained in:
Mike Harris
2023-04-03 12:51:50 -04:00
parent 51e293f64c
commit 4cde35267b
2 changed files with 32 additions and 6 deletions

View File

@@ -25,11 +25,11 @@ def scrape_text(url):
return text
def extract_hyperlinks(soup):
    """Return a list of (link text, href) tuples for the anchors in *soup*.

    Only ``<a>`` elements that carry an ``href`` attribute are considered,
    and each href is returned exactly as it is written in the document.
    """
    anchors = soup.find_all('a', href=True)
    return [(anchor.text, anchor['href']) for anchor in anchors]
def extract_hyperlinks(soup, base_url):
    """Return (link text, absolute URL) tuples for the anchors in *soup*.

    Args:
        soup: Parsed document exposing ``find_all`` (the tests pass a
            ``BeautifulSoup`` instance).
        base_url: URL that relative ``href`` values are resolved against.

    Returns:
        A list of ``(text, url)`` tuples, one per ``<a>`` element that has an
        ``href`` attribute. An href that is already absolute is returned
        unchanged; a relative one is joined onto ``base_url``.
    """
    # Use the standard library directly instead of reaching through
    # requests.compat, which merely re-exports this same function.
    from urllib.parse import urljoin

    return [
        (link.text, urljoin(base_url, link["href"]))
        for link in soup.find_all("a", href=True)
    ]
def format_hyperlinks(hyperlinks):
@@ -51,7 +51,7 @@ def scrape_links(url):
for script in soup(["script", "style"]):
script.extract()
hyperlinks = extract_hyperlinks(soup)
hyperlinks = extract_hyperlinks(soup, url)
return format_hyperlinks(hyperlinks)

26
tests/browse_tests.py Normal file
View File

@@ -0,0 +1,26 @@
import unittest
import os
import sys
from bs4 import BeautifulSoup
# Locate the scripts directory relative to this test file rather than the
# current working directory, so the import below works no matter where the
# test runner is launched from.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "scripts"))
)
from browse import extract_hyperlinks
class TestBrowseLinks(unittest.TestCase):
    """Checks for the hyperlink helpers imported from the browse module."""

    def test_extract_hyperlinks(self):
        """Absolute hrefs pass through; relative hrefs gain the base URL."""
        # The literal is kept flush-left so its contents stay unchanged.
        html = """
<body>
<a href="https://google.com">Google</a>
<a href="foo.html">Foo</a>
<div>Some other crap</div>
</body>
"""
        expected = [
            ("Google", "https://google.com"),
            ("Foo", "http://example.com/foo.html"),
        ]
        parsed = BeautifulSoup(html, "html.parser")
        actual = extract_hyperlinks(parsed, "http://example.com")
        self.assertEqual(actual, expected)