From 4cde35267b30c373159cf3e8d0dd6b72a645cdc5 Mon Sep 17 00:00:00 2001
From: Mike Harris
Date: Mon, 3 Apr 2023 12:51:50 -0400
Subject: [PATCH 1/2] Improve extract_hyperlinks to honor base url

---
 scripts/browse.py     | 12 ++++++------
 tests/browse_tests.py | 26 ++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 6 deletions(-)
 create mode 100644 tests/browse_tests.py

diff --git a/scripts/browse.py b/scripts/browse.py
index f096c5f3..284ce5fc 100644
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -25,11 +25,11 @@ def scrape_text(url):
     return text
 
 
-def extract_hyperlinks(soup):
-    hyperlinks = []
-    for link in soup.find_all('a', href=True):
-        hyperlinks.append((link.text, link['href']))
-    return hyperlinks
+def extract_hyperlinks(soup, base_url):
+    return [
+        (link.text, requests.compat.urljoin(base_url, link["href"]))
+        for link in soup.find_all("a", href=True)
+    ]
 
 
 def format_hyperlinks(hyperlinks):
@@ -51,7 +51,7 @@ def scrape_links(url):
     for script in soup(["script", "style"]):
         script.extract()
 
-    hyperlinks = extract_hyperlinks(soup)
+    hyperlinks = extract_hyperlinks(soup, url)
 
     return format_hyperlinks(hyperlinks)
 
diff --git a/tests/browse_tests.py b/tests/browse_tests.py
new file mode 100644
index 00000000..1ac523ec
--- /dev/null
+++ b/tests/browse_tests.py
@@ -0,0 +1,26 @@
+import unittest
+import os
+import sys
+
+from bs4 import BeautifulSoup
+
+sys.path.append(os.path.abspath("../scripts"))
+
+from browse import extract_hyperlinks
+
+
+class TestBrowseLinks(unittest.TestCase):
+    def test_extract_hyperlinks(self):
+        body = """
+        <body>
+        <a href="https://google.com">Google</a>
+        <a href="foo.html">Foo</a>
+        <div>Some other crap</div>
+        </body>
+        """
+        soup = BeautifulSoup(body, "html.parser")
+        links = extract_hyperlinks(soup, "http://example.com")
+        self.assertEqual(
+            links,
+            [("Google", "https://google.com"), ("Foo", "http://example.com/foo.html")],
+        )

From 52bb22d8d1c7665b4a7341bc450269dd430f1e36 Mon Sep 17 00:00:00 2001
From: BillSchumacher <34168009+BillSchumacher@users.noreply.github.com>
Date: Sat, 15 Apr 2023 16:20:43 -0500
Subject: [PATCH 2/2] Merge

---
 autogpt/commands/web_requests.py | 14 ++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/autogpt/commands/web_requests.py b/autogpt/commands/web_requests.py
index 230b1ff0..051cc710 100644
--- a/autogpt/commands/web_requests.py
+++ b/autogpt/commands/web_requests.py
@@ -3,6 +3,7 @@ from typing import List, Tuple, Union
 from urllib.parse import urljoin, urlparse
 
 import requests
+from requests.compat import urljoin
 from requests import Response
 from bs4 import BeautifulSoup
 
@@ -134,19 +135,20 @@ def scrape_text(url: str) -> str:
     return text
 
 
-def extract_hyperlinks(soup: BeautifulSoup) -> List[Tuple[str, str]]:
+def extract_hyperlinks(soup: BeautifulSoup, base_url: str) -> List[Tuple[str, str]]:
     """Extract hyperlinks from a BeautifulSoup object
 
     Args:
         soup (BeautifulSoup): The BeautifulSoup object
+        base_url (str): The base URL
 
     Returns:
         List[Tuple[str, str]]: The extracted hyperlinks
     """
-    hyperlinks = []
-    for link in soup.find_all("a", href=True):
-        hyperlinks.append((link.text, link["href"]))
-    return hyperlinks
+    return [
+        (link.text, urljoin(base_url, link["href"]))
+        for link in soup.find_all("a", href=True)
+    ]
 
 
 def format_hyperlinks(hyperlinks: List[Tuple[str, str]]) -> List[str]:
@@ -183,7 +185,7 @@ def scrape_links(url: str) -> Union[str, List[str]]:
     for script in soup(["script", "style"]):
         script.extract()
 
-    hyperlinks = extract_hyperlinks(soup)
+    hyperlinks = extract_hyperlinks(soup, url)
 
     return format_hyperlinks(hyperlinks)