Merge branch 'transcribe_audio_huggingface' of https://github.com/gucky92/Auto-GPT into transcribe_audio_huggingface

2026-02-22 06:34:32 +01:00 · 2023-04-15 23:53:03 +02:00
parent 973e3c56b7 18168cc347
commit 572aedfcef
2 changed files with 34 additions and 6 deletions
--- a/autogpt/commands/web_requests.py
+++ b/autogpt/commands/web_requests.py
@@ -3,6 +3,7 @@ from typing import List, Tuple, Union
 from urllib.parse import urljoin, urlparse

 import requests
+from requests.compat import urljoin
 from requests import Response
 from bs4 import BeautifulSoup

@@ -134,19 +135,20 @@ def scrape_text(url: str) -> str:
    return text


-def extract_hyperlinks(soup: BeautifulSoup) -> List[Tuple[str, str]]:
+def extract_hyperlinks(soup: BeautifulSoup, base_url: str) -> List[Tuple[str, str]]:
    """Extract hyperlinks from a BeautifulSoup object

    Args:
        soup (BeautifulSoup): The BeautifulSoup object
+        base_url (str): The base URL that each href is joined against via urljoin

    Returns:
        List[Tuple[str, str]]: The extracted hyperlinks
    """
-    hyperlinks = []
-    for link in soup.find_all("a", href=True):
-        hyperlinks.append((link.text, link["href"]))
-    return hyperlinks
+    return [
+        (link.text, urljoin(base_url, link["href"]))
+        for link in soup.find_all("a", href=True)
+    ]


 def format_hyperlinks(hyperlinks: List[Tuple[str, str]]) -> List[str]:
@@ -183,7 +185,7 @@ def scrape_links(url: str) -> Union[str, List[str]]:
    for script in soup(["script", "style"]):
        script.extract()

-    hyperlinks = extract_hyperlinks(soup)
+    hyperlinks = extract_hyperlinks(soup, url)

    return format_hyperlinks(hyperlinks)

--- a/tests/browse_tests.py
+++ b/tests/browse_tests.py
@@ -0,0 +1,26 @@
+import sys
+import unittest
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+
+# Put the repo root on sys.path so `autogpt` imports from any working directory.
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from autogpt.commands.web_requests import extract_hyperlinks
+
+
+class TestBrowseLinks(unittest.TestCase):
+    def test_extract_hyperlinks(self):
+        """Absolute hrefs pass through; relative hrefs are joined to the base URL."""
+        body = """
+        <body>
+        <a href="https://google.com">Google</a>
+        <a href="foo.html">Foo</a>
+        <div>Some other crap</div>
+        </body>
+        """
+        soup = BeautifulSoup(body, "html.parser")
+        links = extract_hyperlinks(soup, "http://example.com")
+        expected = [("Google", "https://google.com"), ("Foo", "http://example.com/foo.html")]
+        self.assertEqual(links, expected)