Refactor and Merge branch 'master' of https://github.com/Significant-Gravitas/Auto-GPT into pr/96

2025-12-18 06:24:20 +01:00 · 2023-04-15 17:12:59 -05:00
parent 29c0b544a4 1586966003
commit f2035231e3
127 changed files with 7343 additions and 1892 deletions
--- a/autogpt/commands/web_selenium.py
+++ b/autogpt/commands/web_selenium.py
@@ -0,0 +1,141 @@
+"""Selenium web scraping module."""
+from selenium import webdriver
+from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
+import autogpt.processing.text as summary
+from bs4 import BeautifulSoup
+from selenium.webdriver.remote.webdriver import WebDriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from webdriver_manager.chrome import ChromeDriverManager
+from webdriver_manager.firefox import GeckoDriverManager
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.firefox.options import Options as FirefoxOptions
+from selenium.webdriver.safari.options import Options as SafariOptions
+import logging
+from pathlib import Path
+from autogpt.config import Config
+from typing import List, Tuple, Union
+
+# Parent of the directory that contains this file; used below to locate
+# the bundled js/overlay.js script.
+FILE_DIR = Path(__file__).parent.parent
+# Config instance; this module reads `selenium_web_browser` from it to
+# decide which browser to launch.
+CFG = Config()
+
+
+def browse_website(url: str, question: str) -> Tuple[str, WebDriver]:
+    """Browse a website and return the answer and links to the user
+
+    Args:
+        url (str): The url of the website to browse
+        question (str): The question asked by the user
+
+    Returns:
+        Tuple[str, WebDriver]: The answer and links to the user and the webdriver
+    """
+    driver, text = scrape_text_with_selenium(url)
+    add_header(driver)
+    summary_text = summary.summarize_text(url, text, question, driver)
+    links = scrape_links_with_selenium(driver, url)
+
+    # Limit links to 5
+    if len(links) > 5:
+        links = links[:5]
+    close_browser(driver)
+    return f"Answer gathered from website: {summary_text} \n \n Links: {links}", driver
+
+
+def scrape_text_with_selenium(url: str) -> Tuple[WebDriver, str]:
+    """Scrape text from a website using selenium
+
+    Args:
+        url (str): The url of the website to scrape
+
+    Returns:
+        Tuple[WebDriver, str]: The webdriver and the text scraped from the website
+    """
+    logging.getLogger("selenium").setLevel(logging.CRITICAL)
+
+    options_available = {
+        "chrome": ChromeOptions,
+        "safari": SafariOptions,
+        "firefox": FirefoxOptions,
+    }
+
+    options = options_available[CFG.selenium_web_browser]()
+    options.add_argument(
+        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.49 Safari/537.36"
+    )
+
+    if CFG.selenium_web_browser == "firefox":
+        driver = webdriver.Firefox(
+            executable_path=GeckoDriverManager().install(), options=options
+        )
+    elif CFG.selenium_web_browser == "safari":
+        # Requires a bit more setup on the users end
+        # See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari
+        driver = webdriver.Safari(options=options)
+    else:
+        driver = webdriver.Chrome(
+            executable_path=ChromeDriverManager().install(), options=options
+        )
+    driver.get(url)
+
+    WebDriverWait(driver, 10).until(
+        EC.presence_of_element_located((By.TAG_NAME, "body"))
+    )
+
+    # Get the HTML content directly from the browser's DOM
+    page_source = driver.execute_script("return document.body.outerHTML;")
+    soup = BeautifulSoup(page_source, "html.parser")
+
+    for script in soup(["script", "style"]):
+        script.extract()
+
+    text = soup.get_text()
+    lines = (line.strip() for line in text.splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    text = "\n".join(chunk for chunk in chunks if chunk)
+    return driver, text
+
+
+def scrape_links_with_selenium(driver: WebDriver, url: str) -> List[str]:
+    """Scrape links from a website using selenium
+
+    Args:
+        driver (WebDriver): The webdriver to use to scrape the links
+
+    Returns:
+        List[str]: The links scraped from the website
+    """
+    page_source = driver.page_source
+    soup = BeautifulSoup(page_source, "html.parser")
+
+    for script in soup(["script", "style"]):
+        script.extract()
+
+    hyperlinks = extract_hyperlinks(soup, url)
+
+    return format_hyperlinks(hyperlinks)
+
+
+def close_browser(driver: WebDriver) -> None:
+    """Close the browser
+
+    Args:
+        driver (WebDriver): The webdriver to close
+
+    Returns:
+        None
+    """
+    driver.quit()
+
+
+def add_header(driver: WebDriver) -> None:
+    """Add a header to the website
+
+    Args:
+        driver (WebDriver): The webdriver to use to add the header
+
+    Returns:
+        None
+    """
+    driver.execute_script(open(f"{FILE_DIR}/js/overlay.js", "r").read())