From fe0923ba6c9abb42ac4df79da580e8a4391e0418 Mon Sep 17 00:00:00 2001
From: Reinier van der Leer
Date: Fri, 2 Feb 2024 18:30:37 +0100
Subject: [PATCH] feat(agent/web): Add browser extensions to deal with cookie
 walls and ads (#6778)

* Add `_sideload_chrome_extensions` subroutine to `open_page_in_browser` in web_selenium.py
* Sideloads uBlock Origin and I Still Don't Care About Cookies, downloading them if necessary
* Add 2-second delay to `open_page_in_browser` to allow time for handling cookie walls
---
 .../autogpt/autogpt/commands/web_selenium.py | 43 +++++++++++++++----
 autogpts/autogpt/autogpt/processing/text.py  |  1 -
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/autogpts/autogpt/autogpt/commands/web_selenium.py b/autogpts/autogpt/autogpt/commands/web_selenium.py
index 5585e2cb..fa5de1f1 100644
--- a/autogpts/autogpt/autogpt/commands/web_selenium.py
+++ b/autogpts/autogpt/autogpt/commands/web_selenium.py
@@ -2,11 +2,13 @@
 
 from __future__ import annotations
 
+import asyncio
 import logging
 import re
 from pathlib import Path
 from sys import platform
 from typing import TYPE_CHECKING, Optional, Type
+from urllib.request import urlretrieve
 
 from bs4 import BeautifulSoup
 from selenium.common.exceptions import WebDriverException
@@ -115,8 +117,7 @@ async def read_webpage(
     """
     driver = None
     try:
-        # FIXME: agent.config -> something else
-        driver = open_page_in_browser(url, agent.legacy_config)
+        driver = await open_page_in_browser(url, agent.legacy_config)
 
         text = scrape_text_with_selenium(driver)
         links = scrape_links_with_selenium(driver, url)
@@ -214,7 +215,7 @@ def scrape_links_with_selenium(driver: WebDriver, base_url: str) -> list[str]:
     return format_hyperlinks(hyperlinks)
 
 
-def open_page_in_browser(url: str, config: Config) -> WebDriver:
+async def open_page_in_browser(url: str, config: Config) -> WebDriver:
     """Open a browser window and load a web page using Selenium
 
     Params:
@@ -236,22 +237,22 @@ def open_page_in_browser(url: str, config: Config) -> WebDriver:
     options: BrowserOptions = options_available[config.selenium_web_browser]()
     options.add_argument(f"user-agent={config.user_agent}")
 
-    if config.selenium_web_browser == "firefox":
+    if isinstance(options, FirefoxOptions):
         if config.selenium_headless:
             options.headless = True
             options.add_argument("--disable-gpu")
         driver = FirefoxDriver(
             service=GeckoDriverService(GeckoDriverManager().install()), options=options
         )
-    elif config.selenium_web_browser == "edge":
+    elif isinstance(options, EdgeOptions):
         driver = EdgeDriver(
             service=EdgeDriverService(EdgeDriverManager().install()), options=options
         )
-    elif config.selenium_web_browser == "safari":
+    elif isinstance(options, SafariOptions):
         # Requires a bit more setup on the users end.
         # See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari  # noqa: E501
         driver = SafariDriver(options=options)
-    else:
+    elif isinstance(options, ChromeOptions):
         if platform == "linux" or platform == "linux2":
             options.add_argument("--disable-dev-shm-usage")
             options.add_argument("--remote-debugging-port=9222")
@@ -261,6 +262,8 @@ def open_page_in_browser(url: str, config: Config) -> WebDriver:
             options.add_argument("--headless=new")
             options.add_argument("--disable-gpu")
 
+        _sideload_chrome_extensions(options, config.app_data_dir / "assets" / "crx")
+
         chromium_driver_path = Path("/usr/bin/chromedriver")
 
         driver = ChromeDriver(
@@ -271,6 +274,12 @@ def open_page_in_browser(url: str, config: Config) -> WebDriver:
         )
     driver.get(url)
 
+    # Wait for page to be ready, sleep 2 seconds, wait again until page ready.
+    # This allows the cookiewall squasher time to get rid of cookie walls.
+    WebDriverWait(driver, 10).until(
+        EC.presence_of_element_located((By.TAG_NAME, "body"))
+    )
+    await asyncio.sleep(2)
     WebDriverWait(driver, 10).until(
         EC.presence_of_element_located((By.TAG_NAME, "body"))
     )
 
     return driver
 
 
+def _sideload_chrome_extensions(options: ChromeOptions, dl_folder: Path) -> None:
+    crx_download_url_template = "https://clients2.google.com/service/update2/crx?response=redirect&prodversion=49.0&acceptformat=crx3&x=id%3D{crx_id}%26installsource%3Dondemand%26uc"  # noqa
+    cookiewall_squasher_crx_id = "edibdbjcniadpccecjdfdjjppcpchdlm"
+    adblocker_crx_id = "cjpalhdlnbpafiamejdnhcphjbkeiagm"
+
+    # Make sure the target folder exists
+    dl_folder.mkdir(parents=True, exist_ok=True)
+
+    for crx_id in (cookiewall_squasher_crx_id, adblocker_crx_id):
+        crx_path = dl_folder / f"{crx_id}.crx"
+        if not crx_path.exists():
+            logger.debug(f"Downloading CRX {crx_id}...")
+            crx_download_url = crx_download_url_template.format(crx_id=crx_id)
+            urlretrieve(crx_download_url, crx_path)
+            logger.debug(f"Downloaded {crx_path.name}")
+        options.add_extension(str(crx_path))
+
+
 def close_browser(driver: WebDriver) -> None:
     """Close the browser
 
@@ -313,7 +340,7 @@ async def summarize_memorize_webpage(
         raise ValueError("No text to summarize")
 
     text_length = len(text)
-    logger.info(f"Text length: {text_length} characters")
+    logger.debug(f"Web page content length: {text_length} characters")
 
     # memory = get_memory(agent.legacy_config)
 
diff --git a/autogpts/autogpt/autogpt/processing/text.py b/autogpts/autogpt/autogpt/processing/text.py
index 75c064a1..c69ed7e6 100644
--- a/autogpts/autogpt/autogpt/processing/text.py
+++ b/autogpts/autogpt/autogpt/processing/text.py
@@ -11,7 +11,6 @@
 from autogpt.core.prompting import ChatPrompt
 from autogpt.core.resource.model_providers import (
     ChatMessage,
     ChatModelProvider,
-    ChatModelResponse,
     ModelTokenizer,
 )
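
Note: the sketch below is an illustration of the technique this patch applies, not part of the patch itself. It shows how the CRX sideloading in `_sideload_chrome_extensions` works: the extension IDs and the Chrome update-service URL template are taken from the diff above, while the local "crx" cache folder, the variable names, and the bare `Chrome()` driver setup are assumptions made for the example.

    # Illustrative sketch: download two extensions as packed .crx files and
    # load them into a Selenium-driven Chrome session.
    from pathlib import Path
    from urllib.request import urlretrieve

    from selenium.webdriver import Chrome, ChromeOptions

    # URL template used by the patch; Chrome's update service redirects to the .crx
    CRX_URL_TEMPLATE = (
        "https://clients2.google.com/service/update2/crx"
        "?response=redirect&prodversion=49.0&acceptformat=crx3"
        "&x=id%3D{crx_id}%26installsource%3Dondemand%26uc"
    )
    EXTENSION_IDS = [
        "edibdbjcniadpccecjdfdjjppcpchdlm",  # I Still Don't Care About Cookies
        "cjpalhdlnbpafiamejdnhcphjbkeiagm",  # uBlock Origin
    ]

    options = ChromeOptions()
    crx_dir = Path("crx")  # example cache folder (assumption, not AutoGPT's path)
    crx_dir.mkdir(parents=True, exist_ok=True)

    for crx_id in EXTENSION_IDS:
        crx_path = crx_dir / f"{crx_id}.crx"
        if not crx_path.exists():
            # Download the packed extension once and reuse it on later runs
            urlretrieve(CRX_URL_TEMPLATE.format(crx_id=crx_id), crx_path)
        options.add_extension(str(crx_path))  # sideload the packed .crx

    driver = Chrome(options=options)
    driver.get("https://example.com")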
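
Also illustrative rather than part of the patch: the doubled readiness check added to `open_page_in_browser` (wait for `<body>`, sleep two seconds, wait again) gives the sideloaded cookie-wall extension time to dismiss consent overlays before the page is scraped. A minimal sketch of that pattern follows; the helper name `wait_for_stable_page` is chosen for the example, and it assumes `driver` is an already-open WebDriver and the caller runs inside an asyncio event loop.

    import asyncio

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait

    async def wait_for_stable_page(driver, grace_period: float = 2.0) -> None:
        # First wait: the document's <body> element exists
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        # Grace period for the extension to act, without blocking the event loop
        await asyncio.sleep(grace_period)
        # Second wait: confirm the body is still present after any DOM rewriting
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )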