feat(agent/web): Add browser extensions to deal with cookie walls and ads (#6778)

* Add `_sideload_chrome_extensions` subroutine to `open_page_in_browser` in web_selenium.py
   * Sideloads uBlock Origin and I Still Don't Care About Cookies, downloading them if necessary
* Add 2-second delay to `open_page_in_browser` to allow time for handling cookie walls
This commit is contained in:
Reinier van der Leer
2024-02-02 18:30:37 +01:00
committed by GitHub
parent dfaeda7cd5
commit fe0923ba6c
2 changed files with 35 additions and 9 deletions

View File

@@ -2,11 +2,13 @@
from __future__ import annotations
import asyncio
import logging
import re
from pathlib import Path
from sys import platform
from typing import TYPE_CHECKING, Optional, Type
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
from selenium.common.exceptions import WebDriverException
@@ -115,8 +117,7 @@ async def read_webpage(
"""
driver = None
try:
# FIXME: agent.config -> something else
driver = open_page_in_browser(url, agent.legacy_config)
driver = await open_page_in_browser(url, agent.legacy_config)
text = scrape_text_with_selenium(driver)
links = scrape_links_with_selenium(driver, url)
@@ -214,7 +215,7 @@ def scrape_links_with_selenium(driver: WebDriver, base_url: str) -> list[str]:
return format_hyperlinks(hyperlinks)
def open_page_in_browser(url: str, config: Config) -> WebDriver:
async def open_page_in_browser(url: str, config: Config) -> WebDriver:
"""Open a browser window and load a web page using Selenium
Params:
@@ -236,22 +237,22 @@ def open_page_in_browser(url: str, config: Config) -> WebDriver:
options: BrowserOptions = options_available[config.selenium_web_browser]()
options.add_argument(f"user-agent={config.user_agent}")
if config.selenium_web_browser == "firefox":
if isinstance(options, FirefoxOptions):
if config.selenium_headless:
options.headless = True
options.add_argument("--disable-gpu")
driver = FirefoxDriver(
service=GeckoDriverService(GeckoDriverManager().install()), options=options
)
elif config.selenium_web_browser == "edge":
elif isinstance(options, EdgeOptions):
driver = EdgeDriver(
service=EdgeDriverService(EdgeDriverManager().install()), options=options
)
elif config.selenium_web_browser == "safari":
elif isinstance(options, SafariOptions):
# Requires a bit more setup on the users end.
# See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari # noqa: E501
driver = SafariDriver(options=options)
else:
elif isinstance(options, ChromeOptions):
if platform == "linux" or platform == "linux2":
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--remote-debugging-port=9222")
@@ -261,6 +262,8 @@ def open_page_in_browser(url: str, config: Config) -> WebDriver:
options.add_argument("--headless=new")
options.add_argument("--disable-gpu")
_sideload_chrome_extensions(options, config.app_data_dir / "assets" / "crx")
chromium_driver_path = Path("/usr/bin/chromedriver")
driver = ChromeDriver(
@@ -271,6 +274,12 @@ def open_page_in_browser(url: str, config: Config) -> WebDriver:
)
driver.get(url)
# Wait for page to be ready, sleep 2 seconds, wait again until page ready.
# This allows the cookiewall squasher time to get rid of cookie walls.
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
await asyncio.sleep(2)
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
@@ -278,6 +287,24 @@ def open_page_in_browser(url: str, config: Config) -> WebDriver:
return driver
def _sideload_chrome_extensions(options: ChromeOptions, dl_folder: Path) -> None:
crx_download_url_template = "https://clients2.google.com/service/update2/crx?response=redirect&prodversion=49.0&acceptformat=crx3&x=id%3D{crx_id}%26installsource%3Dondemand%26uc" # noqa
cookiewall_squasher_crx_id = "edibdbjcniadpccecjdfdjjppcpchdlm"
adblocker_crx_id = "cjpalhdlnbpafiamejdnhcphjbkeiagm"
# Make sure the target folder exists
dl_folder.mkdir(parents=True, exist_ok=True)
for crx_id in (cookiewall_squasher_crx_id, adblocker_crx_id):
crx_path = dl_folder / f"{crx_id}.crx"
if not crx_path.exists():
logger.debug(f"Downloading CRX {crx_id}...")
crx_download_url = crx_download_url_template.format(crx_id=crx_id)
urlretrieve(crx_download_url, crx_path)
logger.debug(f"Downloaded {crx_path.name}")
options.add_extension(str(crx_path))
def close_browser(driver: WebDriver) -> None:
"""Close the browser
@@ -313,7 +340,7 @@ async def summarize_memorize_webpage(
raise ValueError("No text to summarize")
text_length = len(text)
logger.info(f"Text length: {text_length} characters")
logger.debug(f"Web page content length: {text_length} characters")
# memory = get_memory(agent.legacy_config)

View File

@@ -11,7 +11,6 @@ from autogpt.core.prompting import ChatPrompt
from autogpt.core.resource.model_providers import (
ChatMessage,
ChatModelProvider,
ChatModelResponse,
ModelTokenizer,
)