From fe0923ba6c9abb42ac4df79da580e8a4391e0418 Mon Sep 17 00:00:00 2001
From: Reinier van der Leer
Date: Fri, 2 Feb 2024 18:30:37 +0100
Subject: [PATCH] feat(agent/web): Add browser extensions to deal with cookie
 walls and ads (#6778)

* Add `_sideload_chrome_extensions` subroutine to `open_page_in_browser` in web_selenium.py
* Sideloads uBlock Origin and I Still Don't Care About Cookies, downloading them if necessary
* Add 2-second delay to `open_page_in_browser` to allow time for handling cookie walls
---
 .../autogpt/autogpt/commands/web_selenium.py | 43 +++++++++++++++----
 autogpts/autogpt/autogpt/processing/text.py  |  1 -
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/autogpts/autogpt/autogpt/commands/web_selenium.py b/autogpts/autogpt/autogpt/commands/web_selenium.py
index 5585e2cb..fa5de1f1 100644
--- a/autogpts/autogpt/autogpt/commands/web_selenium.py
+++ b/autogpts/autogpt/autogpt/commands/web_selenium.py
@@ -2,11 +2,13 @@
 
 from __future__ import annotations
 
+import asyncio
 import logging
 import re
 from pathlib import Path
 from sys import platform
 from typing import TYPE_CHECKING, Optional, Type
+from urllib.request import urlretrieve
 
 from bs4 import BeautifulSoup
 from selenium.common.exceptions import WebDriverException
@@ -115,8 +117,7 @@ async def read_webpage(
     """
     driver = None
     try:
-        # FIXME: agent.config -> something else
-        driver = open_page_in_browser(url, agent.legacy_config)
+        driver = await open_page_in_browser(url, agent.legacy_config)
 
         text = scrape_text_with_selenium(driver)
         links = scrape_links_with_selenium(driver, url)
@@ -214,7 +215,7 @@ def scrape_links_with_selenium(driver: WebDriver, base_url: str) -> list[str]:
     return format_hyperlinks(hyperlinks)
 
 
-def open_page_in_browser(url: str, config: Config) -> WebDriver:
+async def open_page_in_browser(url: str, config: Config) -> WebDriver:
     """Open a browser window and load a web page using Selenium
 
     Params:
@@ -236,22 +237,22 @@ def open_page_in_browser(url: str, config: Config) -> WebDriver:
     options: BrowserOptions = options_available[config.selenium_web_browser]()
     options.add_argument(f"user-agent={config.user_agent}")
 
-    if config.selenium_web_browser == "firefox":
+    if isinstance(options, FirefoxOptions):
         if config.selenium_headless:
             options.headless = True
             options.add_argument("--disable-gpu")
         driver = FirefoxDriver(
             service=GeckoDriverService(GeckoDriverManager().install()), options=options
         )
-    elif config.selenium_web_browser == "edge":
+    elif isinstance(options, EdgeOptions):
         driver = EdgeDriver(
             service=EdgeDriverService(EdgeDriverManager().install()), options=options
         )
-    elif config.selenium_web_browser == "safari":
+    elif isinstance(options, SafariOptions):
         # Requires a bit more setup on the users end.
         # See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari  # noqa: E501
         driver = SafariDriver(options=options)
-    else:
+    elif isinstance(options, ChromeOptions):
         if platform == "linux" or platform == "linux2":
             options.add_argument("--disable-dev-shm-usage")
             options.add_argument("--remote-debugging-port=9222")
@@ -261,6 +262,8 @@ def open_page_in_browser(url: str, config: Config) -> WebDriver:
             options.add_argument("--headless=new")
             options.add_argument("--disable-gpu")
 
+        _sideload_chrome_extensions(options, config.app_data_dir / "assets" / "crx")
+
         chromium_driver_path = Path("/usr/bin/chromedriver")
 
         driver = ChromeDriver(
@@ -271,6 +274,12 @@ def open_page_in_browser(url: str, config: Config) -> WebDriver:
         )
     driver.get(url)
 
+    # Wait for page to be ready, sleep 2 seconds, wait again until page ready.
+    # This allows the cookiewall squasher time to get rid of cookie walls.
+    WebDriverWait(driver, 10).until(
+        EC.presence_of_element_located((By.TAG_NAME, "body"))
+    )
+    await asyncio.sleep(2)
     WebDriverWait(driver, 10).until(
         EC.presence_of_element_located((By.TAG_NAME, "body"))
     )
 
     return driver
 
 
+def _sideload_chrome_extensions(options: ChromeOptions, dl_folder: Path) -> None:
+    crx_download_url_template = "https://clients2.google.com/service/update2/crx?response=redirect&prodversion=49.0&acceptformat=crx3&x=id%3D{crx_id}%26installsource%3Dondemand%26uc"  # noqa
+    cookiewall_squasher_crx_id = "edibdbjcniadpccecjdfdjjppcpchdlm"
+    adblocker_crx_id = "cjpalhdlnbpafiamejdnhcphjbkeiagm"
+
+    # Make sure the target folder exists
+    dl_folder.mkdir(parents=True, exist_ok=True)
+
+    for crx_id in (cookiewall_squasher_crx_id, adblocker_crx_id):
+        crx_path = dl_folder / f"{crx_id}.crx"
+        if not crx_path.exists():
+            logger.debug(f"Downloading CRX {crx_id}...")
+            crx_download_url = crx_download_url_template.format(crx_id=crx_id)
+            urlretrieve(crx_download_url, crx_path)
+            logger.debug(f"Downloaded {crx_path.name}")
+        options.add_extension(str(crx_path))
+
+
 def close_browser(driver: WebDriver) -> None:
     """Close the browser
 
@@ -313,7 +340,7 @@ async def summarize_memorize_webpage(
         raise ValueError("No text to summarize")
 
     text_length = len(text)
-    logger.info(f"Text length: {text_length} characters")
+    logger.debug(f"Web page content length: {text_length} characters")
 
     # memory = get_memory(agent.legacy_config)
 
diff --git a/autogpts/autogpt/autogpt/processing/text.py b/autogpts/autogpt/autogpt/processing/text.py
index 75c064a1..c69ed7e6 100644
--- a/autogpts/autogpt/autogpt/processing/text.py
+++ b/autogpts/autogpt/autogpt/processing/text.py
@@ -11,7 +11,6 @@
 from autogpt.core.prompting import ChatPrompt
 from autogpt.core.resource.model_providers import (
     ChatMessage,
     ChatModelProvider,
-    ChatModelResponse,
     ModelTokenizer,
 )
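
Note: the sketch below is an illustration of the technique this patch applies, not part of the patch itself. It shows how the CRX sideloading in `_sideload_chrome_extensions` works: the extension IDs and the Chrome update-service URL template are taken from the diff above, while the local "crx" cache folder, the variable names, and the bare `Chrome()` driver setup are assumptions made for the example.

    # Illustrative sketch: download two extensions as packed .crx files and
    # load them into a Selenium-driven Chrome session.
    from pathlib import Path
    from urllib.request import urlretrieve

    from selenium.webdriver import Chrome, ChromeOptions

    # URL template used by the patch; Chrome's update service redirects to the .crx
    CRX_URL_TEMPLATE = (
        "https://clients2.google.com/service/update2/crx"
        "?response=redirect&prodversion=49.0&acceptformat=crx3"
        "&x=id%3D{crx_id}%26installsource%3Dondemand%26uc"
    )
    EXTENSION_IDS = [
        "edibdbjcniadpccecjdfdjjppcpchdlm",  # I Still Don't Care About Cookies
        "cjpalhdlnbpafiamejdnhcphjbkeiagm",  # uBlock Origin
    ]

    options = ChromeOptions()
    crx_dir = Path("crx")  # example cache folder (assumption, not AutoGPT's path)
    crx_dir.mkdir(parents=True, exist_ok=True)

    for crx_id in EXTENSION_IDS:
        crx_path = crx_dir / f"{crx_id}.crx"
        if not crx_path.exists():
            # Download the packed extension once and reuse it on later runs
            urlretrieve(CRX_URL_TEMPLATE.format(crx_id=crx_id), crx_path)
        options.add_extension(str(crx_path))  # sideload the packed .crx

    driver = Chrome(options=options)
    driver.get("https://example.com")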
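
Also illustrative rather than part of the patch: the doubled readiness check added to `open_page_in_browser` (wait for `<body>`, sleep two seconds, wait again) gives the sideloaded cookie-wall extension time to dismiss consent overlays before the page is scraped. A minimal sketch of that pattern follows; the helper name `wait_for_stable_page` is chosen for the example, and it assumes `driver` is an already-open WebDriver and the caller runs inside an asyncio event loop.

    import asyncio

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait

    async def wait_for_stable_page(driver, grace_period: float = 2.0) -> None:
        # First wait: the document's <body> element exists
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        # Grace period for the extension to act, without blocking the event loop
        await asyncio.sleep(grace_period)
        # Second wait: confirm the body is still present after any DOM rewriting
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )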