Auto-GPT/autogpt/commands/web_requests.py
Nicholas Tindle acfd966aa4 Pass Configs to Commands and remove CFG = Config() in the commands/ folder (#4328)
* feat: pass config to call_ai_functions in commands

* feat: config for read_audio_from_file

* feat: file operations cfg

NOTE: we replaced the CFG check in the command enable flag with True because we are not sure how to handle this yet

* feat: git command conversion

* feat: google search

* feat: image generation

* feat: extract cfg from browser commands

* feat: remove cfg from execute code commands

* fix: file operation related tests

* fix: linting

* fix: tests for read_audio

* fix: test error

* feat: update cassettes

* fix: linting

* fix: test typechecking

* fix: google_search errors if unexpected kw arg is passed

* fix: pass config param to google search test

* fix: agent commands were broken + cassettes

* fix: agent test

* feat: cassettes

* feat: enable/disable logic for commands

* fix: some commands threw errors

* feat: fix tests

* Add new cassettes

* Add new cassettes

* ci: trigger ci

* Update autogpt/commands/execute_code.py

Co-authored-by: Reinier van der Leer <github@pwuts.nl>

* fix prompt

* fix prompt + rebase

* add config remove useless imports

* put back CFG just for download file

* lint

* The signature should be mandatory in the decorator

* black isort

* fix: remove the CFG

* fix: non typed arg

* lint: type some args

* lint: add types for libraries

* Add new cassettes

* fix: windows compatibility

* fix: add config access to decorator

* fix: remove twitter mention

* DDGS search works at version 3.0.2

* ci: linting

---------

Co-authored-by: Auto-GPT-Bot <github-bot@agpt.co>
Co-authored-by: merwanehamadi <merwanehamadi@gmail.com>
Co-authored-by: Reinier van der Leer <github@pwuts.nl>
Co-authored-by: kinance <kinance@gmail.com>
2023-05-26 08:39:25 -07:00

101 lines
2.8 KiB
Python
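
The commit above replaces module-level CFG = Config() singletons with a Config object passed into each command, as the file below now does. A minimal sketch of that pattern, with a hypothetical browse_website command standing in for the real ones:

from autogpt.config import Config

# Before (removed by this commit): every command module built its own config.
#   CFG = Config()
#   def browse_website(url: str) -> str: ...  # reads CFG implicitly

# After: the config travels as an explicit parameter, so tests and callers
# can supply their own instance.
def browse_website(url: str, config: Config) -> str:
    # browse_website is a hypothetical stand-in, not the actual
    # Auto-GPT command signature.
    return f"Would fetch {url} with User-Agent {config.user_agent}"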

"""Browse a webpage and summarize it using the LLM model"""
from __future__ import annotations
import requests
from bs4 import BeautifulSoup
from requests import Response
from autogpt.config import Config
from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
from autogpt.url_utils.validators import validate_url
session = requests.Session()
@validate_url
def get_response(
url: str, config: Config, timeout: int = 10
) -> tuple[None, str] | tuple[Response, None]:
"""Get the response from a URL
Args:
url (str): The URL to get the response from
timeout (int): The timeout for the HTTP request
Returns:
tuple[None, str] | tuple[Response, None]: The response and error message
Raises:
ValueError: If the URL is invalid
requests.exceptions.RequestException: If the HTTP request fails
"""
try:
session.headers.update({"User-Agent": config.user_agent})
response = session.get(url, timeout=timeout)
# Check if the response contains an HTTP error
if response.status_code >= 400:
return None, f"Error: HTTP {str(response.status_code)} error"
return response, None
except ValueError as ve:
# Handle invalid URL format
return None, f"Error: {str(ve)}"
except requests.exceptions.RequestException as re:
# Handle exceptions related to the HTTP request
# (e.g., connection errors, timeouts, etc.)
return None, f"Error: {str(re)}"


def scrape_text(url: str, config: Config) -> str:
    """Scrape text from a webpage

    Args:
        url (str): The URL to scrape text from
        config (Config): The config, passed through to get_response

    Returns:
        str: The scraped text, or an error message on failure
    """
    response, error_message = get_response(url, config)
    if error_message:
        return error_message
    if not response:
        return "Error: Could not get response"

    soup = BeautifulSoup(response.text, "html.parser")

    # Remove script and style elements so only visible text is kept
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    # Split on double spaces to break apart run-together phrases
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = "\n".join(chunk for chunk in chunks if chunk)

    return text


def scrape_links(url: str, config: Config) -> str | list[str]:
    """Scrape links from a webpage

    Args:
        url (str): The URL to scrape links from
        config (Config): The config, passed through to get_response

    Returns:
        str | list[str]: The scraped links, or an error message on failure
    """
    response, error_message = get_response(url, config)
    if error_message:
        return error_message
    if not response:
        return "Error: Could not get response"

    soup = BeautifulSoup(response.text, "html.parser")

    # Remove script and style elements before collecting hyperlinks
    for script in soup(["script", "style"]):
        script.extract()

    hyperlinks = extract_hyperlinks(soup, url)

    return format_hyperlinks(hyperlinks)
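
A minimal usage sketch for this module (assumes a default-constructed Config is sufficient, which may differ across Auto-GPT versions):

from autogpt.commands.web_requests import scrape_links, scrape_text
from autogpt.config import Config

config = Config()  # assumption: default construction works for this example
text = scrape_text("https://example.com", config)    # visible page text, or an error string
links = scrape_links("https://example.com", config)  # formatted links, or an error string
print(text)
print(links)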