remove summary.py from this branch to avoid merge conflict

2025-12-18 14:34:23 +01:00 · 2023-04-15 17:30:18 +02:00
parent bee1bc8c06 3ee961c600
commit fbe1b0e5b0
72 changed files with 2549 additions and 1981 deletions
--- a/autogpt/commands/web_requests.py
+++ b/autogpt/commands/web_requests.py
@@ -0,0 +1,198 @@
+"""Browse a webpage and summarize it using the LLM model"""
+from typing import List, Tuple, Union
+from urllib.parse import urljoin, urlparse
+
+import requests
+from requests import Response
+from bs4 import BeautifulSoup
+
+from autogpt.config import Config
+from autogpt.memory import get_memory
+
+# Module-level configuration instance; the user agent is read from it below.
+CFG = Config()
+# Object returned by get_memory() for this configuration.
+# NOTE(review): not referenced by any function visible in this hunk -- confirm
+# whether other code depends on it before removing.
+memory = get_memory(CFG)
+
+# One requests session shared by get_response() for its HTTP requests.
+session = requests.Session()
+# Attach the configured User-Agent header to the shared session.
+session.headers.update({"User-Agent": CFG.user_agent})
+
+
+def is_valid_url(url: str) -> bool:
+    """Tell whether a URL has both a scheme and a network location
+
+    Args:
+        url (str): The URL to check
+
+    Returns:
+        bool: True if the URL parses and has a non-empty scheme and netloc,
+            False otherwise (including when parsing raises ValueError)
+    """
+    try:
+        parsed = urlparse(url)
+    except ValueError:
+        # urlparse rejects some malformed inputs, e.g. an unbalanced IPv6 host
+        return False
+    return bool(parsed.scheme and parsed.netloc)
+
+
+def sanitize_url(url: str) -> str:
+    """Sanitize the URL by dropping its params, query string and fragment
+
+    The scheme, host and path are kept as parsed. The URL is rebuilt from its
+    parsed components rather than re-joined with ``urljoin``: joining a path
+    that starts with ``//`` (as in ``http://example.com//localhost/x``) is
+    interpreted as a network-path reference and would silently replace the
+    host, and joining an empty path would leave the query string in place.
+
+    Args:
+        url (str): The URL to sanitize
+
+    Returns:
+        str: The sanitized URL
+
+    Raises:
+        ValueError: If the URL cannot be parsed (e.g. a malformed IPv6 host)
+    """
+    parsed = urlparse(url)
+    # geturl() reassembles the remaining components without re-resolving the
+    # path against the base, so the original host can never change.
+    return parsed._replace(params="", query="", fragment="").geturl()
+
+
+def check_local_file_access(url: str) -> bool:
+    """Check if the URL points at a local file or at the local machine
+
+    Besides the literal ``file://`` and ``localhost`` prefixes, the host part
+    of the URL is inspected so that loopback and unspecified addresses
+    (``127.0.0.1``, ``[::1]``, ``0.0.0.0``, a bare decimal such as
+    ``2130706433``) and ``localhost`` behind user-info are detected as well.
+    Matching is case-insensitive.
+
+    Args:
+        url (str): The URL to check
+
+    Returns:
+        bool: True if the URL is a local file or targets the local host,
+            False otherwise
+    """
+    import ipaddress
+
+    local_prefixes = (
+        "file:///",
+        "file://localhost",
+        "http://localhost",
+        "https://localhost",
+    )
+    lowered = url.lower()
+    if lowered.startswith(local_prefixes):
+        return True
+
+    try:
+        host = urlparse(lowered).hostname
+    except ValueError:
+        # Unparseable input is rejected elsewhere; it is not a local URL.
+        return False
+    if not host:
+        return False
+    if host == "localhost" or host.endswith(".localhost"):
+        return True
+
+    try:
+        # A host made only of digits is an integer-encoded IPv4 address.
+        address = ipaddress.ip_address(int(host) if host.isdigit() else host)
+    except ValueError:
+        # Not an IP literal: an ordinary host name.
+        return False
+    return address.is_loopback or address.is_unspecified
+
+
+def get_response(
+    url: str, timeout: int = 10
+) -> Union[Tuple[None, str], Tuple[Response, None]]:
+    """Fetch a URL with the shared session, reporting failures as text
+
+    Validation failures, ValueError and requests' RequestException are not
+    propagated to the caller; each is converted into an ``"Error: ..."``
+    message returned in place of the response.
+
+    Args:
+        url (str): The URL to get the response from
+        timeout (int): The timeout for the HTTP request
+
+    Returns:
+        tuple[None, str] | tuple[Response, None]: ``(response, None)`` on
+            success, ``(None, error_message)`` on any handled failure
+    """
+    # Restrict access to local files
+    if check_local_file_access(url):
+        return None, "Error: Access to local files is restricted"
+
+    # Most basic check if the URL is valid: only http(s) is accepted
+    if not url.startswith(("http://", "https://")):
+        return None, "Error: Invalid URL format"
+
+    try:
+        response = session.get(sanitize_url(url), timeout=timeout)
+    except (ValueError, requests.exceptions.RequestException) as exc:
+        # Unparseable URLs plus connection errors, timeouts, etc.
+        return None, f"Error: {exc}"
+
+    # Treat any 4xx/5xx status as a failure
+    if response.status_code >= 400:
+        return None, f"Error: HTTP {response.status_code} error"
+
+    return response, None
+
+
+def scrape_text(url: str) -> str:
+    """Fetch a webpage and return its visible text, one fragment per line
+
+    Script and style elements are removed before the text is extracted.
+    Each line is stripped, split on double spaces, and empty fragments are
+    discarded.
+
+    Args:
+        url (str): The URL to scrape text from
+
+    Returns:
+        str: The scraped text, or an error message if the page could not
+            be fetched
+    """
+    page, failure = get_response(url)
+    if failure:
+        return failure
+    if not page:
+        return "Error: Could not get response"
+
+    soup = BeautifulSoup(page.text, "html.parser")
+
+    # Drop elements whose content is code rather than readable text
+    for tag in soup(["script", "style"]):
+        tag.extract()
+
+    fragments = []
+    for raw_line in soup.get_text().splitlines():
+        for piece in raw_line.strip().split("  "):
+            piece = piece.strip()
+            if piece:
+                fragments.append(piece)
+
+    return "\n".join(fragments)
+
+
+def extract_hyperlinks(soup: BeautifulSoup) -> List[Tuple[str, str]]:
+    """Collect the text and target of every anchor that has an href
+
+    Args:
+        soup (BeautifulSoup): The BeautifulSoup object
+
+    Returns:
+        List[Tuple[str, str]]: ``(link text, href)`` pairs in document order
+    """
+    return [
+        (anchor.text, anchor["href"]) for anchor in soup.find_all("a", href=True)
+    ]
+
+
+def format_hyperlinks(hyperlinks: List[Tuple[str, str]]) -> List[str]:
+    """Render each hyperlink as ``"text (url)"``
+
+    Args:
+        hyperlinks (List[Tuple[str, str]]): ``(link text, url)`` pairs
+
+    Returns:
+        List[str]: One formatted string per hyperlink, in the same order
+    """
+    return [f"{text} ({href})" for text, href in hyperlinks]
+
+
+def scrape_links(url: str) -> Union[str, List[str]]:
+    """Fetch a webpage and list the hyperlinks it contains
+
+    Args:
+        url (str): The URL to scrape links from
+
+    Returns:
+        Union[str, List[str]]: The links formatted as ``"text (url)"``, or an
+            error message string if the page could not be fetched
+    """
+    page, failure = get_response(url)
+    if failure:
+        return failure
+    if not page:
+        return "Error: Could not get response"
+
+    soup = BeautifulSoup(page.text, "html.parser")
+
+    # Remove script and style elements before collecting anchors
+    for tag in soup(["script", "style"]):
+        tag.extract()
+
+    return format_hyperlinks(extract_hyperlinks(soup))
+
+
+def create_message(chunk, question):
+    """Build the user message asking the model about a chunk of text
+
+    Args:
+        chunk: The text to embed in the prompt, wrapped in triple double quotes
+        question: The question to answer from that text
+
+    Returns:
+        dict: A message with ``"role"`` set to ``"user"`` and the prompt
+            under ``"content"``
+    """
+    prompt = (
+        f'"""{chunk}""" Using the above text, answer the following'
+        f' question: "{question}" -- if the question cannot be answered using the'
+        " text, summarize the text."
+    )
+    return {"role": "user", "content": prompt}