diff --git a/scripts/browse.py b/scripts/browse.py
index b0c745ef..c3fc0662 100644
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -2,9 +2,31 @@ import requests
 from bs4 import BeautifulSoup
 from config import Config
 from llm_utils import create_chat_completion
+from urllib.parse import urlparse, urljoin
 
 cfg = Config()
 
+# Function to check if the URL is valid
+def is_valid_url(url):
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except ValueError:
+        return False
+
+# Function to sanitize the URL
+def sanitize_url(url):
+    return urljoin(url, urlparse(url).path)
+
+# Function to make a request with a specified timeout and handle exceptions
+def make_request(url, timeout=10):
+    try:
+        response = requests.get(url, headers=cfg.user_agent_header, timeout=timeout)
+        response.raise_for_status()
+        return response
+    except requests.exceptions.RequestException as e:
+        return "Error: " + str(e)
+
 # Define and check for local file address prefixes
 def check_local_file_access(url):
     local_prefixes = ['file:///', 'file://localhost', 'http://localhost', 'https://localhost']
@@ -12,7 +34,7 @@ def check_local_file_access(url):
 
 def scrape_text(url):
     """Scrape text from a webpage"""
-    # Most basic check if the URL is valid:
+    # Basic check if the URL is valid
     if not url.startswith('http'):
         return "Error: Invalid URL"
 
@@ -20,14 +42,21 @@ def scrape_text(url):
     if check_local_file_access(url):
         return "Error: Access to local files is restricted"
 
-    try:
-        response = requests.get(url, headers=cfg.user_agent_header)
-    except requests.exceptions.RequestException as e:
-        return "Error: " + str(e)
+    # Validate the input URL
+    if not is_valid_url(url):
+        # Sanitize the input URL
+        sanitized_url = sanitize_url(url)
 
-    # Check if the response contains an HTTP error
-    if response.status_code >= 400:
-        return "Error: HTTP " + str(response.status_code) + " error"
+        # Make the request with a timeout and handle exceptions
+        response = make_request(sanitized_url)
+
+        if isinstance(response, str):
+            return response
+    else:
+        # Sanitize the input URL
+        sanitized_url = sanitize_url(url)
+
+        response = requests.get(sanitized_url, headers=cfg.user_agent_header)
 
     soup = BeautifulSoup(response.text, "html.parser")
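
For anyone who wants to exercise the new helpers outside the repo, here is a minimal standalone sketch. It duplicates is_valid_url, sanitize_url, and make_request from the hunks above, swapping cfg.user_agent_header for a plain HEADERS dict; that stand-in and the example URLs are assumptions for illustration, not project code.

from urllib.parse import urlparse, urljoin

import requests

# Stand-in for cfg.user_agent_header (assumption: any requests-compatible header dict)
HEADERS = {"User-Agent": "Mozilla/5.0"}

def is_valid_url(url):
    # Valid only if the URL carries both a scheme and a network location
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

def sanitize_url(url):
    # Rebuild the URL from its parsed path, dropping the query string and fragment
    return urljoin(url, urlparse(url).path)

def make_request(url, timeout=10):
    # Return the response on success, or an "Error: ..." string on any failure
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        return "Error: " + str(e)

print(is_valid_url("https://example.com/page"))       # True
print(is_valid_url("example.com/page"))               # False: no scheme
print(sanitize_url("https://example.com/a?q=1#top"))  # https://example.com/a
print(make_request("https://example.com"))            # <Response [200]> or an error string

Note the control flow in scrape_text above: make_request, and with it the timeout and raise_for_status handling, is only reached when is_valid_url returns False; URLs that pass validation still go through the plain requests.get path.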