From a4f130ff602b89e1dac571362b34db5b8cb41429 Mon Sep 17 00:00:00 2001
From: slavakurilyak
Date: Tue, 4 Apr 2023 19:28:13 -0500
Subject: [PATCH] Improve security and robustness in browse.py

---
Review notes (not part of the commit message):
* sanitize_url() now keeps the query string and the original host. The
  earlier urljoin(url, urlparse(url).path) form dropped "?key=value" and
  turned "http://a.example//b.example/x" into "http://b.example/x".
* The hunk was amended by hand; the post-image blob id on the "index"
  line has not been regenerated.

 scripts/browse.py | 47 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/scripts/browse.py b/scripts/browse.py
index 510f9c29..d07f2699 100644
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -2,14 +2,51 @@ import requests
 from bs4 import BeautifulSoup
 from config import Config
 from llm_utils import create_chat_completion
+from urllib.parse import urlparse, urljoin
 
 cfg = Config()
 
-def scrape_text(url):
-    response = requests.get(url)
+# Check that the URL has both a scheme and a network location.
+# urlparse() raises ValueError for some malformed inputs (e.g. unmatched
+# brackets in the host); treat those as invalid instead of propagating.
+def is_valid_url(url):
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except ValueError:
+        return False
 
-    # Check if the response contains an HTTP error
-    if response.status_code >= 400:
-        return "Error: HTTP " + str(response.status_code) + " error"
+# Normalize the URL before it is requested. Only the fragment is dropped (it
+# is never sent to the server); the host, path and query string are kept as
+# given. Re-joining the bare path with urljoin() would discard the query
+# string and, for a path starting with "//", switch to a different host.
+def sanitize_url(url):
+    return urlparse(url)._replace(fragment="").geturl()
+
+# Fetch the URL with a timeout. Returns the Response on success; on any
+# requests exception (connection error, timeout, HTTP 4xx/5xx raised by
+# raise_for_status) returns an "Error: ..." string instead of raising.
+def make_request(url, timeout=10):
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+        return response
+    except requests.exceptions.RequestException as e:
+        return "Error: " + str(e)
+
+def scrape_text(url):
+    # Validate the input URL
+    if not is_valid_url(url):
+        return "Error: Invalid URL"
+
+    # Sanitize the input URL
+    sanitized_url = sanitize_url(url)
+
+    # Make the request with a timeout and handle exceptions
+    response = make_request(sanitized_url)
+
+    # make_request() signals failure by returning the error message string
+    if isinstance(response, str):
+        return response
 
     soup = BeautifulSoup(response.text, "html.parser")