From 9f972f4ee9c938c9e64dcc314b920e66c53b3c45 Mon Sep 17 00:00:00 2001 From: Itamar Friedman Date: Thu, 13 Apr 2023 00:00:33 +0300 Subject: [PATCH] flake8 style --- scripts/browse.py | 7 ++++++- tests/unit/test_browse_scrape_links.py | 10 ++++++---- tests/unit/test_browse_scrape_text.py | 5 +++-- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/scripts/browse.py b/scripts/browse.py index 6d473ed6..e224b04d 100644 --- a/scripts/browse.py +++ b/scripts/browse.py @@ -6,6 +6,7 @@ from urllib.parse import urlparse, urljoin cfg = Config() + # Function to check if the URL is valid def is_valid_url(url): try: @@ -14,15 +15,18 @@ def is_valid_url(url): except ValueError: return False + # Function to sanitize the URL def sanitize_url(url): return urljoin(url, urlparse(url).path) + # Define and check for local file address prefixes def check_local_file_access(url): local_prefixes = ['file:///', 'file://localhost', 'http://localhost', 'https://localhost'] return any(url.startswith(prefix) for prefix in local_prefixes) + def get_response(url, headers=cfg.user_agent_header, timeout=10): try: # Restrict access to local files @@ -33,7 +37,6 @@ def get_response(url, headers=cfg.user_agent_header, timeout=10): if not url.startswith('http://') and not url.startswith('https://'): raise ValueError('Invalid URL format') - sanitized_url = sanitize_url(url) response = requests.get(sanitized_url, headers=headers, timeout=timeout) @@ -51,6 +54,7 @@ def get_response(url, headers=cfg.user_agent_header, timeout=10): # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.) return None, "Error: " + str(re) + def scrape_text(url): """Scrape text from a webpage""" response, error_message = get_response(url) @@ -128,6 +132,7 @@ def create_message(chunk, question): "content": f"\"\"\"{chunk}\"\"\" Using the above text, please answer the following question: \"{question}\" -- if the question cannot be answered using the text, please summarize the text." } + def summarize_text(text, question): """Summarize text using the LLM model""" if not text: diff --git a/tests/unit/test_browse_scrape_links.py b/tests/unit/test_browse_scrape_links.py index 9b69b27b..639987a2 100644 --- a/tests/unit/test_browse_scrape_links.py +++ b/tests/unit/test_browse_scrape_links.py @@ -11,7 +11,8 @@ from scripts.browse import scrape_links Code Analysis Objective: -The objective of the 'scrape_links' function is to scrape hyperlinks from a given URL and return them in a formatted way. +The objective of the 'scrape_links' function is to scrape hyperlinks from a +given URL and return them in a formatted way. Inputs: - url: a string representing the URL to be scraped. @@ -29,17 +30,18 @@ Outputs: - A list of formatted hyperlinks. Additional aspects: -- The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP requests and parse HTML content, respectively. +- The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP +requests and parse HTML content, respectively. - The 'extract_hyperlinks' function is called to extract hyperlinks from the parsed HTML. - The 'format_hyperlinks' function is called to format the extracted hyperlinks. - The function checks for HTTP errors and returns "error" if any are found. """ - class TestScrapeLinks: - # Tests that the function returns a list of formatted hyperlinks when provided with a valid url that returns a webpage with hyperlinks. + # Tests that the function returns a list of formatted hyperlinks when + # provided with a valid url that returns a webpage with hyperlinks. def test_valid_url_with_hyperlinks(self): url = "https://www.google.com" result = scrape_links(url) diff --git a/tests/unit/test_browse_scrape_text.py b/tests/unit/test_browse_scrape_text.py index 775eefcd..76072276 100644 --- a/tests/unit/test_browse_scrape_text.py +++ b/tests/unit/test_browse_scrape_text.py @@ -2,7 +2,6 @@ # Generated by CodiumAI import requests -import tests.context from scripts.browse import scrape_text @@ -10,7 +9,8 @@ from scripts.browse import scrape_text Code Analysis Objective: -The objective of the "scrape_text" function is to scrape the text content from a given URL and return it as a string, after removing any unwanted HTML tags and scripts. +The objective of the "scrape_text" function is to scrape the text content from +a given URL and return it as a string, after removing any unwanted HTML tags and scripts. Inputs: - url: a string representing the URL of the webpage to be scraped. @@ -33,6 +33,7 @@ Additional aspects: - The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text. """ + class TestScrapeText: # Tests that scrape_text() returns the expected text when given a valid URL.