flake8 style

Itamar Friedman
2023-04-13 00:00:33 +03:00
parent a40ccc1e5d
commit 9f972f4ee9
3 changed files with 15 additions and 7 deletions
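
The diff below contains only mechanical style fixes of the kind flake8 reports: long lines are wrapped (E501), blank lines around top-level definitions are normalized to two (E302/E303), and an unused import is dropped (F401). As a hedged illustration, not code from this repository, a module that satisfies those checks looks like:

import requests  # used below, so flake8 raises no F401 for this import


def fetch(url, timeout=10):
    """Two blank lines above a top-level def satisfy E302."""
    # Arguments are wrapped across lines to stay under the E501 length limit.
    return requests.get(
        url,
        timeout=timeout,
    )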

View File

@@ -6,6 +6,7 @@ from urllib.parse import urlparse, urljoin
 cfg = Config()

+
 # Function to check if the URL is valid
 def is_valid_url(url):
     try:
@@ -14,15 +15,18 @@ def is_valid_url(url):
     except ValueError:
         return False

+
 # Function to sanitize the URL
 def sanitize_url(url):
     return urljoin(url, urlparse(url).path)

+
 # Define and check for local file address prefixes
 def check_local_file_access(url):
     local_prefixes = ['file:///', 'file://localhost', 'http://localhost', 'https://localhost']
     return any(url.startswith(prefix) for prefix in local_prefixes)

+
 def get_response(url, headers=cfg.user_agent_header, timeout=10):
     try:
         # Restrict access to local files
@@ -33,7 +37,6 @@ def get_response(url, headers=cfg.user_agent_header, timeout=10):
         if not url.startswith('http://') and not url.startswith('https://'):
             raise ValueError('Invalid URL format')

-
         sanitized_url = sanitize_url(url)

         response = requests.get(sanitized_url, headers=headers, timeout=timeout)
@@ -51,6 +54,7 @@ def get_response(url, headers=cfg.user_agent_header, timeout=10):
         # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.)
         return None, "Error: " + str(re)

+
 def scrape_text(url):
     """Scrape text from a webpage"""
     response, error_message = get_response(url)
@@ -128,6 +132,7 @@ def create_message(chunk, question):
"content": f"\"\"\"{chunk}\"\"\" Using the above text, please answer the following question: \"{question}\" -- if the question cannot be answered using the text, please summarize the text." "content": f"\"\"\"{chunk}\"\"\" Using the above text, please answer the following question: \"{question}\" -- if the question cannot be answered using the text, please summarize the text."
} }
def summarize_text(text, question): def summarize_text(text, question):
"""Summarize text using the LLM model""" """Summarize text using the LLM model"""
if not text: if not text:
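
Taken together, the context lines of this file describe a small URL-safety layer around requests. A self-contained sketch reconstructed from those lines follows; the control flow inside get_response is abbreviated here, not the file's full body:

import requests
from urllib.parse import urljoin, urlparse


def sanitize_url(url):
    # Rebuild the URL from its scheme, host, and path, dropping any query string.
    return urljoin(url, urlparse(url).path)


def check_local_file_access(url):
    # Reject URLs that point at local files or a localhost server.
    local_prefixes = ['file:///', 'file://localhost',
                      'http://localhost', 'https://localhost']
    return any(url.startswith(prefix) for prefix in local_prefixes)


def get_response(url, timeout=10):
    # Sketch only: the real file handles these errors somewhat differently.
    if check_local_file_access(url):
        return None, "Error: Access to local files is restricted"
    if not url.startswith('http://') and not url.startswith('https://'):
        return None, "Error: Invalid URL format"
    try:
        response = requests.get(sanitize_url(url), timeout=timeout)
        return response, None
    except requests.exceptions.RequestException as re:
        # Connection errors, timeouts, and similar request failures.
        return None, "Error: " + str(re)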

View File

@@ -11,7 +11,8 @@ from scripts.browse import scrape_links
 Code Analysis

 Objective:
-The objective of the 'scrape_links' function is to scrape hyperlinks from a given URL and return them in a formatted way.
+The objective of the 'scrape_links' function is to scrape hyperlinks from a
+given URL and return them in a formatted way.

 Inputs:
 - url: a string representing the URL to be scraped.
@@ -29,17 +30,18 @@ Outputs:
 - A list of formatted hyperlinks.

 Additional aspects:
-- The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP requests and parse HTML content, respectively.
+- The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP
+requests and parse HTML content, respectively.
 - The 'extract_hyperlinks' function is called to extract hyperlinks from the parsed HTML.
 - The 'format_hyperlinks' function is called to format the extracted hyperlinks.
 - The function checks for HTTP errors and returns "error" if any are found.
 """


-
 class TestScrapeLinks:

-    # Tests that the function returns a list of formatted hyperlinks when provided with a valid url that returns a webpage with hyperlinks.
+    # Tests that the function returns a list of formatted hyperlinks when
+    # provided with a valid url that returns a webpage with hyperlinks.
     def test_valid_url_with_hyperlinks(self):
         url = "https://www.google.com"
         result = scrape_links(url)
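
The analysis above refers to 'extract_hyperlinks' and 'format_hyperlinks' without showing their bodies. A minimal sketch of what such helpers conventionally look like with BeautifulSoup; these bodies are hypothetical, inferred from the description rather than copied from the repository:

from bs4 import BeautifulSoup


def extract_hyperlinks(soup):
    # Collect (text, href) pairs from every anchor tag that has an href.
    return [(link.text, link['href'])
            for link in soup.find_all('a', href=True)]


def format_hyperlinks(hyperlinks):
    # Render each pair as "text (url)"; the exact output format is assumed.
    return [f"{link_text} ({link_url})" for link_text, link_url in hyperlinks]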

View File

@@ -2,7 +2,6 @@
 # Generated by CodiumAI

 import requests
-import tests.context

 from scripts.browse import scrape_text
@@ -10,7 +9,8 @@ from scripts.browse import scrape_text
 Code Analysis

 Objective:
-The objective of the "scrape_text" function is to scrape the text content from a given URL and return it as a string, after removing any unwanted HTML tags and scripts.
+The objective of the "scrape_text" function is to scrape the text content from
+a given URL and return it as a string, after removing any unwanted HTML tags and scripts.

 Inputs:
 - url: a string representing the URL of the webpage to be scraped.
@@ -33,6 +33,7 @@ Additional aspects:
 - The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text.
 """

+
 class TestScrapeText:

     # Tests that scrape_text() returns the expected text when given a valid URL.
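
The aspects listed above (removing unwanted tags and scripts, then splitting the text via generator expressions) correspond to a common BeautifulSoup pattern. A hedged sketch of that pattern, not the file's exact body:

from bs4 import BeautifulSoup


def scrape_text_from_html(html):
    # Sketch under the assumptions stated above; names are illustrative.
    soup = BeautifulSoup(html, 'html.parser')
    # Drop script and style elements so only visible text remains.
    for element in soup(['script', 'style']):
        element.extract()
    text = soup.get_text()
    # Generator expressions lazily strip lines and split on double spaces.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split('  '))
    return '\n'.join(chunk for chunk in chunks if chunk)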