Merge pull request #780 from coditamar/browse_scrape_links_test_and_validate

browse: (1) apply url validation also to scrape_links(), (2) add unit-tests for scrape_links()
Richard Beales
2023-04-13 07:10:06 +01:00
committed by GitHub
3 changed files with 159 additions and 38 deletions
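The practical effect on scrape_links(), condensed from the browse.py hunks below: the direct, unvalidated request and the bare "error" return are replaced by the shared get_response() helper, which blocks local-file URLs, sanitizes the address, and reports a descriptive error message instead.

# Condensed before/after of scrape_links(), excerpted from the diff below (not a full listing):
# before:
response = requests.get(url, headers=cfg.user_agent_header)
if response.status_code >= 400:
    return "error"
# after:
response, error_message = get_response(url)
if error_message:
    return error_message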

View File

@@ -6,6 +6,7 @@ from urllib.parse import urlparse, urljoin
cfg = Config()

# Function to check if the URL is valid
def is_valid_url(url):
    try:
@@ -14,49 +15,51 @@ def is_valid_url(url):
    except ValueError:
        return False

# Function to sanitize the URL
def sanitize_url(url):
    return urljoin(url, urlparse(url).path)

# Function to make a request with a specified timeout and handle exceptions
def make_request(url, timeout=10):
    try:
        response = requests.get(url, headers=cfg.user_agent_header, timeout=timeout)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        return "Error: " + str(e)

# Define and check for local file address prefixes
def check_local_file_access(url):
    local_prefixes = ['file:///', 'file://localhost', 'http://localhost', 'https://localhost']
    return any(url.startswith(prefix) for prefix in local_prefixes)

def get_response(url, headers=cfg.user_agent_header, timeout=10):
    try:
        # Restrict access to local files
        if check_local_file_access(url):
            raise ValueError('Access to local files is restricted')

        # Most basic check if the URL is valid:
        if not url.startswith('http://') and not url.startswith('https://'):
            raise ValueError('Invalid URL format')

        sanitized_url = sanitize_url(url)

        response = requests.get(sanitized_url, headers=headers, timeout=timeout)

        # Check if the response contains an HTTP error
        if response.status_code >= 400:
            return None, "Error: HTTP " + str(response.status_code) + " error"

        return response, None
    except ValueError as ve:
        # Handle invalid URL format
        return None, "Error: " + str(ve)
    except requests.exceptions.RequestException as re:
        # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.)
        return None, "Error: " + str(re)

def scrape_text(url):
    """Scrape text from a webpage"""
    # Basic check if the URL is valid
    if not url.startswith('http'):
        return "Error: Invalid URL"

    # Restrict access to local files
    if check_local_file_access(url):
        return "Error: Access to local files is restricted"

    # Validate the input URL
    if not is_valid_url(url):
        # Sanitize the input URL
        sanitized_url = sanitize_url(url)

        # Make the request with a timeout and handle exceptions
        response = make_request(sanitized_url)

        if isinstance(response, str):
            return response
    else:
        # Sanitize the input URL
        sanitized_url = sanitize_url(url)
        response = requests.get(sanitized_url, headers=cfg.user_agent_header)

    response, error_message = get_response(url)
    if error_message:
        return error_message

    soup = BeautifulSoup(response.text, "html.parser")
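As a quick illustration of what the sanitize_url() one-liner in the hunk above actually strips, here is a standalone snippet (not part of the diff): the query string and fragment are dropped, while scheme, host, and path are kept.

from urllib.parse import urljoin, urlparse

def sanitize_url(url):
    # Same one-liner as above: keep scheme/host/path, drop query string and fragment.
    return urljoin(url, urlparse(url).path)

print(sanitize_url("https://example.com/search?q=test#results"))
# -> https://example.com/search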
@@ -89,11 +92,9 @@ def format_hyperlinks(hyperlinks):
def scrape_links(url):
    """Scrape links from a webpage"""
    response = requests.get(url, headers=cfg.user_agent_header)

    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        return "error"

    response, error_message = get_response(url)
    if error_message:
        return error_message

    soup = BeautifulSoup(response.text, "html.parser")
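For reference, the extract_hyperlinks() and format_hyperlinks() helpers that scrape_links() calls later produce "link text (url)" strings. A minimal sketch consistent with the assertions in the new tests; the repository's actual implementations may differ in detail:

from bs4 import BeautifulSoup

def extract_hyperlinks(soup):
    # Collect (text, href) pairs from all anchor tags that carry an href.
    return [(link.text, link["href"]) for link in soup.find_all("a", href=True)]

def format_hyperlinks(hyperlinks):
    # Render each pair as "link text (url)", the format the new tests assert on.
    return [f"{text} ({url})" for text, url in hyperlinks]

soup = BeautifulSoup("<a href='https://github.com'>GitHub</a>", "html.parser")
print(format_hyperlinks(extract_hyperlinks(soup)))  # ['GitHub (https://github.com)']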
@@ -131,6 +132,7 @@ def create_message(chunk, question):
"content": f"\"\"\"{chunk}\"\"\" Using the above text, please answer the following question: \"{question}\" -- if the question cannot be answered using the text, please summarize the text."
}
def summarize_text(text, question):
"""Summarize text using the LLM model"""
if not text:

View File

@@ -0,0 +1,118 @@
# Generated by CodiumAI

# Dependencies:
# pip install pytest-mock
import pytest

from scripts.browse import scrape_links
"""
Code Analysis
Objective:
The objective of the 'scrape_links' function is to scrape hyperlinks from a
given URL and return them in a formatted way.
Inputs:
- url: a string representing the URL to be scraped.
Flow:
1. Send a GET request to the given URL using the requests library and the user agent header from the config file.
2. Check if the response contains an HTTP error. If it does, return "error".
3. Parse the HTML content of the response using the BeautifulSoup library.
4. Remove any script and style tags from the parsed HTML.
5. Extract all hyperlinks from the parsed HTML using the 'extract_hyperlinks' function.
6. Format the extracted hyperlinks using the 'format_hyperlinks' function.
7. Return the formatted hyperlinks.
Outputs:
- A list of formatted hyperlinks.
Additional aspects:
- The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP
requests and parse HTML content, respectively.
- The 'extract_hyperlinks' function is called to extract hyperlinks from the parsed HTML.
- The 'format_hyperlinks' function is called to format the extracted hyperlinks.
- The function checks for HTTP errors and returns "error" if any are found.
"""
class TestScrapeLinks:

    # Tests that the function returns a list of formatted hyperlinks when
    # provided with a valid url that returns a webpage with hyperlinks.
    def test_valid_url_with_hyperlinks(self):
        url = "https://www.google.com"
        result = scrape_links(url)
        assert len(result) > 0
        assert isinstance(result, list)
        assert isinstance(result[0], str)

    # Tests that the function returns correctly formatted hyperlinks when given a valid url.
    def test_valid_url(self, mocker):
        # Mock the requests.get() function to return a response with sample HTML containing hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body><a href='https://www.google.com'>Google</a></body></html>"
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with a valid URL
        result = scrape_links("https://www.example.com")

        # Assert that the function returns correctly formatted hyperlinks
        assert result == ["Google (https://www.google.com)"]
    # Tests that the function returns an error message when the target URL responds with an HTTP error.
    def test_invalid_url(self, mocker):
        # Mock the requests.get() function to return an HTTP error response
        mock_response = mocker.Mock()
        mock_response.status_code = 404
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with an invalid URL
        result = scrape_links("https://www.invalidurl.com")

        # Assert that the function returns an error message
        assert "Error:" in result
    # Tests that the function returns an empty list when the html contains no hyperlinks.
    def test_no_hyperlinks(self, mocker):
        # Mock the requests.get() function to return a response with sample HTML containing no hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body><p>No hyperlinks here</p></body></html>"
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with a URL containing no hyperlinks
        result = scrape_links("https://www.example.com")

        # Assert that the function returns an empty list
        assert result == []

    # Tests that scrape_links() correctly extracts and formats hyperlinks from
    # a sample HTML containing a few hyperlinks.
    def test_scrape_links_with_few_hyperlinks(self, mocker):
        # Mock the requests.get() function to return a response with a sample HTML containing hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = """
        <html>
        <body>
            <div id="google-link"><a href="https://www.google.com">Google</a></div>
            <div id="github"><a href="https://github.com">GitHub</a></div>
            <div id="CodiumAI"><a href="https://www.codium.ai">CodiumAI</a></div>
        </body>
        </html>
        """
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function being tested
        result = scrape_links("https://www.example.com")

        # Assert that the function returns a list of formatted hyperlinks
        assert isinstance(result, list)
        assert len(result) == 3
        assert result[0] == "Google (https://www.google.com)"
        assert result[1] == "GitHub (https://github.com)"
        assert result[2] == "CodiumAI (https://www.codium.ai)"
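These tests target pytest with the pytest-mock mocker fixture (pip install pytest-mock). Note that the first test, test_valid_url_with_hyperlinks, does not patch requests.get and therefore makes a live request to https://www.google.com, while the remaining tests run entirely against mocked responses.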

View File

@@ -2,7 +2,6 @@
# Generated by CodiumAI
import requests
import tests.context
from scripts.browse import scrape_text
@@ -10,7 +9,8 @@ from scripts.browse import scrape_text
Code Analysis
Objective:
The objective of the "scrape_text" function is to scrape the text content from a given URL and return it as a string, after removing any unwanted HTML tags and scripts.
The objective of the "scrape_text" function is to scrape the text content from
a given URL and return it as a string, after removing any unwanted HTML tags and scripts.
Inputs:
- url: a string representing the URL of the webpage to be scraped.
@@ -33,6 +33,7 @@ Additional aspects:
- The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text.
"""

class TestScrapeText:
    # Tests that scrape_text() returns the expected text when given a valid URL.