mirror of https://github.com/aljazceru/Auto-GPT.git, synced 2026-02-04 22:04:35 +01:00
Merge pull request #780 from coditamar/browse_scrape_links_test_and_validate
browse: (1) apply url validation also to scrape_links(), (2) add unit-tests for scrape_links()
scripts/browse.py
@@ -6,6 +6,7 @@ from urllib.parse import urlparse, urljoin
 
 cfg = Config()
 
+
 # Function to check if the URL is valid
 def is_valid_url(url):
     try:
@@ -14,49 +15,51 @@ def is_valid_url(url):
     except ValueError:
         return False
 
 
 # Function to sanitize the URL
 def sanitize_url(url):
     return urljoin(url, urlparse(url).path)
 
-# Function to make a request with a specified timeout and handle exceptions
-def make_request(url, timeout=10):
-    try:
-        response = requests.get(url, headers=cfg.user_agent_header, timeout=timeout)
-        response.raise_for_status()
-        return response
-    except requests.exceptions.RequestException as e:
-        return "Error: " + str(e)
-
 # Define and check for local file address prefixes
 def check_local_file_access(url):
     local_prefixes = ['file:///', 'file://localhost', 'http://localhost', 'https://localhost']
     return any(url.startswith(prefix) for prefix in local_prefixes)
 
 
+def get_response(url, headers=cfg.user_agent_header, timeout=10):
+    try:
+        # Restrict access to local files
+        if check_local_file_access(url):
+            raise ValueError('Access to local files is restricted')
+
+        # Most basic check if the URL is valid:
+        if not url.startswith('http://') and not url.startswith('https://'):
+            raise ValueError('Invalid URL format')
+
+        sanitized_url = sanitize_url(url)
+
+        response = requests.get(sanitized_url, headers=headers, timeout=timeout)
+
+        # Check if the response contains an HTTP error
+        if response.status_code >= 400:
+            return None, "Error: HTTP " + str(response.status_code) + " error"
+
+        return response, None
+    except ValueError as ve:
+        # Handle invalid URL format
+        return None, "Error: " + str(ve)
+
+    except requests.exceptions.RequestException as re:
+        # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.)
+        return None, "Error: " + str(re)
+
+
 def scrape_text(url):
     """Scrape text from a webpage"""
-    # Basic check if the URL is valid
-    if not url.startswith('http'):
-        return "Error: Invalid URL"
-
-    # Restrict access to local files
-    if check_local_file_access(url):
-        return "Error: Access to local files is restricted"
-
-    # Validate the input URL
-    if not is_valid_url(url):
-        # Sanitize the input URL
-        sanitized_url = sanitize_url(url)
-
-        # Make the request with a timeout and handle exceptions
-        response = make_request(sanitized_url)
-
-        if isinstance(response, str):
-            return response
-    else:
-        # Sanitize the input URL
-        sanitized_url = sanitize_url(url)
-
-        response = requests.get(sanitized_url, headers=cfg.user_agent_header)
+    response, error_message = get_response(url)
+    if error_message:
+        return error_message
 
     soup = BeautifulSoup(response.text, "html.parser")
 
@@ -89,11 +92,9 @@ def format_hyperlinks(hyperlinks):
 
 def scrape_links(url):
     """Scrape links from a webpage"""
-    response = requests.get(url, headers=cfg.user_agent_header)
-
-    # Check if the response contains an HTTP error
-    if response.status_code >= 400:
-        return "error"
+    response, error_message = get_response(url)
+    if error_message:
+        return error_message
 
     soup = BeautifulSoup(response.text, "html.parser")
 
@@ -131,6 +132,7 @@ def create_message(chunk, question):
         "content": f"\"\"\"{chunk}\"\"\" Using the above text, please answer the following question: \"{question}\" -- if the question cannot be answered using the text, please summarize the text."
     }
 
+
 def summarize_text(text, question):
     """Summarize text using the LLM model"""
     if not text:
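
Taken together, the browse.py hunks fold the old ad-hoc request handling (make_request plus inline validation) into a single get_response helper that returns a (response, error_message) pair. The following is a minimal sketch of the new calling contract, using only names visible in the diff above; the example URL is illustrative and not part of the change:

    from scripts.browse import check_local_file_access, get_response

    # Local addresses are rejected before any request is made
    assert check_local_file_access("file:///etc/passwd")
    assert not check_local_file_access("https://example.com")

    # Exactly one element of the pair is None: a requests.Response on
    # success, or an "Error: ..." string on any validation or HTTP failure
    response, error_message = get_response("https://example.com")
    if error_message:
        print(error_message)        # e.g. "Error: HTTP 404 error"
    else:
        print(response.text[:100])  # first bytes of the page body

Because both scrape_text and scrape_links now route through this one helper, the local-file restriction and URL-format check apply uniformly, which is the point of the PR.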
tests/unit/test_browse_scrape_links.py (new file, 118 lines)
@@ -0,0 +1,118 @@
# Generated by CodiumAI

# Dependencies:
# pip install pytest-mock
import pytest

from scripts.browse import scrape_links

"""
Code Analysis

Objective:
The objective of the 'scrape_links' function is to scrape hyperlinks from a
given URL and return them in a formatted way.

Inputs:
- url: a string representing the URL to be scraped.

Flow:
1. Send a GET request to the given URL using the requests library and the user agent header from the config file.
2. Check if the response contains an HTTP error. If it does, return "error".
3. Parse the HTML content of the response using the BeautifulSoup library.
4. Remove any script and style tags from the parsed HTML.
5. Extract all hyperlinks from the parsed HTML using the 'extract_hyperlinks' function.
6. Format the extracted hyperlinks using the 'format_hyperlinks' function.
7. Return the formatted hyperlinks.

Outputs:
- A list of formatted hyperlinks.

Additional aspects:
- The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP
requests and parse HTML content, respectively.
- The 'extract_hyperlinks' function is called to extract hyperlinks from the parsed HTML.
- The 'format_hyperlinks' function is called to format the extracted hyperlinks.
- The function checks for HTTP errors and returns "error" if any are found.
"""


class TestScrapeLinks:

    # Tests that the function returns a list of formatted hyperlinks when
    # provided with a valid url that returns a webpage with hyperlinks.
    def test_valid_url_with_hyperlinks(self):
        url = "https://www.google.com"
        result = scrape_links(url)
        assert len(result) > 0
        assert isinstance(result, list)
        assert isinstance(result[0], str)

    # Tests that the function returns correctly formatted hyperlinks when given a valid url.
    def test_valid_url(self, mocker):
        # Mock the requests.get() function to return a response with sample HTML containing hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body><a href='https://www.google.com'>Google</a></body></html>"
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with a valid URL
        result = scrape_links("https://www.example.com")

        # Assert that the function returns correctly formatted hyperlinks
        assert result == ["Google (https://www.google.com)"]

    # Tests that the function returns "error" when given an invalid url.
    def test_invalid_url(self, mocker):
        # Mock the requests.get() function to return an HTTP error response
        mock_response = mocker.Mock()
        mock_response.status_code = 404
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with an invalid URL
        result = scrape_links("https://www.invalidurl.com")

        # Assert that the function returns "error"
        assert "Error:" in result

    # Tests that the function returns an empty list when the html contains no hyperlinks.
    def test_no_hyperlinks(self, mocker):
        # Mock the requests.get() function to return a response with sample HTML containing no hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body><p>No hyperlinks here</p></body></html>"
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with a URL containing no hyperlinks
        result = scrape_links("https://www.example.com")

        # Assert that the function returns an empty list
        assert result == []

    # Tests that scrape_links() correctly extracts and formats hyperlinks from
    # a sample HTML containing a few hyperlinks.
    def test_scrape_links_with_few_hyperlinks(self, mocker):
        # Mock the requests.get() function to return a response with a sample HTML containing hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = """
        <html>
        <body>
            <div id="google-link"><a href="https://www.google.com">Google</a></div>
            <div id="github"><a href="https://github.com">GitHub</a></div>
            <div id="CodiumAI"><a href="https://www.codium.ai">CodiumAI</a></div>
        </body>
        </html>
        """
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function being tested
        result = scrape_links("https://www.example.com")

        # Assert that the function returns a list of formatted hyperlinks
        assert isinstance(result, list)
        assert len(result) == 3
        assert result[0] == "Google (https://www.google.com)"
        assert result[1] == "GitHub (https://github.com)"
        assert result[2] == "CodiumAI (https://www.codium.ai)"
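
The Code Analysis block above names two helpers, extract_hyperlinks and format_hyperlinks, whose bodies are not part of this diff. A plausible sketch of their shape, inferred only from the assertions in the tests above; the real definitions live in scripts/browse.py and may differ:

    from bs4 import BeautifulSoup

    def extract_hyperlinks(soup):
        # Collect (link text, href) pairs for every anchor in the parsed page
        return [(link.text, link["href"]) for link in soup.find_all("a", href=True)]

    def format_hyperlinks(hyperlinks):
        # Render each pair as "Google (https://www.google.com)",
        # the exact format the test assertions check for
        return [f"{link_text} ({link_url})" for link_text, link_url in hyperlinks]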
tests/unit/test_browse_scrape_text.py
@@ -2,7 +2,6 @@
 # Generated by CodiumAI
 
-import requests
 import tests.context
 
 from scripts.browse import scrape_text
 
@@ -10,7 +9,8 @@ from scripts.browse import scrape_text
 Code Analysis
 
 Objective:
-The objective of the "scrape_text" function is to scrape the text content from a given URL and return it as a string, after removing any unwanted HTML tags and scripts.
+The objective of the "scrape_text" function is to scrape the text content from
+a given URL and return it as a string, after removing any unwanted HTML tags and scripts.
 
 Inputs:
 - url: a string representing the URL of the webpage to be scraped.
@@ -33,6 +33,7 @@ Additional aspects:
 - The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text.
 """
 
+
 class TestScrapeText:
 
     # Tests that scrape_text() returns the expected text when given a valid URL.
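
With pytest and pytest-mock installed (the new test file's own header notes the pytest-mock dependency), the suites are intended to run from the repository root, e.g. pytest tests/unit/. After this change the scrape_text contract mirrors scrape_links: the caller receives either the extracted page text or an "Error: ..." string. A minimal usage sketch; the URL is illustrative:

    from scripts.browse import scrape_text

    # Either the page's text content or an "Error: ..." string
    text = scrape_text("https://www.example.com")
    if text.startswith("Error:"):
        print(text)         # validation or HTTP failure, via get_response
    else:
        print(text[:200])   # first characters of the scraped text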