Merge pull request #780 from coditamar/browse_scrape_links_test_and_validate

browse: (1) apply url validation also to scrape_links(), (2) add unit-tests for scrape_links()
Richard Beales
2023-04-13 07:10:06 +01:00
committed by GitHub
3 changed files with 159 additions and 38 deletions
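The practical effect on scrape_links(), condensed from the browse.py hunks below: the direct, unvalidated request and the bare "error" return are replaced by the shared get_response() helper, which blocks local-file URLs, sanitizes the address, and reports a descriptive error message instead.

# Condensed before/after of scrape_links(), excerpted from the diff below (not a full listing):
# before:
response = requests.get(url, headers=cfg.user_agent_header)
if response.status_code >= 400:
    return "error"
# after:
response, error_message = get_response(url)
if error_message:
    return error_message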

View File

@@ -6,6 +6,7 @@ from urllib.parse import urlparse, urljoin
cfg = Config()

# Function to check if the URL is valid
def is_valid_url(url):
    try:
@@ -14,49 +15,51 @@ def is_valid_url(url):
    except ValueError:
        return False

# Function to sanitize the URL
def sanitize_url(url):
    return urljoin(url, urlparse(url).path)

# Function to make a request with a specified timeout and handle exceptions
def make_request(url, timeout=10):
    try:
        response = requests.get(url, headers=cfg.user_agent_header, timeout=timeout)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        return "Error: " + str(e)

# Define and check for local file address prefixes
def check_local_file_access(url):
    local_prefixes = ['file:///', 'file://localhost', 'http://localhost', 'https://localhost']
    return any(url.startswith(prefix) for prefix in local_prefixes)

def get_response(url, headers=cfg.user_agent_header, timeout=10):
    try:
        # Restrict access to local files
        if check_local_file_access(url):
            raise ValueError('Access to local files is restricted')

        # Most basic check if the URL is valid:
        if not url.startswith('http://') and not url.startswith('https://'):
            raise ValueError('Invalid URL format')

        sanitized_url = sanitize_url(url)

        response = requests.get(sanitized_url, headers=headers, timeout=timeout)

        # Check if the response contains an HTTP error
        if response.status_code >= 400:
            return None, "Error: HTTP " + str(response.status_code) + " error"

        return response, None
    except ValueError as ve:
        # Handle invalid URL format
        return None, "Error: " + str(ve)
    except requests.exceptions.RequestException as re:
        # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.)
        return None, "Error: " + str(re)

def scrape_text(url):
    """Scrape text from a webpage"""
    # Basic check if the URL is valid
    if not url.startswith('http'):
        return "Error: Invalid URL"

    # Restrict access to local files
    if check_local_file_access(url):
        return "Error: Access to local files is restricted"

    # Validate the input URL
    if not is_valid_url(url):
        # Sanitize the input URL
        sanitized_url = sanitize_url(url)

        # Make the request with a timeout and handle exceptions
        response = make_request(sanitized_url)

        if isinstance(response, str):
            return response
    else:
        # Sanitize the input URL
        sanitized_url = sanitize_url(url)
        response = requests.get(sanitized_url, headers=cfg.user_agent_header)

    response, error_message = get_response(url)
    if error_message:
        return error_message

    soup = BeautifulSoup(response.text, "html.parser")
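As a quick illustration of what the sanitize_url() one-liner in the hunk above actually strips, here is a standalone snippet (not part of the diff): the query string and fragment are dropped, while scheme, host, and path are kept.

from urllib.parse import urljoin, urlparse

def sanitize_url(url):
    # Same one-liner as above: keep scheme/host/path, drop query string and fragment.
    return urljoin(url, urlparse(url).path)

print(sanitize_url("https://example.com/search?q=test#results"))
# -> https://example.com/search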
@@ -89,11 +92,9 @@ def format_hyperlinks(hyperlinks):
def scrape_links(url):
    """Scrape links from a webpage"""
    response = requests.get(url, headers=cfg.user_agent_header)

    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        return "error"

    response, error_message = get_response(url)
    if error_message:
        return error_message

    soup = BeautifulSoup(response.text, "html.parser")
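For reference, the extract_hyperlinks() and format_hyperlinks() helpers that scrape_links() calls later produce "link text (url)" strings. A minimal sketch consistent with the assertions in the new tests; the repository's actual implementations may differ in detail:

from bs4 import BeautifulSoup

def extract_hyperlinks(soup):
    # Collect (text, href) pairs from all anchor tags that carry an href.
    return [(link.text, link["href"]) for link in soup.find_all("a", href=True)]

def format_hyperlinks(hyperlinks):
    # Render each pair as "link text (url)", the format the new tests assert on.
    return [f"{text} ({url})" for text, url in hyperlinks]

soup = BeautifulSoup("<a href='https://github.com'>GitHub</a>", "html.parser")
print(format_hyperlinks(extract_hyperlinks(soup)))  # ['GitHub (https://github.com)']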
@@ -131,6 +132,7 @@ def create_message(chunk, question):
"content": f"\"\"\"{chunk}\"\"\" Using the above text, please answer the following question: \"{question}\" -- if the question cannot be answered using the text, please summarize the text."
}
def summarize_text(text, question):
"""Summarize text using the LLM model"""
if not text:

View File

@@ -0,0 +1,118 @@
# Generated by CodiumAI

# Dependencies:
# pip install pytest-mock
import pytest

from scripts.browse import scrape_links
"""
Code Analysis
Objective:
The objective of the 'scrape_links' function is to scrape hyperlinks from a
given URL and return them in a formatted way.
Inputs:
- url: a string representing the URL to be scraped.
Flow:
1. Send a GET request to the given URL using the requests library and the user agent header from the config file.
2. Check if the response contains an HTTP error. If it does, return "error".
3. Parse the HTML content of the response using the BeautifulSoup library.
4. Remove any script and style tags from the parsed HTML.
5. Extract all hyperlinks from the parsed HTML using the 'extract_hyperlinks' function.
6. Format the extracted hyperlinks using the 'format_hyperlinks' function.
7. Return the formatted hyperlinks.
Outputs:
- A list of formatted hyperlinks.
Additional aspects:
- The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP
requests and parse HTML content, respectively.
- The 'extract_hyperlinks' function is called to extract hyperlinks from the parsed HTML.
- The 'format_hyperlinks' function is called to format the extracted hyperlinks.
- The function checks for HTTP errors and returns "error" if any are found.
"""
class TestScrapeLinks:

    # Tests that the function returns a list of formatted hyperlinks when
    # provided with a valid url that returns a webpage with hyperlinks.
    def test_valid_url_with_hyperlinks(self):
        url = "https://www.google.com"
        result = scrape_links(url)
        assert len(result) > 0
        assert isinstance(result, list)
        assert isinstance(result[0], str)

    # Tests that the function returns correctly formatted hyperlinks when given a valid url.
    def test_valid_url(self, mocker):
        # Mock the requests.get() function to return a response with sample HTML containing hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body><a href='https://www.google.com'>Google</a></body></html>"
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with a valid URL
        result = scrape_links("https://www.example.com")

        # Assert that the function returns correctly formatted hyperlinks
        assert result == ["Google (https://www.google.com)"]
    # Tests that the function returns an error message when the target URL responds with an HTTP error.
    def test_invalid_url(self, mocker):
        # Mock the requests.get() function to return an HTTP error response
        mock_response = mocker.Mock()
        mock_response.status_code = 404
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with an invalid URL
        result = scrape_links("https://www.invalidurl.com")

        # Assert that the function returns an error message
        assert "Error:" in result
    # Tests that the function returns an empty list when the html contains no hyperlinks.
    def test_no_hyperlinks(self, mocker):
        # Mock the requests.get() function to return a response with sample HTML containing no hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body><p>No hyperlinks here</p></body></html>"
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with a URL containing no hyperlinks
        result = scrape_links("https://www.example.com")

        # Assert that the function returns an empty list
        assert result == []

    # Tests that scrape_links() correctly extracts and formats hyperlinks from
    # a sample HTML containing a few hyperlinks.
    def test_scrape_links_with_few_hyperlinks(self, mocker):
        # Mock the requests.get() function to return a response with a sample HTML containing hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = """
        <html>
        <body>
            <div id="google-link"><a href="https://www.google.com">Google</a></div>
            <div id="github"><a href="https://github.com">GitHub</a></div>
            <div id="CodiumAI"><a href="https://www.codium.ai">CodiumAI</a></div>
        </body>
        </html>
        """
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function being tested
        result = scrape_links("https://www.example.com")

        # Assert that the function returns a list of formatted hyperlinks
        assert isinstance(result, list)
        assert len(result) == 3
        assert result[0] == "Google (https://www.google.com)"
        assert result[1] == "GitHub (https://github.com)"
        assert result[2] == "CodiumAI (https://www.codium.ai)"
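These tests target pytest with the pytest-mock mocker fixture (pip install pytest-mock). Note that the first test, test_valid_url_with_hyperlinks, does not patch requests.get and therefore makes a live request to https://www.google.com, while the remaining tests run entirely against mocked responses.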

View File

@@ -2,7 +2,6 @@
# Generated by CodiumAI
import requests
import tests.context
from scripts.browse import scrape_text
@@ -10,7 +9,8 @@ from scripts.browse import scrape_text
Code Analysis
Objective:
The objective of the "scrape_text" function is to scrape the text content from a given URL and return it as a string, after removing any unwanted HTML tags and scripts.
The objective of the "scrape_text" function is to scrape the text content from
a given URL and return it as a string, after removing any unwanted HTML tags and scripts.
Inputs:
- url: a string representing the URL of the webpage to be scraped.
@@ -33,6 +33,7 @@ Additional aspects:
- The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text.
"""

class TestScrapeText:
    # Tests that scrape_text() returns the expected text when given a valid URL.