diff --git a/scripts/browse.py b/scripts/browse.py index 0fda3d7b..7eeaaf4d 100644 --- a/scripts/browse.py +++ b/scripts/browse.py @@ -6,7 +6,14 @@ from llm_utils import create_chat_completion cfg = Config() def scrape_text(url): - response = requests.get(url, headers=cfg.user_agent_header) + # Most basic check if the URL is valid: + if not url.startswith('http'): + return "Error: Invalid URL" + + try: + response = requests.get(url, headers=cfg.user_agent_header) + except requests.exceptions.RequestException as e: + return "Error: " + str(e) # Check if the response contains an HTTP error if response.status_code >= 400: diff --git a/tests/test_browse_scrape_text.py b/tests/test_browse_scrape_text.py new file mode 100644 index 00000000..27ebc0f6 --- /dev/null +++ b/tests/test_browse_scrape_text.py @@ -0,0 +1,99 @@ + +# Generated by CodiumAI + +import requests +import pytest + +from scripts.browse import scrape_text + +""" +Code Analysis + +Objective: +The objective of the "scrape_text" function is to scrape the text content from a given URL and return it as a string, after removing any unwanted HTML tags and scripts. + +Inputs: +- url: a string representing the URL of the webpage to be scraped. + +Flow: +1. Send a GET request to the given URL using the requests library and the user agent header from the config file. +2. Check if the response contains an HTTP error. If it does, return an error message. +3. Use BeautifulSoup to parse the HTML content of the response and extract all script and style tags. +4. Get the text content of the remaining HTML using the get_text() method of BeautifulSoup. +5. Split the text into lines and then into chunks, removing any extra whitespace. +6. Join the chunks into a single string with newline characters between them. +7. Return the cleaned text. + +Outputs: +- A string representing the cleaned text content of the webpage. + +Additional aspects: +- The function uses the requests library and BeautifulSoup to handle the HTTP request and HTML parsing, respectively. +- The function removes script and style tags from the HTML to avoid including unwanted content in the text output. +- The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text. +""" + + + +class TestScrapeText: + + # Tests that scrape_text() returns the expected text when given a valid URL. + def test_scrape_text_with_valid_url(self, mocker): + # Mock the requests.get() method to return a response with expected text + expected_text = "This is some sample text" + mock_response = mocker.Mock() + mock_response.status_code = 200 + mock_response.text = f"

{expected_text}

" + mocker.patch("requests.get", return_value=mock_response) + + # Call the function with a valid URL and assert that it returns the expected text + url = "http://www.example.com" + assert scrape_text(url) == expected_text + + # Tests that the function returns an error message when an invalid or unreachable url is provided. + def test_invalid_url(self, mocker): + # Mock the requests.get() method to raise an exception + mocker.patch("requests.get", side_effect=requests.exceptions.RequestException) + + # Call the function with an invalid URL and assert that it returns an error message + url = "http://www.invalidurl.com" + error_message = scrape_text(url) + assert "Error:" in error_message + + # Tests that the function returns an empty string when the html page contains no text to be scraped. + def test_no_text(self, mocker): + # Mock the requests.get() method to return a response with no text + mock_response = mocker.Mock() + mock_response.status_code = 200 + mock_response.text = "" + mocker.patch("requests.get", return_value=mock_response) + + # Call the function with a valid URL and assert that it returns an empty string + url = "http://www.example.com" + assert scrape_text(url) == "" + + # Tests that the function returns an error message when the response status code is an http error (>=400). + def test_http_error(self, mocker): + # Mock the requests.get() method to return a response with a 404 status code + mocker.patch('requests.get', return_value=mocker.Mock(status_code=404)) + + # Call the function with a URL + result = scrape_text("https://www.example.com") + + # Check that the function returns an error message + assert result == "Error: HTTP 404 error" + + # Tests that scrape_text() properly handles HTML tags. + def test_scrape_text_with_html_tags(self, mocker): + # Create a mock response object with HTML containing tags + html = "

This is bold text.

" + mock_response = mocker.Mock() + mock_response.status_code = 200 + mock_response.text = html + mocker.patch("requests.get", return_value=mock_response) + + # Call the function with a URL + result = scrape_text("https://www.example.com") + + # Check that the function properly handles HTML tags + assert result == "This is bold text." \ No newline at end of file