From b0cb247b83d9b0b6c6f7d153ad0dfe076b6327ac Mon Sep 17 00:00:00 2001
From: Itamar Friedman
Date: Mon, 10 Apr 2023 00:18:37 +0300
Subject: [PATCH 1/3] scrape_text: added tests + hande RequestException

---
 scripts/browse.py                |   5 +-
 tests/test_browse_scrape_text.py | 102 +++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_browse_scrape_text.py

diff --git a/scripts/browse.py b/scripts/browse.py
index 0fda3d7b..40e6ca1f 100644
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -6,7 +6,10 @@ from llm_utils import create_chat_completion
 cfg = Config()
 
 def scrape_text(url):
-    response = requests.get(url, headers=cfg.user_agent_header)
+    try:
+        response = requests.get(url, headers=cfg.user_agent_header)
+    except requests.exceptions.RequestException as e:
+        return "Error: " + str(e)
 
     # Check if the response contains an HTTP error
     if response.status_code >= 400:
diff --git a/tests/test_browse_scrape_text.py b/tests/test_browse_scrape_text.py
new file mode 100644
index 00000000..1a08367e
--- /dev/null
+++ b/tests/test_browse_scrape_text.py
@@ -0,0 +1,102 @@
+
+# Generated by CodiumAI
+
+import requests
+from unittest.mock import Mock
+import pytest
+
+from scripts.browse import scrape_text
+
+"""
+Code Analysis
+
+Objective:
+The objective of the "scrape_text" function is to scrape the text content from a given URL and return it as a string, after removing any unwanted HTML tags and scripts.
+
+Inputs:
+- url: a string representing the URL of the webpage to be scraped.
+
+Flow:
+1. Send a GET request to the given URL using the requests library and the user agent header from the config file.
+2. Check if the response contains an HTTP error. If it does, return an error message.
+3. Use BeautifulSoup to parse the HTML content of the response and extract all script and style tags.
+4. Get the text content of the remaining HTML using the get_text() method of BeautifulSoup.
+5. Split the text into lines and then into chunks, removing any extra whitespace.
+6. Join the chunks into a single string with newline characters between them.
+7. Return the cleaned text.
+
+Outputs:
+- A string representing the cleaned text content of the webpage.
+
+Additional aspects:
+- The function uses the requests library and BeautifulSoup to handle the HTTP request and HTML parsing, respectively.
+- The function removes script and style tags from the HTML to avoid including unwanted content in the text output.
+- The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text.
+"""
+
+
+
+class TestScrapeText:
+
+    # Tests that scrape_text() returns the expected text when given a valid URL.
+    def test_scrape_text_with_valid_url(self, mocker):
+        # Mock the requests.get() method to return a response with expected text
+        expected_text = "This is some sample text"
+        mock_response = mocker.Mock()
+        mock_response.status_code = 200
+        mock_response.text = f"<html><body><div><p>{expected_text}</p></div></body></html>"
+        mocker.patch("requests.get", return_value=mock_response)
+
+        # Call the function with a valid URL and assert that it returns the expected text
+        url = "http://www.example.com"
+        assert scrape_text(url) == expected_text
+
+    # Tests that the function returns an error message when an invalid or unreachable url is provided.
+    def test_invalid_url(self, mocker):
+        # Mock the requests.get() method to raise an exception
+        mocker.patch("requests.get", side_effect=requests.exceptions.RequestException)
+
+        # Call the function with an invalid URL and assert that it returns an error message
+        url = "http://www.invalidurl.com"
+        error_message = scrape_text(url)
+        assert "Error:" in error_message
+
+    # Tests that the function returns an empty string when the html page contains no text to be scraped.
+    def test_no_text(self, mocker):
+        # Mock the requests.get() method to return a response with no text
+        mock_response = mocker.Mock()
+        mock_response.status_code = 200
+        mock_response.text = ""
+        mocker.patch("requests.get", return_value=mock_response)
+
+        # Call the function with a valid URL and assert that it returns an empty string
+        url = "http://www.example.com"
+        assert scrape_text(url) == ""
+
+    # Tests that the function returns an error message when the response status code is an http error (>=400).
+    def test_http_error(self, mocker):
+        # Mock the requests.get() method to return a response with a 404 status code
+        mocker.patch('requests.get', return_value=Mock(status_code=404))
+
+        # Call the function with a URL
+        result = scrape_text("https://www.example.com")
+
+        # Check that the function returns an error message
+        assert result == "Error: HTTP 404 error"
+
+    # Tests that scrape_text() properly handles HTML tags.
+    def test_scrape_text_with_html_tags(self):
+        # Create a mock response object with HTML containing tags
+        html = "<html><body><p>This is <b>bold</b> text.</p></body></html>"
+        response = Mock()
+        response.status_code = 200
+        response.text = html
+
+        # Mock the requests.get() method to return the mock response object
+        requests.get = Mock(return_value=response)
+
+        # Call the function with a URL
+        result = scrape_text("https://www.example.com")
+
+        # Check that the function properly handles HTML tags
+        assert result == "This is bold text."
\ No newline at end of file

From 06f26cb29c6980c2c521780c83972fc09db819e4 Mon Sep 17 00:00:00 2001
From: Itamar Friedman
Date: Mon, 10 Apr 2023 08:19:41 +0300
Subject: [PATCH 2/3] remove dependency of unittest, use pytest

---
 tests/test_browse_scrape_text.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/tests/test_browse_scrape_text.py b/tests/test_browse_scrape_text.py
index 1a08367e..27ebc0f6 100644
--- a/tests/test_browse_scrape_text.py
+++ b/tests/test_browse_scrape_text.py
@@ -2,7 +2,6 @@
 # Generated by CodiumAI
 
 import requests
-from unittest.mock import Mock
 import pytest
 
 from scripts.browse import scrape_text
@@ -76,7 +75,7 @@ class TestScrapeText:
     # Tests that the function returns an error message when the response status code is an http error (>=400).
     def test_http_error(self, mocker):
         # Mock the requests.get() method to return a response with a 404 status code
-        mocker.patch('requests.get', return_value=Mock(status_code=404))
+        mocker.patch('requests.get', return_value=mocker.Mock(status_code=404))
 
         # Call the function with a URL
         result = scrape_text("https://www.example.com")
@@ -85,15 +84,13 @@ class TestScrapeText:
         assert result == "Error: HTTP 404 error"
 
     # Tests that scrape_text() properly handles HTML tags.
-    def test_scrape_text_with_html_tags(self):
+    def test_scrape_text_with_html_tags(self, mocker):
         # Create a mock response object with HTML containing tags
         html = "<html><body><p>This is <b>bold</b> text.</p></body></html>"
-        response = Mock()
-        response.status_code = 200
-        response.text = html
-
-        # Mock the requests.get() method to return the mock response object
-        requests.get = Mock(return_value=response)
+        mock_response = mocker.Mock()
+        mock_response.status_code = 200
+        mock_response.text = html
+        mocker.patch("requests.get", return_value=mock_response)
 
         # Call the function with a URL
         result = scrape_text("https://www.example.com")

From da4a045bd6aa85805ff30493f7f0b00050a2dc80 Mon Sep 17 00:00:00 2001
From: Itamar Friedman
Date: Mon, 10 Apr 2023 08:26:46 +0300
Subject: [PATCH 3/3] Adding most basic URL validation in scrape_text

---
 scripts/browse.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/browse.py b/scripts/browse.py
index 40e6ca1f..7eeaaf4d 100644
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -6,6 +6,10 @@ from llm_utils import create_chat_completion
 cfg = Config()
 
 def scrape_text(url):
+    # Most basic check if the URL is valid:
+    if not url.startswith('http'):
+        return "Error: Invalid URL"
+
     try:
         response = requests.get(url, headers=cfg.user_agent_header)
     except requests.exceptions.RequestException as e: