From b0cb247b83d9b0b6c6f7d153ad0dfe076b6327ac Mon Sep 17 00:00:00 2001
From: Itamar Friedman
Date: Mon, 10 Apr 2023 00:18:37 +0300
Subject: [PATCH 1/3] scrape_text: added tests + hande RequestException

---
 scripts/browse.py                |   5 +-
 tests/test_browse_scrape_text.py | 102 +++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_browse_scrape_text.py

diff --git a/scripts/browse.py b/scripts/browse.py
index 0fda3d7b..40e6ca1f 100644
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -6,7 +6,10 @@ from llm_utils import create_chat_completion
 cfg = Config()
 
 def scrape_text(url):
-    response = requests.get(url, headers=cfg.user_agent_header)
+    try:
+        response = requests.get(url, headers=cfg.user_agent_header)
+    except requests.exceptions.RequestException as e:
+        return "Error: " + str(e)
 
     # Check if the response contains an HTTP error
     if response.status_code >= 400:
diff --git a/tests/test_browse_scrape_text.py b/tests/test_browse_scrape_text.py
new file mode 100644
index 00000000..1a08367e
--- /dev/null
+++ b/tests/test_browse_scrape_text.py
@@ -0,0 +1,102 @@
+
+# Generated by CodiumAI
+
+import requests
+from unittest.mock import Mock
+import pytest
+
+from scripts.browse import scrape_text
+
+"""
+Code Analysis
+
+Objective:
+The objective of the "scrape_text" function is to scrape the text content from a given URL and return it as a string, after removing any unwanted HTML tags and scripts.
+
+Inputs:
+- url: a string representing the URL of the webpage to be scraped.
+
+Flow:
+1. Send a GET request to the given URL using the requests library and the user agent header from the config file.
+2. Check if the response contains an HTTP error. If it does, return an error message.
+3. Use BeautifulSoup to parse the HTML content of the response and extract all script and style tags.
+4. Get the text content of the remaining HTML using the get_text() method of BeautifulSoup.
+5. Split the text into lines and then into chunks, removing any extra whitespace.
+6. Join the chunks into a single string with newline characters between them.
+7. Return the cleaned text.
+
+Outputs:
+- A string representing the cleaned text content of the webpage.
+
+Additional aspects:
+- The function uses the requests library and BeautifulSoup to handle the HTTP request and HTML parsing, respectively.
+- The function removes script and style tags from the HTML to avoid including unwanted content in the text output.
+- The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text.
+"""
+
+
+
+class TestScrapeText:
+
+    # Tests that scrape_text() returns the expected text when given a valid URL.
+    def test_scrape_text_with_valid_url(self, mocker):
+        # Mock the requests.get() method to return a response with expected text
+        expected_text = "This is some sample text"
+        mock_response = mocker.Mock()
+        mock_response.status_code = 200
+        mock_response.text = f"<html><body><div><p>{expected_text}</p></div></body></html>"
+        mocker.patch("requests.get", return_value=mock_response)
+
+        # Call the function with a valid URL and assert that it returns the expected text
+        url = "http://www.example.com"
+        assert scrape_text(url) == expected_text
+
+    # Tests that the function returns an error message when an invalid or unreachable url is provided.
+    def test_invalid_url(self, mocker):
+        # Mock the requests.get() method to raise an exception
+        mocker.patch("requests.get", side_effect=requests.exceptions.RequestException)
+
+        # Call the function with an invalid URL and assert that it returns an error message
+        url = "http://www.invalidurl.com"
+        error_message = scrape_text(url)
+        assert "Error:" in error_message
+
+    # Tests that the function returns an empty string when the html page contains no text to be scraped.
+    def test_no_text(self, mocker):
+        # Mock the requests.get() method to return a response with no text
+        mock_response = mocker.Mock()
+        mock_response.status_code = 200
+        mock_response.text = ""
+        mocker.patch("requests.get", return_value=mock_response)
+
+        # Call the function with a valid URL and assert that it returns an empty string
+        url = "http://www.example.com"
+        assert scrape_text(url) == ""
+
+    # Tests that the function returns an error message when the response status code is an http error (>=400).
+    def test_http_error(self, mocker):
+        # Mock the requests.get() method to return a response with a 404 status code
+        mocker.patch('requests.get', return_value=Mock(status_code=404))
+
+        # Call the function with a URL
+        result = scrape_text("https://www.example.com")
+
+        # Check that the function returns an error message
+        assert result == "Error: HTTP 404 error"
+
+    # Tests that scrape_text() properly handles HTML tags.
+    def test_scrape_text_with_html_tags(self):
+        # Create a mock response object with HTML containing tags
+        html = "<html><body><p>This is <b>bold</b> text.</p></body></html>"
+        response = Mock()
+        response.status_code = 200
+        response.text = html
+
+        # Mock the requests.get() method to return the mock response object
+        requests.get = Mock(return_value=response)
+
+        # Call the function with a URL
+        result = scrape_text("https://www.example.com")
+
+        # Check that the function properly handles HTML tags
+        assert result == "This is bold text."
\ No newline at end of file

From 06f26cb29c6980c2c521780c83972fc09db819e4 Mon Sep 17 00:00:00 2001
From: Itamar Friedman
Date: Mon, 10 Apr 2023 08:19:41 +0300
Subject: [PATCH 2/3] remove dependency of unittest, use pytest

---
 tests/test_browse_scrape_text.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/tests/test_browse_scrape_text.py b/tests/test_browse_scrape_text.py
index 1a08367e..27ebc0f6 100644
--- a/tests/test_browse_scrape_text.py
+++ b/tests/test_browse_scrape_text.py
@@ -2,7 +2,6 @@
 # Generated by CodiumAI
 
 import requests
-from unittest.mock import Mock
 import pytest
 
 from scripts.browse import scrape_text
@@ -76,7 +75,7 @@ class TestScrapeText:
     # Tests that the function returns an error message when the response status code is an http error (>=400).
     def test_http_error(self, mocker):
         # Mock the requests.get() method to return a response with a 404 status code
-        mocker.patch('requests.get', return_value=Mock(status_code=404))
+        mocker.patch('requests.get', return_value=mocker.Mock(status_code=404))
 
         # Call the function with a URL
         result = scrape_text("https://www.example.com")
@@ -85,15 +84,13 @@ class TestScrapeText:
         assert result == "Error: HTTP 404 error"
 
     # Tests that scrape_text() properly handles HTML tags.
-    def test_scrape_text_with_html_tags(self):
+    def test_scrape_text_with_html_tags(self, mocker):
         # Create a mock response object with HTML containing tags
         html = "<html><body><p>This is <b>bold</b> text.</p></body></html>"
-        response = Mock()
-        response.status_code = 200
-        response.text = html
-
-        # Mock the requests.get() method to return the mock response object
-        requests.get = Mock(return_value=response)
+        mock_response = mocker.Mock()
+        mock_response.status_code = 200
+        mock_response.text = html
+        mocker.patch("requests.get", return_value=mock_response)
 
         # Call the function with a URL
         result = scrape_text("https://www.example.com")

From da4a045bd6aa85805ff30493f7f0b00050a2dc80 Mon Sep 17 00:00:00 2001
From: Itamar Friedman
Date: Mon, 10 Apr 2023 08:26:46 +0300
Subject: [PATCH 3/3] Adding most basic URL validation in scrape_text

---
 scripts/browse.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/browse.py b/scripts/browse.py
index 40e6ca1f..7eeaaf4d 100644
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -6,6 +6,10 @@ from llm_utils import create_chat_completion
 cfg = Config()
 
 def scrape_text(url):
+    # Most basic check if the URL is valid:
+    if not url.startswith('http'):
+        return "Error: Invalid URL"
+
     try:
         response = requests.get(url, headers=cfg.user_agent_header)
     except requests.exceptions.RequestException as e: