Use playwright instead of requests for browse

2025-12-21 07:54:21 +01:00 · 2023-04-03 14:05:32 -05:00
parent 439a7ffe7d
commit ac7fefe96e
6 changed files with 172 additions and 8 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ scripts/__pycache__/keys.cpython-310.pyc
 package-lock.json
 *.pyc
 scripts/auto_gpt_workspace/*
+auto_gpt_workspace/*
 *.mpeg
 .env
 last_run_ai_settings.yaml
--- a/requirements-new.txt
+++ b/requirements-new.txt
@@ -0,0 +1,13 @@
+beautifulsoup4==4.12.0
+colorama==0.4.6
+docker_py==1.10.6
+googlesearch_python==1.1.0
+numpy==1.24.2
+openai==0.27.2
+playsound==1.3.0
+playwright==1.32.1
+python-dotenv==1.0.0
+PyYAML==6.0
+requests==2.28.2
+scipy==1.10.1
+tiktoken==0.3.3
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 beautifulsoup4
 colorama==0.4.6
-dirtyjson==1.0.
+# dirtyjson==1.0.
 openai==0.27.2
 playsound==1.3.0
 python-dotenv==1.0.0
@@ -9,5 +9,5 @@ readability-lxml==0.8.1
 requests
 tiktoken==0.3.3
 docker
-# googlesearch-python
+googlesearch_python==1.1.0
 # Googlesearch python seems to be a bit cursed, anyone good at fixing thigns like this?
--- a/scripts/browse_playwright.py
+++ b/scripts/browse_playwright.py
@@ -0,0 +1,150 @@
+from playwright.sync_api import sync_playwright
+from bs4 import BeautifulSoup
+from config import Config
+from llm_utils import create_chat_completion
+
+cfg = Config()  # module-level settings object; summarize_text reads cfg.fast_llm_model
+
+def scrape_text(url):
+    """Load `url` in Playwright's Chromium and return the page's text.
+
+    Script and style elements are removed before the text is extracted. If
+    anything raises, the string "Error: <message>" is returned instead.
+    """
+    with sync_playwright() as playwright:
+        chromium = playwright.chromium.launch()
+        tab = chromium.new_page()
+        try:
+            tab.goto(url)
+            document = BeautifulSoup(tab.content(), "html.parser")
+            for tag in document(["script", "style"]):
+                tag.extract()
+
+            # Trim each line, split it on double spaces, drop empty pieces.
+            pieces = []
+            for raw_line in document.get_text().splitlines():
+                pieces.extend(part.strip() for part in raw_line.strip().split("  "))
+            result = '\n'.join(piece for piece in pieces if piece)
+        except Exception as err:
+            result = "Error: " + str(err)
+        finally:
+            chromium.close()
+    return result
+
+
+def extract_hyperlinks(soup):
+    """Return a (link text, href) tuple for each <a> element found with href=True."""
+    anchors = soup.find_all('a', href=True)
+    # Indexing ['href'] is safe because the search above filters on href.
+    return [(anchor.text, anchor['href']) for anchor in anchors]
+
+
+def format_hyperlinks(hyperlinks):
+    """Render each (link_text, link_url) pair as the string "link_text (link_url)"."""
+    # Unpacking in the loop target means every entry must have two items.
+    rendered = [f"{link_text} ({link_url})" for link_text, link_url in hyperlinks]
+    return rendered
+
+
+def scrape_links(url):
+    """Load `url` in Playwright's Chromium and list the links on the page.
+
+    Returns a list of "text (href)" strings built by extract_hyperlinks and
+    format_hyperlinks, or the string "Error: <message>" if anything raises.
+    """
+    with sync_playwright() as playwright:
+        chromium = playwright.chromium.launch()
+        tab = chromium.new_page()
+        try:
+            tab.goto(url)
+            document = BeautifulSoup(tab.content(), "html.parser")
+            # Detach script/style elements before collecting anchors.
+            for tag in document(["script", "style"]):
+                tag.extract()
+            result = format_hyperlinks(extract_hyperlinks(document))
+        except Exception as err:
+            result = "Error: " + str(err)
+        finally:
+            # Runs on success and failure alike so the browser is closed.
+            chromium.close()
+
+    return result
+
+# The helpers below (text splitting and summarization) are unchanged by the switch to Playwright.
+
+def split_text(text, max_length=8192):
+    """Yield newline-joined runs of lines, each about `max_length` chars at most.
+
+    A line longer than `max_length` is yielded whole, and no empty chunk is
+    emitted ahead of it (the previous version yielded "" in that case).
+    """
+    current_chunk, current_length = [], 0
+    for paragraph in text.split("\n"):
+        cost = len(paragraph) + 1  # +1 for the newline that joins lines
+        if current_chunk and current_length + cost > max_length:
+            yield "\n".join(current_chunk)
+            current_chunk, current_length = [], 0
+        current_chunk.append(paragraph)
+        current_length += cost
+    if current_chunk:
+        yield "\n".join(current_chunk)
+
+
+def summarize_text(text, is_website=True):
+    """Summarize `text` chunk by chunk, then summarize the joined summaries.
+
+    Args:
+        text: The text to summarize.
+        is_website: Selects the website-specific prompt when true, otherwise
+            the generic text prompt.
+
+    Returns:
+        The result of the final create_chat_completion call, or the string
+        "Error: No text to summarize" when `text` is empty.
+    """
+    if text == "":
+        return "Error: No text to summarize"
+
+    # One prompt prefix serves every request below; the final non-website
+    # request previously used a copy that misspelled "information".
+    if is_website:
+        prompt = (
+            "Please summarize the following website text, do not describe "
+            "the general website, but instead concisely extract the specific "
+            "information this subpage contains.: "
+        )
+    else:
+        prompt = (
+            "Please summarize the following text, focusing on extracting "
+            "concise and specific information: "
+        )
+
+    def request_summary(content):
+        # Every request uses cfg.fast_llm_model with max_tokens=300.
+        messages = [
+            {
+                "role": "user",
+                "content": prompt + content,
+            },
+        ]
+        return create_chat_completion(
+            model=cfg.fast_llm_model,
+            messages=messages,
+            max_tokens=300,
+        )
+
+    print("Text length: " + str(len(text)) + " characters")
+    summaries = []
+    chunks = list(split_text(text))
+
+    for i, chunk in enumerate(chunks):
+        print("Summarizing chunk " + str(i + 1) + " / " + str(len(chunks)))
+        summary = request_summary(chunk)
+        summaries.append(summary)
+    print("Summarized " + str(len(chunks)) + " chunks.")
+
+    # Second pass: condense the per-chunk summaries into one result.
+    combined_summary = "\n".join(summaries)
+    final_summary = request_summary(combined_summary)
+
+    return final_summary
--- a/scripts/commands.py
+++ b/scripts/commands.py
@@ -1,4 +1,4 @@
-import browse
+import browse_playwright as browse
 import json
 import memory as mem
 import datetime
--- a/scripts/json_parser.py
+++ b/scripts/json_parser.py
@@ -1,4 +1,4 @@
-import dirtyjson
+import json
 from call_ai_function import call_ai_function
 from config import Config
 cfg = Config()
@@ -24,7 +24,7 @@ def fix_and_parse_json(json_str: str, try_to_fix_with_gpt: bool = True):
    """

    try:
-        return dirtyjson.loads(json_str)
+        return json.loads(json_str)
    except Exception as e:
        # Let's do something manually - sometimes GPT responds with something BEFORE the braces:
        # "I'm sorry, I don't understand. Please try again."{"text": "I'm sorry, I don't understand. Please try again.", "confidence": 0.0}
@@ -34,14 +34,14 @@ def fix_and_parse_json(json_str: str, try_to_fix_with_gpt: bool = True):
          json_str = json_str[brace_index:]
          last_brace_index = json_str.rindex("}")
          json_str = json_str[:last_brace_index+1]
-          return dirtyjson.loads(json_str)
+          return json.loads(json_str)
        except Exception as e:
          if try_to_fix_with_gpt:
            print(f"Warning: Failed to parse AI output, attempting to fix.\n If you see this warning frequently, it's likely that your prompt is confusing the AI. Try changing it up slightly.")
            # Now try to fix this up using the ai_functions
            ai_fixed_json = fix_json(json_str, json_schema, False)
            if ai_fixed_json != "failed":
-              return dirtyjson.loads(ai_fixed_json)
+              return json.loads(ai_fixed_json)
            else:
              print(f"Failed to fix ai output, telling the AI.") # This allows the AI to react to the error message, which usually results in it correcting its ways.
              return json_str
@@ -68,7 +68,7 @@ def fix_json(json_str: str, schema: str, debug=False) -> str:
        print(f"Fixed JSON: {result_string}")
        print("----------- END OF FIX ATTEMPT ----------------")
    try:
-        return dirtyjson.loads(result_string)
+        return json.loads(result_string)
    except:
        # Get the call stack:
        # import traceback