mirror of
https://github.com/aljazceru/Auto-GPT.git
synced 2025-12-21 16:04:21 +01:00
Use playwright instead of requests for browse
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -5,6 +5,7 @@ scripts/__pycache__/keys.cpython-310.pyc
|
|||||||
package-lock.json
|
package-lock.json
|
||||||
*.pyc
|
*.pyc
|
||||||
scripts/auto_gpt_workspace/*
|
scripts/auto_gpt_workspace/*
|
||||||
|
auto_gpt_workspace/*
|
||||||
*.mpeg
|
*.mpeg
|
||||||
.env
|
.env
|
||||||
last_run_ai_settings.yaml
|
last_run_ai_settings.yaml
|
||||||
13
requirements-new.txt
Normal file
13
requirements-new.txt
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
beautifulsoup4==4.12.0
|
||||||
|
colorama==0.4.6
|
||||||
|
docker_py==1.10.6
|
||||||
|
googlesearch_python==1.1.0
|
||||||
|
numpy==1.24.2
|
||||||
|
openai==0.27.2
|
||||||
|
playsound==1.3.0
|
||||||
|
playwright==1.32.1
|
||||||
|
python-dotenv==1.0.0
|
||||||
|
PyYAML==6.0
|
||||||
|
requests==2.28.2
|
||||||
|
scipy==1.10.1
|
||||||
|
tiktoken==0.3.3
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
beautifulsoup4
|
beautifulsoup4
|
||||||
colorama==0.4.6
|
colorama==0.4.6
|
||||||
dirtyjson==1.0.
|
# dirtyjson==1.0.
|
||||||
openai==0.27.2
|
openai==0.27.2
|
||||||
playsound==1.3.0
|
playsound==1.3.0
|
||||||
python-dotenv==1.0.0
|
python-dotenv==1.0.0
|
||||||
@@ -9,5 +9,5 @@ readability-lxml==0.8.1
|
|||||||
requests
|
requests
|
||||||
tiktoken==0.3.3
|
tiktoken==0.3.3
|
||||||
docker
|
docker
|
||||||
# googlesearch-python
|
googlesearch_python==1.1.0
|
||||||
# Googlesearch python seems to be a bit cursed, anyone good at fixing things like this?
|
# Googlesearch python seems to be a bit cursed, anyone good at fixing things like this?
|
||||||
150
scripts/browse_playwright.py
Normal file
150
scripts/browse_playwright.py
Normal file
@@ -0,0 +1,150 @@
|
|||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from config import Config
|
||||||
|
from llm_utils import create_chat_completion
|
||||||
|
|
||||||
|
# Module-level configuration; summarize_text reads cfg.fast_llm_model from it.
cfg = Config()
|
||||||
|
|
||||||
|
def scrape_text(url):
    """Fetch *url* with headless Chromium and return its visible text.

    Script and style elements are stripped before text extraction, and
    blank lines are collapsed. Any navigation or parsing failure is
    returned as an "Error: ..." string instead of raising, matching the
    module's best-effort convention.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()

        try:
            page.goto(url)
            html_content = page.content()
            soup = BeautifulSoup(html_content, "html.parser")

            # Drop non-visible content so get_text() returns only prose.
            for script in soup(["script", "style"]):
                script.extract()

            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            # Split on DOUBLE spaces (layout gaps) rather than single
            # spaces: a single-space split would put every word on its
            # own output line. (Upstream uses "  " here.)
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = '\n'.join(chunk for chunk in chunks if chunk)

        except Exception as e:
            # Best-effort: surface the failure to the caller as text.
            text = "Error: " + str(e)

        finally:
            # Always release the browser, even on error.
            browser.close()

    return text
|
||||||
|
|
||||||
|
|
||||||
|
def extract_hyperlinks(soup):
    """Return a list of (link_text, href) tuples for every anchor in *soup*
    that carries an href attribute."""
    return [(anchor.text, anchor['href'])
            for anchor in soup.find_all('a', href=True)]
|
||||||
|
|
||||||
|
|
||||||
|
def format_hyperlinks(hyperlinks):
    """Render (text, url) pairs as human-readable "text (url)" strings."""
    return [f"{link_text} ({link_url})" for link_text, link_url in hyperlinks]
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_links(url):
    """Fetch *url* with headless Chromium and return its hyperlinks as
    "text (url)" strings, or a single "Error: ..." string on failure."""
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()

        try:
            page.goto(url)
            html_content = page.content()
            soup = BeautifulSoup(html_content, "html.parser")

            # Strip script/style nodes so anchor text is clean.
            for script in soup(["script", "style"]):
                script.extract()

            formatted_links = format_hyperlinks(extract_hyperlinks(soup))

        except Exception as e:
            # Best-effort: report the failure instead of raising.
            formatted_links = "Error: " + str(e)

        finally:
            # Always release the browser, even on error.
            browser.close()

    return formatted_links
|
||||||
|
|
||||||
|
# The rest of the code remains unchanged.
|
||||||
|
|
||||||
|
def split_text(text, max_length=8192):
    """Yield newline-joined chunks of *text*, each at most ~max_length chars.

    Chunks break only at paragraph ('\n') boundaries; each paragraph is
    budgeted as len(paragraph) + 1 to account for the joining newline.
    """
    chunk = []
    used = 0

    for paragraph in text.split("\n"):
        cost = len(paragraph) + 1  # +1 for the newline re-inserted by join
        if used + cost <= max_length:
            chunk.append(paragraph)
            used += cost
        else:
            # Budget exceeded: flush what we have and start a new chunk.
            yield "\n".join(chunk)
            chunk = [paragraph]
            used = cost

    if chunk:
        yield "\n".join(chunk)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_summary_messages(content, is_website):
    """Build the single-user-message chat prompt asking for a summary of *content*."""
    if is_website:
        prompt = "Please summarize the following website text, do not describe the general website, but instead concisely extract the specific information this subpage contains.: "
    else:
        prompt = "Please summarize the following text, focusing on extracting concise and specific information: "
    return [{"role": "user", "content": prompt + content}]


def summarize_text(text, is_website=True):
    """Summarize *text* with the fast LLM, chunking long input.

    Each chunk produced by split_text() is summarized independently, then
    the concatenated chunk summaries are summarized once more and that
    final summary is returned. Empty input returns an error string
    instead of raising.
    """
    if text == "":
        return "Error: No text to summarize"

    print("Text length: " + str(len(text)) + " characters")
    summaries = []
    chunks = list(split_text(text))

    for i, chunk in enumerate(chunks):
        print("Summarizing chunk " + str(i + 1) + " / " + str(len(chunks)))
        summary = create_chat_completion(
            model=cfg.fast_llm_model,
            messages=_build_summary_messages(chunk, is_website),
            max_tokens=300,
        )
        summaries.append(summary)
    print("Summarized " + str(len(chunks)) + " chunks.")

    combined_summary = "\n".join(summaries)

    # Summarize the combined summary. Prompts are built by the shared
    # helper so both passes use identical wording (the final non-website
    # prompt previously had a "infomation" typo, now fixed).
    final_summary = create_chat_completion(
        model=cfg.fast_llm_model,
        messages=_build_summary_messages(combined_summary, is_website),
        max_tokens=300,
    )

    return final_summary
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
import browse
|
import browse_playwright as browse
|
||||||
import json
|
import json
|
||||||
import memory as mem
|
import memory as mem
|
||||||
import datetime
|
import datetime
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
import dirtyjson
|
import json
|
||||||
from call_ai_function import call_ai_function
|
from call_ai_function import call_ai_function
|
||||||
from config import Config
|
from config import Config
|
||||||
cfg = Config()
|
cfg = Config()
|
||||||
@@ -24,7 +24,7 @@ def fix_and_parse_json(json_str: str, try_to_fix_with_gpt: bool = True):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return dirtyjson.loads(json_str)
|
return json.loads(json_str)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Let's do something manually - sometimes GPT responds with something BEFORE the braces:
|
# Let's do something manually - sometimes GPT responds with something BEFORE the braces:
|
||||||
# "I'm sorry, I don't understand. Please try again."{"text": "I'm sorry, I don't understand. Please try again.", "confidence": 0.0}
|
# "I'm sorry, I don't understand. Please try again."{"text": "I'm sorry, I don't understand. Please try again.", "confidence": 0.0}
|
||||||
@@ -34,14 +34,14 @@ def fix_and_parse_json(json_str: str, try_to_fix_with_gpt: bool = True):
|
|||||||
json_str = json_str[brace_index:]
|
json_str = json_str[brace_index:]
|
||||||
last_brace_index = json_str.rindex("}")
|
last_brace_index = json_str.rindex("}")
|
||||||
json_str = json_str[:last_brace_index+1]
|
json_str = json_str[:last_brace_index+1]
|
||||||
return dirtyjson.loads(json_str)
|
return json.loads(json_str)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if try_to_fix_with_gpt:
|
if try_to_fix_with_gpt:
|
||||||
print(f"Warning: Failed to parse AI output, attempting to fix.\n If you see this warning frequently, it's likely that your prompt is confusing the AI. Try changing it up slightly.")
|
print(f"Warning: Failed to parse AI output, attempting to fix.\n If you see this warning frequently, it's likely that your prompt is confusing the AI. Try changing it up slightly.")
|
||||||
# Now try to fix this up using the ai_functions
|
# Now try to fix this up using the ai_functions
|
||||||
ai_fixed_json = fix_json(json_str, json_schema, False)
|
ai_fixed_json = fix_json(json_str, json_schema, False)
|
||||||
if ai_fixed_json != "failed":
|
if ai_fixed_json != "failed":
|
||||||
return dirtyjson.loads(ai_fixed_json)
|
return json.loads(ai_fixed_json)
|
||||||
else:
|
else:
|
||||||
print(f"Failed to fix ai output, telling the AI.") # This allows the AI to react to the error message, which usually results in it correcting its ways.
|
print(f"Failed to fix ai output, telling the AI.") # This allows the AI to react to the error message, which usually results in it correcting its ways.
|
||||||
return json_str
|
return json_str
|
||||||
@@ -68,7 +68,7 @@ def fix_json(json_str: str, schema: str, debug=False) -> str:
|
|||||||
print(f"Fixed JSON: {result_string}")
|
print(f"Fixed JSON: {result_string}")
|
||||||
print("----------- END OF FIX ATTEMPT ----------------")
|
print("----------- END OF FIX ATTEMPT ----------------")
|
||||||
try:
|
try:
|
||||||
return dirtyjson.loads(result_string)
|
return json.loads(result_string)
|
||||||
except:
|
except:
|
||||||
# Get the call stack:
|
# Get the call stack:
|
||||||
# import traceback
|
# import traceback
|
||||||
|
|||||||
Reference in New Issue
Block a user