diff --git a/src/goose/synopsis/toolkit.py b/src/goose/synopsis/toolkit.py index 2b694d8e..8424987c 100644 --- a/src/goose/synopsis/toolkit.py +++ b/src/goose/synopsis/toolkit.py @@ -1,10 +1,13 @@ # janky global state for now, think about it +import re import subprocess import os from pathlib import Path +import tempfile from typing import Dict from exchange import Message +import httpx from goose.synopsis.system import system from goose.toolkit.base import Toolkit, tool from goose.toolkit.utils import RULEPREFIX, RULESTYLE, get_language @@ -242,3 +245,36 @@ class SynopsisDeveloper(Toolkit): self.logshell(f"cd {path}") system.cwd = str(patho) return path + + @tool + def fetch_web_content(self, url: str) -> str: + """ + Fetch content from a URL using httpx. + + Args: + url (str): url of the site to visit. + Returns: + (dict): A dictionary with two keys: + - 'html_file_path' (str): Path to a html file which has the content of the page. It will be very large so use rg to search it or head in chunks. Will contain meta data and links and markup. + - 'text_file_path' (str): Path to a plain text file which has the some of the content of the page. It will be large so use rg to search it or head in chunks. If content isn't there, try the html variant. + """ # noqa + friendly_name = re.sub(r"[^a-zA-Z0-9]", "_", url)[:50] # Limit length to prevent filenames from being too long + + try: + result = httpx.get(url, follow_redirects=True).text + with tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=f"_{friendly_name}.html") as tmp_file: + tmp_file.write(result) + tmp_text_file_path = tmp_file.name.replace(".html", ".txt") + plain_text = re.sub( + r"