Adds hyperlink extraction from a webpage

Also adds the accompanying get_hyperlinks command.
2026-01-29 19:04:28 +01:00 · 2023-03-30 10:10:52 +01:00
parent af17977c72
commit 114fc32d5f
3 changed files with 41 additions and 0 deletions
--- a/AutonomousAI/browse.py
+++ b/AutonomousAI/browse.py
@@ -24,6 +24,40 @@ def scrape_text(url):

    return text

+def extract_hyperlinks(soup):
+    """Return a list of (anchor text, href) tuples for every <a> tag in soup that has an href."""
+    anchors = soup.find_all('a', href=True)
+    pairs = [(anchor.text, anchor['href']) for anchor in anchors]
+    return pairs
+
+def format_hyperlinks(hyperlinks):
+    """Render (text, url) pairs as newline-separated "text (url)" lines."""
+    return '\n'.join(
+        f"{label} ({target})" for label, target in hyperlinks
+    )
+
+def scrape_links(url):
+    """Fetch url and return its hyperlinks, one "text (href)" entry per line.
+
+    Returns the literal string "error" when the server answers with an
+    HTTP status code of 400 or above.
+    """
+    response = requests.get(url)
+
+    # Check if the response contains an HTTP error
+    if response.status_code >= 400:
+        return "error"
+
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    # Drop <script>/<style> elements before collecting anchors.
+    for script in soup(["script", "style"]):
+        script.extract()
+
+    # Only the links are returned, so the visible page text is never
+    # extracted or cleaned up here.
+    return format_hyperlinks(extract_hyperlinks(soup))
+
 def split_text(text, max_length=8192):
    paragraphs = text.split("\n")
    current_length = 0
--- a/AutonomousAI/commands.py
+++ b/AutonomousAI/commands.py
@@ -50,6 +50,8 @@ def execute_command(command_name, arguments):
            return register_account(arguments["username"], arguments["website"])
        elif command_name == "get_text_summary":
            return get_text_summary(arguments["url"])
+        elif command_name == "get_hyperlinks":
+            return get_hyperlinks(arguments["url"])
        elif command_name == "write_to_file":
            return write_to_file(arguments["file"], arguments["content"])
        elif command_name == "task_complete":
@@ -77,6 +79,10 @@ def get_text_summary(url):
    summary = browse.summarize_text(text)
    return """ "Result" : """ + summary

+def get_hyperlinks(url):
+    """Return whatever browse.scrape_links produces for url."""
+    return browse.scrape_links(url)
+
 def check_news(source):
    print("Checking news from BBC world instead of " + source)
    _text= get_text_summary("https://www.bbc.com/news/world")
--- a/AutonomousAI/data/prompt.txt
+++ b/AutonomousAI/data/prompt.txt
@@ -17,6 +17,7 @@ COMMANDS:
 10. Delete GPT Agent: "delete_agent", args: "key": "<key>"
 9. Navigate & Perform: "navigate_website", args: "action": "click_button/input_text/register_account", "text/username": "<text>/<username>"
 11. Get Text Summary: "get_text_summary", args: "url": "<url>"
+12. Get Outgoing Links: "get_hyperlinks", args: "url": "<url>"
 13. Write to file: "write_to_file", args: "file": "<file>", "text": "<text>"
 14. Task Complete (Shutdown): "task_complete", args: "reason": "<reason>"