From 114fc32d5f62a498be53f3cffb2e0427fe44d1af Mon Sep 17 00:00:00 2001
From: Torantulino
Date: Thu, 30 Mar 2023 10:10:52 +0100
Subject: [PATCH] Adds hyperlink extraction from webpage + accompanying
 command.

---
 AutonomousAI/browse.py       | 34 ++++++++++++++++++++++++++++++++++
 AutonomousAI/commands.py     |  6 ++++++
 AutonomousAI/data/prompt.txt |  1 +
 3 files changed, 41 insertions(+)

diff --git a/AutonomousAI/browse.py b/AutonomousAI/browse.py
index 4cc45e79..61dbd86c 100644
--- a/AutonomousAI/browse.py
+++ b/AutonomousAI/browse.py
@@ -24,6 +24,40 @@ def scrape_text(url):
 
     return text
 
+def extract_hyperlinks(soup):
+    hyperlinks = []
+    for link in soup.find_all('a', href=True):
+        hyperlinks.append((link.text, link['href']))
+    return hyperlinks
+
+def format_hyperlinks(hyperlinks):
+    formatted_links = []
+    for link_text, link_url in hyperlinks:
+        formatted_links.append(f"{link_text} ({link_url})")
+    return '\n'.join(formatted_links)
+
+def scrape_links(url):
+    response = requests.get(url)
+
+    # Check if the response contains an HTTP error
+    if response.status_code >= 400:
+        return "error"
+
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    for script in soup(["script", "style"]):
+        script.extract()
+
+    hyperlinks = extract_hyperlinks(soup)
+
+    text = soup.get_text()
+    lines = (line.strip() for line in text.splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    text = '\n'.join(chunk for chunk in chunks if chunk)
+    text = format_hyperlinks(hyperlinks)
+
+    return text
+
 def split_text(text, max_length=8192):
     paragraphs = text.split("\n")
     current_length = 0
diff --git a/AutonomousAI/commands.py b/AutonomousAI/commands.py
index c427df87..bfbc8d07 100644
--- a/AutonomousAI/commands.py
+++ b/AutonomousAI/commands.py
@@ -50,6 +50,8 @@ def execute_command(command_name, arguments):
         return register_account(arguments["username"], arguments["website"])
     elif command_name == "get_text_summary":
         return get_text_summary(arguments["url"])
+    elif command_name == "get_hyperlinks":
+        return get_hyperlinks(arguments["url"])
     elif command_name == "write_to_file":
         return write_to_file(arguments["file"], arguments["content"])
     elif command_name == "task_complete":
@@ -77,6 +79,10 @@ def get_text_summary(url):
     summary = browse.summarize_text(text)
     return """ "Result" : """ + summary
 
+def get_hyperlinks(url):
+    text = browse.scrape_links(url)
+    return text
+
 def check_news(source):
     print("Checking news from BBC world instead of " + source)
     _text= get_text_summary("https://www.bbc.com/news/world")
diff --git a/AutonomousAI/data/prompt.txt b/AutonomousAI/data/prompt.txt
index 8e5a5883..03b491fe 100644
--- a/AutonomousAI/data/prompt.txt
+++ b/AutonomousAI/data/prompt.txt
@@ -17,6 +17,7 @@ COMMANDS:
 10. Delete GPT Agent: "delete_agent", args: "key": ""
 9. Navigate & Perform: "navigate_website", args: "action": "click_button/input_text/register_account", "text/username": "/"
 11. Get Text Summary: "get_text_summary", args: "url": ""
+12. Get Outgoing Links: "get_hyperlinks", args: "url": ""
 13. Write to file: "write_to_file", args: "file": "", "text": ""
 14. Task Complete (Shutdown): "task_complete", args: "reason": ""