Adds hyperlink extraction from webpage

+ accompanying command.
This commit is contained in:
Torantulino
2023-03-30 10:10:52 +01:00
parent af17977c72
commit 114fc32d5f
3 changed files with 41 additions and 0 deletions

View File

@@ -24,6 +24,40 @@ def scrape_text(url):
return text
def extract_hyperlinks(soup):
    """Collect the text and target of every anchor in a parsed document.

    Args:
        soup: Parsed document exposing ``find_all`` (a BeautifulSoup object
            when called from ``scrape_links``).

    Returns:
        A list of ``(link_text, href)`` tuples, one for each element that
        ``soup.find_all('a', href=True)`` yields, in the same order.
    """
    # The href=True filter is what makes the link['href'] lookup safe:
    # only anchors carrying an href attribute are returned by find_all.
    return [(link.text, link['href']) for link in soup.find_all('a', href=True)]
def format_hyperlinks(hyperlinks):
    """Render link pairs as newline-separated ``text (url)`` lines.

    Args:
        hyperlinks: Iterable of ``(link_text, link_url)`` pairs.

    Returns:
        A single string with one ``"<link_text> (<link_url>)"`` entry per
        pair, joined by newlines; an empty string when there are no pairs.
    """
    return '\n'.join(
        f"{link_text} ({link_url})" for link_text, link_url in hyperlinks
    )
def scrape_links(url):
    """Fetch a web page and return its hyperlinks as formatted text.

    Args:
        url: Address of the page to download.

    Returns:
        The string ``"error"`` when the response status code is 400 or
        higher; otherwise the result of ``format_hyperlinks`` applied to the
        anchors that ``extract_hyperlinks`` finds in the parsed page.
    """
    response = requests.get(url)

    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        return "error"

    soup = BeautifulSoup(response.text, "html.parser")

    # Remove script and style elements from the tree before collecting links.
    for script in soup(["script", "style"]):
        script.extract()

    # Only the link listing is returned, so no page-text clean-up is needed.
    return format_hyperlinks(extract_hyperlinks(soup))
def split_text(text, max_length=8192):
paragraphs = text.split("\n")
current_length = 0

View File

@@ -50,6 +50,8 @@ def execute_command(command_name, arguments):
return register_account(arguments["username"], arguments["website"])
elif command_name == "get_text_summary":
return get_text_summary(arguments["url"])
elif command_name == "get_hyperlinks":
return get_hyperlinks(arguments["url"])
elif command_name == "write_to_file":
return write_to_file(arguments["file"], arguments["content"])
elif command_name == "task_complete":
@@ -77,6 +79,10 @@ def get_text_summary(url):
summary = browse.summarize_text(text)
return """ "Result" : """ + summary
def get_hyperlinks(url):
    """Return whatever ``browse.scrape_links`` produces for ``url``."""
    return browse.scrape_links(url)
def check_news(source):
print("Checking news from BBC world instead of " + source)
_text= get_text_summary("https://www.bbc.com/news/world")

View File

@@ -17,6 +17,7 @@ COMMANDS:
10. Delete GPT Agent: "delete_agent", args: "key": "<key>"
9. Navigate & Perform: "navigate_website", args: "action": "click_button/input_text/register_account", "text/username": "<text>/<username>"
11. Get Text Summary: "get_text_summary", args: "url": "<url>"
12. Get Outgoing Links: "get_hyperlinks", args: "url": "<url>"
13. Write to file: "write_to_file", args: "file": "<file>", "content": "<content>"
14. Task Complete (Shutdown): "task_complete", args: "reason": "<reason>"