mirror of
https://github.com/aljazceru/Auto-GPT.git
synced 2026-01-29 19:04:28 +01:00
Adds hyperlink extraction from webpage
+ accompanying command.
This commit is contained in:
@@ -24,6 +24,40 @@ def scrape_text(url):
|
||||
|
||||
return text
|
||||
|
||||
def extract_hyperlinks(soup):
    """Collect the (text, href) pair of every anchor tag in *soup*.

    Args:
        soup: A parsed document exposing BeautifulSoup's ``find_all`` API
            (presumably a ``BeautifulSoup`` instance — confirm at caller).

    Returns:
        A list of ``(link_text, href)`` tuples, one per ``<a>`` tag that
        carries an ``href`` attribute; tags without ``href`` are skipped.
    """
    return [(anchor.text, anchor['href'])
            for anchor in soup.find_all('a', href=True)]
|
||||
|
||||
def format_hyperlinks(hyperlinks):
    """Render hyperlink pairs as human-readable lines.

    Args:
        hyperlinks: An iterable of ``(link_text, link_url)`` pairs.

    Returns:
        A newline-joined string with one ``"text (url)"`` entry per pair;
        the empty string when *hyperlinks* is empty.
    """
    return '\n'.join(f"{text} ({url})" for text, url in hyperlinks)
|
||||
|
||||
def scrape_links(url):
    """Fetch *url* and return its outgoing hyperlinks as formatted text.

    Args:
        url: The URL of the webpage to scrape.

    Returns:
        A newline-joined string of ``"link text (url)"`` entries, or the
        literal string ``"error"`` when the HTTP response status is >= 400.
    """
    response = requests.get(url)

    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        return "error"

    soup = BeautifulSoup(response.text, "html.parser")

    # Strip <script>/<style> elements so extracted link text is clean.
    for script in soup(["script", "style"]):
        script.extract()

    hyperlinks = extract_hyperlinks(soup)

    # Fix: the original built a whitespace-stripped version of the page text
    # here (get_text / splitlines / join) and then immediately overwrote it
    # with the formatted hyperlinks — that computation was dead code doing
    # wasted work. Return the formatted links directly.
    return format_hyperlinks(hyperlinks)
|
||||
|
||||
def split_text(text, max_length=8192):
|
||||
paragraphs = text.split("\n")
|
||||
current_length = 0
|
||||
|
||||
@@ -50,6 +50,8 @@ def execute_command(command_name, arguments):
|
||||
return register_account(arguments["username"], arguments["website"])
|
||||
elif command_name == "get_text_summary":
|
||||
return get_text_summary(arguments["url"])
|
||||
elif command_name == "get_hyperlinks":
|
||||
return get_hyperlinks(arguments["url"])
|
||||
elif command_name == "write_to_file":
|
||||
return write_to_file(arguments["file"], arguments["content"])
|
||||
elif command_name == "task_complete":
|
||||
@@ -77,6 +79,10 @@ def get_text_summary(url):
|
||||
summary = browse.summarize_text(text)
|
||||
return """ "Result" : """ + summary
|
||||
|
||||
def get_hyperlinks(url):
    """Return the outgoing links of *url*, formatted by browse.scrape_links.

    Args:
        url: The URL whose hyperlinks should be listed.

    Returns:
        Whatever ``browse.scrape_links`` yields for *url* — formatted link
        text on success, or its error sentinel on an HTTP error.
    """
    return browse.scrape_links(url)
|
||||
|
||||
def check_news(source):
|
||||
print("Checking news from BBC world instead of " + source)
|
||||
_text= get_text_summary("https://www.bbc.com/news/world")
|
||||
|
||||
@@ -17,6 +17,7 @@ COMMANDS:
|
||||
10. Delete GPT Agent: "delete_agent", args: "key": "<key>"
|
||||
9. Navigate & Perform: "navigate_website", args: "action": "click_button/input_text/register_account", "text/username": "<text>/<username>"
|
||||
11. Get Text Summary: "get_text_summary", args: "url": "<url>"
|
||||
12. Get Outgoing Links: "get_hyperlinks", args: "url": "<url>"
|
||||
13. Write to file: "write_to_file", args: "file": "<file>", "text": "<text>"
|
||||
14. Task Complete (Shutdown): "task_complete", args: "reason": "<reason>"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user