diff --git a/.env.template b/.env.template
index 3279d2f1..32247d18 100644
--- a/.env.template
+++ b/.env.template
@@ -93,6 +93,13 @@ IMAGE_PROVIDER=dalle
 # HUGGINGFACE_API_TOKEN - HuggingFace API token (Example: my-huggingface-api-token)
 HUGGINGFACE_API_TOKEN=your-huggingface-api-token
 
+################################################################################
+### AUDIO TO TEXT PROVIDER
+################################################################################
+
+### HUGGINGFACE
+HUGGINGFACE_AUDIO_TO_TEXT_MODEL=facebook/wav2vec2-base-960h
+
 ################################################################################
 ### GIT Provider for repository actions
 ################################################################################
diff --git a/autogpt/app.py b/autogpt/app.py
index 40a71c8e..e7b16adc 100644
--- a/autogpt/app.py
+++ b/autogpt/app.py
@@ -8,6 +8,7 @@ from autogpt.commands.improve_code import improve_code
 from autogpt.commands.write_tests import write_tests
 from autogpt.config import Config
 from autogpt.commands.image_gen import generate_image
+from autogpt.commands.audio_text import read_audio_from_file
 from autogpt.commands.web_requests import scrape_links, scrape_text
 from autogpt.commands.execute_code import execute_python_file, execute_shell
 from autogpt.commands.file_operations import (
@@ -180,6 +181,8 @@ def execute_command(command_name: str, arguments):
                     " shell commands, EXECUTE_LOCAL_COMMANDS must be set to 'True' "
                     "in your config. Do not attempt to bypass the restriction."
                 )
+        elif command_name == "read_audio_from_file":
+            return read_audio_from_file(arguments["file"])
         elif command_name == "generate_image":
             return generate_image(arguments["prompt"])
         elif command_name == "send_tweet":
diff --git a/autogpt/commands/audio_text.py b/autogpt/commands/audio_text.py
new file mode 100644
--- /dev/null
+++ b/autogpt/commands/audio_text.py
@@ -0,0 +1,77 @@
+"""Audio-to-text command backed by the Hugging Face Inference API."""
+import json
+
+import requests
+
+from autogpt.config import Config
+from autogpt.commands.file_operations import safe_join
+
+cfg = Config()
+
+working_directory = "auto_gpt_workspace"
+
+# Seconds to wait for the inference API before giving up on a request.
+REQUEST_TIMEOUT = 60
+
+
+def read_audio_from_file(audio_path):
+    """Transcribe an audio file from the workspace directory.
+
+    Args:
+        audio_path (str): File path, joined onto the workspace by safe_join.
+
+    Returns:
+        str: The transcription, prefixed with "The audio says: ".
+    """
+    audio_path = safe_join(working_directory, audio_path)
+    with open(audio_path, "rb") as audio_file:
+        audio = audio_file.read()
+    return read_audio(audio)
+
+
+def read_audio(audio):
+    """Send raw audio bytes to the configured model and return the transcription.
+
+    Args:
+        audio (bytes): Raw contents of the audio file.
+
+    Returns:
+        str: The transcription, prefixed with "The audio says: ".
+
+    Raises:
+        ValueError: If the API token or model name is not configured, or if
+            the response does not contain a transcription.
+    """
+    api_token = cfg.huggingface_api_token
+    if api_token is None:
+        raise ValueError(
+            "You need to set your Hugging Face API token in the config file."
+        )
+
+    model = cfg.huggingface_audio_to_text_model
+    if not model:
+        raise ValueError(
+            "You need to set HUGGINGFACE_AUDIO_TO_TEXT_MODEL in the config file."
+        )
+
+    response = requests.post(
+        f"https://api-inference.huggingface.co/models/{model}",
+        headers={"Authorization": f"Bearer {api_token}"},
+        data=audio,
+        timeout=REQUEST_TIMEOUT,
+    )
+
+    try:
+        payload = json.loads(response.content.decode("utf-8"))
+    except ValueError as err:
+        # Covers both invalid UTF-8 and invalid JSON (e.g. an HTML error page).
+        raise ValueError(
+            f"Unexpected Hugging Face response (HTTP {response.status_code})."
+        ) from err
+
+    # Failed requests (for instance while the model is still loading) come back
+    # as an {"error": ...} object instead of {"text": ...}.
+    if not isinstance(payload, dict) or "text" not in payload:
+        raise ValueError(f"Hugging Face could not transcribe the audio: {payload}")
+
+    return "The audio says: " + payload["text"]
diff --git a/autogpt/config/config.py b/autogpt/config/config.py
index c47b19af..b03f38eb 100644
--- a/autogpt/config/config.py
+++ b/autogpt/config/config.py
@@ -72,6 +72,9 @@ class Config(metaclass=Singleton):
 
         self.image_provider = os.getenv("IMAGE_PROVIDER")
         self.huggingface_api_token = os.getenv("HUGGINGFACE_API_TOKEN")
+        self.huggingface_audio_to_text_model = os.getenv(
+            "HUGGINGFACE_AUDIO_TO_TEXT_MODEL"
+        )
 
         # User agent headers to use when browsing web
         # Some websites might just completely deny request with an error code if
diff --git a/autogpt/prompt.py b/autogpt/prompt.py
index a760bd72..9f79d420 100644
--- a/autogpt/prompt.py
+++ b/autogpt/prompt.py
@@ -82,6 +82,7 @@ def get_prompt() -> str:
         ),
         ("Execute Python File", "execute_python_file", {"file": ""}),
         ("Generate Image", "generate_image", {"prompt": ""}),
+        ("Convert Audio to text", "read_audio_from_file", {"file": ""}),
         ("Send Tweet", "send_tweet", {"text": ""}),
     ]
 