diff --git a/autogpts/autogpt/autogpt/app/main.py b/autogpts/autogpt/autogpt/app/main.py index 036c7cd7..d8daad7b 100644 --- a/autogpts/autogpt/autogpt/app/main.py +++ b/autogpts/autogpt/autogpt/app/main.py @@ -26,12 +26,11 @@ from autogpt.commands import COMMAND_CATEGORIES from autogpt.config import AIConfig, Config, ConfigBuilder, check_openai_api_key from autogpt.llm.api_manager import ApiManager from autogpt.logs.config import configure_chat_plugins, configure_logging -from autogpt.logs.helpers import print_attribute +from autogpt.logs.helpers import print_attribute, speak from autogpt.memory.vector import get_memory from autogpt.models.command_registry import CommandRegistry from autogpt.plugins import scan_plugins from autogpt.prompts.prompt import DEFAULT_TRIGGERING_PROMPT -from autogpt.speech import say_text from autogpt.workspace import Workspace from scripts.install_plugin_deps import install_plugin_dependencies @@ -366,7 +365,7 @@ def update_user( print_assistant_thoughts(ai_config.ai_name, assistant_reply_dict, config) if config.speak_mode: - say_text(f"I want to execute {command_name}", config) + speak(f"I want to execute {command_name}") # First log new-line so user can differentiate sections better in console print() @@ -531,8 +530,6 @@ def print_assistant_thoughts( assistant_reply_json_valid: dict, config: Config, ) -> None: - from autogpt.speech import say_text - logger = logging.getLogger(__name__) assistant_thoughts_reasoning = None @@ -577,7 +574,7 @@ def print_assistant_thoughts( # Speak the assistant's thoughts if assistant_thoughts_speak: if config.speak_mode: - say_text(assistant_thoughts_speak, config) + speak(assistant_thoughts_speak) else: print_attribute("SPEAK", assistant_thoughts_speak, title_color=Fore.YELLOW) diff --git a/autogpts/autogpt/autogpt/logs/config.py b/autogpts/autogpt/autogpt/logs/config.py index f38bde40..dc1e24c4 100644 --- a/autogpts/autogpt/autogpt/logs/config.py +++ b/autogpts/autogpt/autogpt/logs/config.py @@ -27,6 +27,7 @@ DEBUG_LOG_FORMAT = ( " %(title)s%(message)s" ) +SPEECH_OUTPUT_LOGGER = "VOICE" USER_FRIENDLY_OUTPUT_LOGGER = "USER_FRIENDLY_OUTPUT" _chat_plugins: list[AutoGPTPluginTemplate] = [] @@ -96,6 +97,11 @@ def configure_logging(config: Config, log_dir: Path = LOG_DIR) -> None: user_friendly_output_logger.addHandler(stderr) user_friendly_output_logger.propagate = False + speech_output_logger = logging.getLogger(SPEECH_OUTPUT_LOGGER) + speech_output_logger.setLevel(logging.INFO) + speech_output_logger.addHandler(TTSHandler(config)) + speech_output_logger.propagate = False + # JSON logger with better formatting json_logger = logging.getLogger("JSON_LOGGER") json_logger.setLevel(logging.DEBUG) diff --git a/autogpts/autogpt/autogpt/logs/handlers.py b/autogpts/autogpt/autogpt/logs/handlers.py index 48af1d24..4896d99a 100644 --- a/autogpts/autogpt/autogpt/logs/handlers.py +++ b/autogpts/autogpt/autogpt/logs/handlers.py @@ -8,7 +8,7 @@ import time from typing import TYPE_CHECKING from autogpt.logs.utils import remove_color_codes -from autogpt.speech.say import say_text +from autogpt.speech import TextToSpeechProvider if TYPE_CHECKING: from autogpt.config import Config @@ -53,6 +53,7 @@ class TTSHandler(logging.Handler): def __init__(self, config: Config): super().__init__() self.config = config + self.tts_provider = TextToSpeechProvider(config) def format(self, record: logging.LogRecord) -> str: if getattr(record, "title", ""): @@ -67,7 +68,7 @@ class TTSHandler(logging.Handler): return message = self.format(record) - say_text(message, self.config) + self.tts_provider.say(message) class JsonFileHandler(logging.FileHandler): diff --git a/autogpts/autogpt/autogpt/logs/helpers.py b/autogpts/autogpt/autogpt/logs/helpers.py index 482057ec..98b0de8f 100644 --- a/autogpts/autogpt/autogpt/logs/helpers.py +++ b/autogpts/autogpt/autogpt/logs/helpers.py @@ -3,7 +3,7 @@ from typing import Any, Optional from colorama import Fore -from .config import USER_FRIENDLY_OUTPUT_LOGGER, _chat_plugins +from .config import SPEECH_OUTPUT_LOGGER, USER_FRIENDLY_OUTPUT_LOGGER, _chat_plugins def user_friendly_output( @@ -65,3 +65,7 @@ def request_user_double_check(additionalText: Optional[str] = None) -> None: title="DOUBLE CHECK CONFIGURATION", preserve_message_color=True, ) + + +def speak(message: str, level: int = logging.INFO) -> None: + logging.getLogger(SPEECH_OUTPUT_LOGGER).log(level, message) diff --git a/autogpts/autogpt/autogpt/singleton.py b/autogpts/autogpt/autogpt/singleton.py index b3a5af52..46c6256e 100644 --- a/autogpts/autogpt/autogpt/singleton.py +++ b/autogpts/autogpt/autogpt/singleton.py @@ -14,9 +14,3 @@ class Singleton(abc.ABCMeta, type): if cls not in cls._instances: cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) return cls._instances[cls] - - -class AbstractSingleton(abc.ABC, metaclass=Singleton): - """ - Abstract singleton class for ensuring only one instance of a class. - """ diff --git a/autogpts/autogpt/autogpt/speech/__init__.py b/autogpts/autogpt/autogpt/speech/__init__.py index 2ff0d2bf..1b419eb1 100644 --- a/autogpts/autogpt/autogpt/speech/__init__.py +++ b/autogpts/autogpt/autogpt/speech/__init__.py @@ -1,4 +1,4 @@ """This module contains the speech recognition and speech synthesis functions.""" -from autogpt.speech.say import say_text +from autogpt.speech.say import TextToSpeechProvider -__all__ = ["say_text"] +__all__ = ["TextToSpeechProvider"] diff --git a/autogpts/autogpt/autogpt/speech/base.py b/autogpts/autogpt/autogpt/speech/base.py index b3dd03bd..29b17b02 100644 --- a/autogpts/autogpt/autogpt/speech/base.py +++ b/autogpts/autogpt/autogpt/speech/base.py @@ -9,10 +9,8 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: from autogpt.config import Config -from autogpt.singleton import AbstractSingleton - -class VoiceBase(AbstractSingleton): +class VoiceBase: """ Base class for all voice classes. """ diff --git a/autogpts/autogpt/autogpt/speech/say.py b/autogpts/autogpt/autogpt/speech/say.py index 5d04c76f..301fda49 100644 --- a/autogpts/autogpt/autogpt/speech/say.py +++ b/autogpts/autogpt/autogpt/speech/say.py @@ -19,32 +19,37 @@ _QUEUE_SEMAPHORE = Semaphore( ) # The amount of sounds to queue before blocking the main thread -def say_text(text: str, config: Config, voice_index: int = 0) -> None: - """Speak the given text using the given voice index""" - default_voice_engine, voice_engine = _get_voice_engine(config) +class TextToSpeechProvider: + def __init__(self, config: Config): + self._config = config + self._default_voice_engine, self._voice_engine = self._get_voice_engine(config) - def speak() -> None: - success = voice_engine.say(text, voice_index) - if not success: - default_voice_engine.say(text) + def say(self, text, voice_index: int = 0) -> None: + def _speak() -> None: + success = self._voice_engine.say(text, voice_index) + if not success: + self._default_voice_engine.say(text, voice_index) + _QUEUE_SEMAPHORE.release() - _QUEUE_SEMAPHORE.release() + if self._config.speak_mode: + _QUEUE_SEMAPHORE.acquire(True) + thread = threading.Thread(target=_speak) + thread.start() - _QUEUE_SEMAPHORE.acquire(True) - thread = threading.Thread(target=speak) - thread.start() + def __repr__(self): + return f"{self.__class__.__name__}(enabled={self._config.speak_mode}, provider={self._voice_engine.__class__.__name__})" + @staticmethod + def _get_voice_engine(config: Config) -> tuple[VoiceBase, VoiceBase]: + """Get the voice engine to use for the given configuration""" + tts_provider = config.text_to_speech_provider + if tts_provider == "elevenlabs": + voice_engine = ElevenLabsSpeech(config) + elif tts_provider == "macos": + voice_engine = MacOSTTS(config) + elif tts_provider == "streamelements": + voice_engine = StreamElementsSpeech(config) + else: + voice_engine = GTTSVoice(config) -def _get_voice_engine(config: Config) -> tuple[VoiceBase, VoiceBase]: - """Get the voice engine to use for the given configuration""" - tts_provider = config.text_to_speech_provider - if tts_provider == "elevenlabs": - voice_engine = ElevenLabsSpeech(config) - elif tts_provider == "macos": - voice_engine = MacOSTTS(config) - elif tts_provider == "streamelements": - voice_engine = StreamElementsSpeech(config) - else: - voice_engine = GTTSVoice(config) - - return GTTSVoice(config), voice_engine + return GTTSVoice(config), voice_engine