added tts generation support

2025-12-20 14:14:52 +01:00 · 2023-11-08 13:55:58 +01:00
parent 450b86d5a7
commit d637e35498
7 changed files with 159 additions and 3 deletions
--- a/.env.example
+++ b/.env.example
@@ -19,6 +19,7 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
 # TRANSCRIPTION_PRICE=0.006
 # ENABLE_QUOTING=true
 # ENABLE_IMAGE_GENERATION=true
+# ENABLE_TTS_GENERATION=true
 # ENABLE_TRANSCRIPTION=true
 # PROXY=http://localhost:8080
 # OPENAI_MODEL=gpt-3.5-turbo
@@ -37,4 +38,6 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
 # IMAGE_SIZE=512x512
 # GROUP_TRIGGER_KEYWORD=""
 # IGNORE_GROUP_TRANSCRIPTIONS=true
+# TTS_MODEL="tts-1"
+# TTS_VOICE="alloy"
 # BOT_LANGUAGE=en
--- a/README.md
+++ b/README.md
@@ -81,6 +81,7 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
 | `ENABLE_QUOTING`                   | Whether to enable message quoting in private chats                                                                                                                                                                                                                    | `true`                              |
 | `ENABLE_IMAGE_GENERATION`          | Whether to enable image generation via the `/image` command                                                                                                                                                                                                           | `true`                              |
 | `ENABLE_TRANSCRIPTION`             | Whether to enable transcriptions of audio and video messages                                                                                                                                                                                                          | `true`                              |
+| `ENABLE_TTS_GENERATION`             | Whether to enable text to speech generation via the `/tts`                                                                                                                                                                                                              | `true`                              |
 | `PROXY`                            | Proxy to be used for OpenAI and Telegram bot (e.g. `http://localhost:8080`)                                                                                                                                                                                           | -                                   |
 | `OPENAI_MODEL`                     | The OpenAI model to use for generating responses. You can find all available models [here](https://platform.openai.com/docs/models/)                                                                                                                                  | `gpt-3.5-turbo`                     |
 | `ASSISTANT_PROMPT`                 | A system message that sets the tone and controls the behavior of the assistant                                                                                                                                                                                        | `You are a helpful assistant.`      |
@@ -100,6 +101,8 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
 | `IGNORE_GROUP_TRANSCRIPTIONS`      | If set to true, the bot will not process transcriptions in group chats                                                                                                                                                                                                | `true`                              |
 | `BOT_LANGUAGE`                     | Language of general bot messages. Currently available: `en`, `de`, `ru`, `tr`, `it`, `fi`, `es`, `id`, `nl`, `zh-cn`, `zh-tw`, `vi`, `fa`, `pt-br`, `uk`, `ms`.  [Contribute with additional translations](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/219) | `en`                                |
 | `WHISPER_PROMPT`                     | To improve the accuracy of Whisper's transcription service, especially for specific names or terms, you can set up a custom message.  [Speech to text - Prompting](https://platform.openai.com/docs/guides/speech-to-text/prompting) | `-`                                |
+| `TTS_VOICE`                       | The Text to Speech voice to use. Allowed values: `alloy`, `echo`, `fable`, `onyx`, `nova`, or `shimmer`                                                                                                                                                                                  | `alloy`                           |
+| `TTS_MODEL`                       | The Text to Speech model to use. Allowed values: `tts-1` or `tts-1-hd`                                                                                                                                                                                  | `tts-1`                           |

 Check out the [official API reference](https://platform.openai.com/docs/api-reference/chat) for more details.

--- a/bot/main.py
+++ b/bot/main.py
@@ -50,6 +50,8 @@ def main():
        'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
        'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true',
        'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''),
+        'tts_model': os.environ.get('TTS_MODEL', 'tts-1'),
+        'tts_voice': os.environ.get('TTS_VOICE', 'alloy'),
    }

    if openai_config['enable_functions'] and not functions_available:
@@ -70,6 +72,7 @@ def main():
        'enable_quoting': os.environ.get('ENABLE_QUOTING', 'true').lower() == 'true',
        'enable_image_generation': os.environ.get('ENABLE_IMAGE_GENERATION', 'true').lower() == 'true',
        'enable_transcription': os.environ.get('ENABLE_TRANSCRIPTION', 'true').lower() == 'true',
+        'enable_tts_generation': os.environ.get('ENABLE_TTS_GENERATION', 'true').lower() == 'true',
        'budget_period': os.environ.get('BUDGET_PERIOD', 'monthly').lower(),
        'user_budgets': os.environ.get('USER_BUDGETS', os.environ.get('MONTHLY_USER_BUDGETS', '*')),
        'guest_budget': float(os.environ.get('GUEST_BUDGET', os.environ.get('MONTHLY_GUEST_BUDGET', '100.0'))),
@@ -81,6 +84,8 @@ def main():
        'group_trigger_keyword': os.environ.get('GROUP_TRIGGER_KEYWORD', ''),
        'token_price': float(os.environ.get('TOKEN_PRICE', 0.002)),
        'image_prices': [float(i) for i in os.environ.get('IMAGE_PRICES', "0.016,0.018,0.02").split(",")],
+        'tts_model': os.environ.get('TTS_MODEL', 'tts-1'),
+        'tts_prices': [float(i) for i in os.environ.get('TTS_PRICES', "0.015,0.030").split(",")],
        'transcription_price': float(os.environ.get('TRANSCRIPTION_PRICE', 0.006)),
        'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
    }
--- a/bot/openai_helper.py
+++ b/bot/openai_helper.py
@@ -10,6 +10,7 @@ import openai
 import requests
 import json
 import httpx
+import tempfile
 from datetime import date
 from calendar import monthrange

@@ -334,6 +335,27 @@ class OpenAIHelper:
        except Exception as e:
            raise Exception(f"⚠️ _{localized_text('error', bot_language)}._ ⚠️\n{str(e)}") from e

+    async def generate_speech(self, text: str) -> tuple[any, int]:
+        """
+        Generates speech audio (opus format) from the given text using the configured TTS model and voice.
+        :param text: The text to send to the model
+        :return: A temporary file holding the audio (the caller has to close it) and the text length
+        """
+        bot_language = self.config['bot_language']
+        temp_file = None
+        try:
+            response = await self.client.audio.speech.create(
+                model=self.config['tts_model'], voice=self.config['tts_voice'],
+                input=text, response_format='opus'
+            )
+            temp_file = tempfile.NamedTemporaryFile()
+            response.stream_to_file(temp_file.name)
+            return temp_file, len(text)
+        except Exception as e:
+            if temp_file is not None:
+                temp_file.close()  # do not leak the temporary file when writing the audio fails
+            raise Exception(f"⚠️ _{localized_text('error', bot_language)}._ ⚠️\n{str(e)}") from e
+
    async def transcribe(self, filename):
        """
        Transcribes the audio file using the Whisper model.
--- a/bot/telegram_bot.py
+++ b/bot/telegram_bot.py
@@ -46,6 +46,9 @@ class ChatGPTTelegramBot:
        if self.config.get('enable_image_generation', False):
            self.commands.append(BotCommand(command='image', description=localized_text('image_description', bot_language)))

+        if self.config.get('enable_tts_generation', False):  # key must match the flag set in main.py and checked by tts()
+            self.commands.append(BotCommand(command='tts', description=localized_text('tts_description', bot_language)))
+
        self.group_commands = [BotCommand(
            command='chat', description=localized_text('chat_description', bot_language)
        )] + self.commands
@@ -94,6 +97,7 @@ class ChatGPTTelegramBot:
        images_today, images_month = self.usage[user_id].get_current_image_count()
        (transcribe_minutes_today, transcribe_seconds_today, transcribe_minutes_month,
         transcribe_seconds_month) = self.usage[user_id].get_current_transcription_duration()
+        characters_today, characters_month = self.usage[user_id].get_current_tts_usage()
        current_cost = self.usage[user_id].get_current_cost()

        chat_id = update.effective_chat.id
@@ -112,11 +116,16 @@ class ChatGPTTelegramBot:
        text_today_images = ""
        if self.config.get('enable_image_generation', False):
            text_today_images = f"{images_today} {localized_text('stats_images', bot_language)}\n"
+
+        text_today_tts = ""
+        if self.config.get('enable_tts_generation', False):
+            text_today_tts = f"{characters_today} {localized_text('stats_tts', bot_language)}\n"
        
        text_today = (
            f"*{localized_text('usage_today', bot_language)}:*\n"
            f"{tokens_today} {localized_text('stats_tokens', bot_language)}\n"
            f"{text_today_images}"  # Include the image statistics for today if applicable
+            f"{text_today_tts}"
            f"{transcribe_minutes_today} {localized_text('stats_transcribe', bot_language)[0]} "
            f"{transcribe_seconds_today} {localized_text('stats_transcribe', bot_language)[1]}\n"
            f"{localized_text('stats_total', bot_language)}{current_cost['cost_today']:.2f}\n"
@@ -126,12 +135,17 @@ class ChatGPTTelegramBot:
        text_month_images = ""
        if self.config.get('enable_image_generation', False):
            text_month_images = f"{images_month} {localized_text('stats_images', bot_language)}\n"
+
+        text_month_tts = ""
+        if self.config.get('enable_tts_generation', False):
+            text_month_tts = f"{characters_month} {localized_text('stats_tts', bot_language)}\n"
        
        # Check if image generation is enabled and, if so, generate the image statistics for the month
        text_month = (
            f"*{localized_text('usage_month', bot_language)}:*\n"
            f"{tokens_month} {localized_text('stats_tokens', bot_language)}\n"
            f"{text_month_images}"  # Include the image statistics for the month if applicable
+            f"{text_month_tts}"
            f"{transcribe_minutes_month} {localized_text('stats_transcribe', bot_language)[0]} "
            f"{transcribe_seconds_month} {localized_text('stats_transcribe', bot_language)[1]}\n"
            f"{localized_text('stats_total', bot_language)}{current_cost['cost_month']:.2f}"
@@ -250,6 +264,52 @@ class ChatGPTTelegramBot:

        await wrap_with_indicator(update, context, _generate, constants.ChatAction.UPLOAD_PHOTO)

+    async def tts(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
+        """
+        Generates speech for the given input using the TTS API and replies with it as a voice message
+        """
+        if not self.config['enable_tts_generation'] \
+                or not await self.check_allowed_and_within_budget(update, context):
+            return
+
+        tts_query = message_text(update.message)
+        if tts_query == '':
+            await update.effective_message.reply_text(
+                message_thread_id=get_thread_id(update),
+                text=localized_text('tts_no_prompt', self.config['bot_language'])
+            )
+            return
+
+        logging.info(f'New speech generation request received from user {update.message.from_user.name} '
+                     f'(id: {update.message.from_user.id})')
+
+        async def _generate():
+            try:
+                speech_file, text_length = await self.openai.generate_speech(text=tts_query)
+                try:
+                    await update.effective_message.reply_voice(
+                        reply_to_message_id=get_reply_to_message_id(self.config, update),
+                        voice=speech_file
+                    )
+                finally:
+                    speech_file.close()  # release the speech file even when sending the voice message fails
+                # add tts request to users usage tracker
+                user_id = update.message.from_user.id
+                self.usage[user_id].add_tts_request(text_length, self.config['tts_model'], self.config['tts_prices'])
+                # add guest tts request to guest usage tracker
+                if str(user_id) not in self.config['allowed_user_ids'].split(',') and 'guests' in self.usage:
+                    self.usage["guests"].add_tts_request(text_length, self.config['tts_model'], self.config['tts_prices'])
+            except Exception as e:
+                logging.exception(e)
+                await update.effective_message.reply_text(
+                    message_thread_id=get_thread_id(update),
+                    reply_to_message_id=get_reply_to_message_id(self.config, update),
+                    text=f"{localized_text('tts_fail', self.config['bot_language'])}: {str(e)}",
+                    parse_mode=constants.ParseMode.MARKDOWN
+                )
+
+        await wrap_with_indicator(update, context, _generate, constants.ChatAction.UPLOAD_VOICE)
+
    async def transcribe(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
        """
        Transcribe audio messages.
@@ -786,6 +846,7 @@ class ChatGPTTelegramBot:
        application.add_handler(CommandHandler('reset', self.reset))
        application.add_handler(CommandHandler('help', self.help))
        application.add_handler(CommandHandler('image', self.image))
+        application.add_handler(CommandHandler('tts', self.tts))
        application.add_handler(CommandHandler('start', self.help))
        application.add_handler(CommandHandler('stats', self.stats))
        application.add_handler(CommandHandler('resend', self.resend))
--- a/bot/usage_tracker.py
+++ b/bot/usage_tracker.py
@@ -56,6 +56,8 @@ class UsageTracker:
        if os.path.isfile(self.user_file):
            with open(self.user_file, "r") as file:
                self.usage = json.load(file)
+            if 'tts_characters' not in self.usage['usage_history']:
+                self.usage['usage_history']['tts_characters'] = {}
        else:
            # ensure directory exists
            pathlib.Path(logs_dir).mkdir(exist_ok=True)
@@ -63,7 +65,7 @@ class UsageTracker:
            self.usage = {
                "user_name": user_name,
                "current_cost": {"day": 0.0, "month": 0.0, "all_time": 0.0, "last_update": str(date.today())},
-                "usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}}
+                "usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}, "tts_characters": {}}
            }

    # token usage functions:
@@ -151,6 +153,57 @@ class UsageTracker:
                usage_month += sum(images)
        return usage_day, usage_month

+    # tts usage functions:
+
+    def add_tts_request(self, text_length, tts_model, tts_prices):
+        """Adds a text to speech request to the usage history and updates the current costs.
+
+        :param text_length: number of characters that were converted to speech
+        :param tts_model: model used for the request, one of 'tts-1' or 'tts-1-hd'
+        :param tts_prices: prices per 1000 characters, ordered as ['tts-1', 'tts-1-hd']
+        """
+        known_models = ['tts-1', 'tts-1-hd']
+        price_per_1k = tts_prices[known_models.index(tts_model)]
+        day_key = str(date.today())
+        # cost of this request, rounded to two decimals
+        request_cost = round(text_length * price_per_1k / 1000, 2)
+        self.add_current_costs(request_cost)
+
+        # make sure the nested history dicts for this model exist
+        tts_history = self.usage['usage_history'].setdefault('tts_characters', {})
+        per_day = tts_history.setdefault(tts_model, {})
+
+        # update usage_history: add the text length to today's entry, starting from 0 on first use
+        per_day[day_key] = per_day.get(day_key, 0) + text_length
+
+        # write updated usage to user file
+        with open(self.user_file, "w") as outfile:
+            json.dump(self.usage, outfile)
+
+    def get_current_tts_usage(self):
+        """Get the number of characters converted to speech today and this month.
+
+        :return: characters per day and per month, summed over the 'tts-1' and 'tts-1-hd' models
+        """
+        history = self.usage["usage_history"]["tts_characters"]
+        day_key = str(date.today())
+        month_prefix = day_key[:7]  # year-month as string
+
+        per_day = 0
+        per_month = 0
+        for model_name in ('tts-1', 'tts-1-hd'):
+            # a model that was never used has no entry in the history
+            if model_name not in history:
+                continue
+            model_history = history[model_name]
+            if day_key in model_history:
+                per_day += model_history[day_key]
+            for entry_date, characters in model_history.items():
+                if entry_date.startswith(month_prefix):
+                    per_month += characters
+        return int(per_day), int(per_month)
+
+
    # transcription usage functions:

    def add_transcription_seconds(self, seconds, minute_price=0.006):
@@ -236,13 +289,14 @@ class UsageTracker:
        cost_all_time = self.usage["current_cost"].get("all_time", self.initialize_all_time_cost())
        return {"cost_today": cost_day, "cost_month": cost_month, "cost_all_time": cost_all_time}

-    def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006):
+    def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006, tts_prices='0.015,0.030'):
        """Get total USD amount of all requests in history
        
        :param tokens_price: price per 1000 tokens, defaults to 0.002
        :param image_prices: prices for images of sizes ["256x256", "512x512", "1024x1024"],
            defaults to [0.016, 0.018, 0.02]
        :param minute_price: price per minute transcription, defaults to 0.006
+        :param tts_prices: comma-separated prices per 1000 characters of tts for the models ['tts-1', 'tts-1-hd'], defaults to '0.015,0.030'
        :return: total cost of all requests
        """
        total_tokens = sum(self.usage['usage_history']['chat_tokens'].values())
@@ -255,5 +309,9 @@ class UsageTracker:
        total_transcription_seconds = sum(self.usage['usage_history']['transcription_seconds'].values())
        transcription_cost = round(total_transcription_seconds * minute_price / 60, 2)

-        all_time_cost = token_cost + transcription_cost + image_cost
+        tts_history = self.usage['usage_history']['tts_characters']  # characters per day, keyed by model
+        tts_prices_by_model = zip(['tts-1', 'tts-1-hd'], [float(x) for x in tts_prices.split(',')])  # pair each price with its model, not with dict order
+        tts_cost = round(sum(sum(tts_history.get(model, {}).values()) * price / 1000 for model, price in tts_prices_by_model), 2)
+
+        all_time_cost = token_cost + transcription_cost + image_cost + tts_cost
        return all_time_cost
--- a/translations.json
+++ b/translations.json
@@ -3,6 +3,7 @@
        "help_description":"Show help message",
        "reset_description":"Reset the conversation. Optionally pass high-level instructions (e.g. /reset You are a helpful assistant)",
        "image_description":"Generate image from prompt (e.g. /image cat)",
+        "tts_description":"Generate speech from text (e.g. /tts my house)",
        "stats_description":"Get your current usage statistics",
        "resend_description":"Resend the latest message",
        "chat_description":"Chat with the bot!",
@@ -14,6 +15,7 @@
        "usage_month":"Usage this month",
        "stats_tokens":"tokens",
        "stats_images":"images generated",
+        "stats_tts":"characters converted to speech",
        "stats_transcribe":["minutes and", "seconds transcribed"],
        "stats_total":"💰 For a total amount of $",
        "stats_budget":"Your remaining budget",
@@ -24,7 +26,9 @@
        "resend_failed":"You have nothing to resend",
        "reset_done":"Done!",
        "image_no_prompt":"Please provide a prompt! (e.g. /image cat)",
+        "tts_no_prompt":"Please provide text! (e.g. /tts my house)",
        "image_fail":"Failed to generate image",
+        "tts_fail":"Failed to generate speech",
        "media_download_fail":["Failed to download audio file", "Make sure the file is not too large. (max 20MB)"],
        "media_type_fail":"Unsupported file type",
        "transcript":"Transcript",