diff --git a/.env.example b/.env.example index d6eb078..9de40bc 100644 --- a/.env.example +++ b/.env.example @@ -19,6 +19,7 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2 # TRANSCRIPTION_PRICE=0.006 # ENABLE_QUOTING=true # ENABLE_IMAGE_GENERATION=true +# ENABLE_TTS_GENERATION=true # ENABLE_TRANSCRIPTION=true # PROXY=http://localhost:8080 # OPENAI_MODEL=gpt-3.5-turbo @@ -37,4 +38,6 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2 # IMAGE_SIZE=512x512 # GROUP_TRIGGER_KEYWORD="" # IGNORE_GROUP_TRANSCRIPTIONS=true +# TTS_MODEL="tts-1" +# TTS_VOICE="alloy" # BOT_LANGUAGE=en \ No newline at end of file diff --git a/README.md b/README.md index 34e23b3..b5bea20 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,7 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di | `ENABLE_QUOTING` | Whether to enable message quoting in private chats | `true` | | `ENABLE_IMAGE_GENERATION` | Whether to enable image generation via the `/image` command | `true` | | `ENABLE_TRANSCRIPTION` | Whether to enable transcriptions of audio and video messages | `true` | +| `ENABLE_TTS_GENERATION` | Whether to enable text to speech generation via the `/tts` command | `true` | | `PROXY` | Proxy to be used for OpenAI and Telegram bot (e.g. `http://localhost:8080`) | - | | `OPENAI_MODEL` | The OpenAI model to use for generating responses. You can find all available models [here](https://platform.openai.com/docs/models/) | `gpt-3.5-turbo` | | `ASSISTANT_PROMPT` | A system message that sets the tone and controls the behavior of the assistant | `You are a helpful assistant.` | @@ -100,6 +101,8 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di | `IGNORE_GROUP_TRANSCRIPTIONS` | If set to true, the bot will not process transcriptions in group chats | `true` | | `BOT_LANGUAGE` | Language of general bot messages. Currently available: `en`, `de`, `ru`, `tr`, `it`, `fi`, `es`, `id`, `nl`, `zh-cn`, `zh-tw`, `vi`, `fa`, `pt-br`, `uk`, `ms`. 
[Contribute with additional translations](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/219) | `en` | | `WHISPER_PROMPT` | To improve the accuracy of Whisper's transcription service, especially for specific names or terms, you can set up a custom message. [Speech to text - Prompting](https://platform.openai.com/docs/guides/speech-to-text/prompting) | `-` | +| `TTS_VOICE` | The Text to Speech voice to use. Allowed values: `alloy`, `echo`, `fable`, `onyx`, `nova`, or `shimmer` | `alloy` | +| `TTS_MODEL` | The Text to Speech model to use. Allowed values: `tts-1` or `tts-1-hd` | `tts-1` | Check out the [official API reference](https://platform.openai.com/docs/api-reference/chat) for more details. diff --git a/bot/main.py b/bot/main.py index d7605fd..f0e5e89 100644 --- a/bot/main.py +++ b/bot/main.py @@ -50,6 +50,8 @@ def main(): 'bot_language': os.environ.get('BOT_LANGUAGE', 'en'), 'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true', 'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''), + 'tts_model': os.environ.get('TTS_MODEL', 'tts-1'), + 'tts_voice': os.environ.get('TTS_VOICE', 'alloy'), } if openai_config['enable_functions'] and not functions_available: @@ -70,6 +72,7 @@ def main(): 'enable_quoting': os.environ.get('ENABLE_QUOTING', 'true').lower() == 'true', 'enable_image_generation': os.environ.get('ENABLE_IMAGE_GENERATION', 'true').lower() == 'true', 'enable_transcription': os.environ.get('ENABLE_TRANSCRIPTION', 'true').lower() == 'true', + 'enable_tts_generation': os.environ.get('ENABLE_TTS_GENERATION', 'true').lower() == 'true', 'budget_period': os.environ.get('BUDGET_PERIOD', 'monthly').lower(), 'user_budgets': os.environ.get('USER_BUDGETS', os.environ.get('MONTHLY_USER_BUDGETS', '*')), 'guest_budget': float(os.environ.get('GUEST_BUDGET', os.environ.get('MONTHLY_GUEST_BUDGET', '100.0'))), @@ -81,6 +84,8 @@ def main(): 'group_trigger_keyword': os.environ.get('GROUP_TRIGGER_KEYWORD', ''), 'token_price': 
async def generate_speech(self, text: str) -> "tuple[io.BytesIO, int]":
    """
    Generates an audio from the given text using TTS model.
    :param text: The text to send to the model
    :return: An in-memory binary stream (positioned at its start) holding the
             Opus-encoded audio, and the number of characters in ``text``
    :raises Exception: wrapping any error raised while calling the API,
                       prefixed with the localized error text
    """
    # Function-scope import: nothing else in this block needs ``io``.
    import io

    bot_language = self.config['bot_language']
    try:
        response = await self.client.audio.speech.create(
            model=self.config['tts_model'],
            voice=self.config['tts_voice'],
            input=text,
            response_format='opus'
        )

        # Keep the audio in memory instead of round-tripping through a named
        # temporary file: re-opening a NamedTemporaryFile by name does not work
        # on every platform, and nothing is left on disk if the caller forgets
        # to close the returned object. BytesIO still supports .read()/.close().
        audio = io.BytesIO(response.read())
        return audio, len(text)
    except Exception as e:
        raise Exception(f"⚠️ _{localized_text('error', bot_language)}._ ⚠️\n{str(e)}") from e
diff --git a/bot/telegram_bot.py b/bot/telegram_bot.py index 57a3f0a..3cb6d9b 100644 --- a/bot/telegram_bot.py +++ b/bot/telegram_bot.py @@ -46,6 +46,9 @@ class ChatGPTTelegramBot: if self.config.get('enable_image_generation', False): self.commands.append(BotCommand(command='image', description=localized_text('image_description', bot_language))) + if self.config.get('enable_speech_generation', False): + self.commands.append(BotCommand(command='tts', description=localized_text('speech_description', bot_language))) + self.group_commands = [BotCommand( command='chat', description=localized_text('chat_description', bot_language) )] + self.commands @@ -94,6 +97,7 @@ class ChatGPTTelegramBot: images_today, images_month = self.usage[user_id].get_current_image_count() (transcribe_minutes_today, transcribe_seconds_today, transcribe_minutes_month, transcribe_seconds_month) = self.usage[user_id].get_current_transcription_duration() + characters_today, characters_month = self.usage[user_id].get_current_tts_usage() current_cost = self.usage[user_id].get_current_cost() chat_id = update.effective_chat.id @@ -112,11 +116,16 @@ class ChatGPTTelegramBot: text_today_images = "" if self.config.get('enable_image_generation', False): text_today_images = f"{images_today} {localized_text('stats_images', bot_language)}\n" + + text_today_tts = "" + if self.config.get('enable_tts_generation', False): + text_today_tts = f"{characters_today} {localized_text('stats_tts', bot_language)}\n" text_today = ( f"*{localized_text('usage_today', bot_language)}:*\n" f"{tokens_today} {localized_text('stats_tokens', bot_language)}\n" f"{text_today_images}" # Include the image statistics for today if applicable + f"{text_today_tts}" f"{transcribe_minutes_today} {localized_text('stats_transcribe', bot_language)[0]} " f"{transcribe_seconds_today} {localized_text('stats_transcribe', bot_language)[1]}\n" f"{localized_text('stats_total', bot_language)}{current_cost['cost_today']:.2f}\n" @@ -126,12 +135,17 @@ 
async def tts(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
    """
    Generates speech for the given input using the TTS APIs and replies with
    it as a voice message.

    Returns without replying when TTS generation is disabled in the config or
    when ``check_allowed_and_within_budget`` rejects the request. An empty
    query is answered with the localized 'tts_no_prompt' text.
    """
    if not self.config['enable_tts_generation'] \
            or not await self.check_allowed_and_within_budget(update, context):
        return

    tts_query = message_text(update.message)
    if tts_query == '':
        await update.effective_message.reply_text(
            message_thread_id=get_thread_id(update),
            text=localized_text('tts_no_prompt', self.config['bot_language'])
        )
        return

    logging.info(f'New speech generation request received from user {update.message.from_user.name} '
                 f'(id: {update.message.from_user.id})')

    async def _generate():
        try:
            speech_file, text_length = await self.openai.generate_speech(text=tts_query)

            try:
                await update.effective_message.reply_voice(
                    reply_to_message_id=get_reply_to_message_id(self.config, update),
                    voice=speech_file
                )
            finally:
                # Release the audio handle even when sending the voice message fails
                speech_file.close()

            # add tts request to users usage tracker (only reached after a successful send)
            user_id = update.message.from_user.id
            self.usage[user_id].add_tts_request(text_length, self.config['tts_model'], self.config['tts_prices'])
            # add guest tts request to guest usage tracker
            if str(user_id) not in self.config['allowed_user_ids'].split(',') and 'guests' in self.usage:
                self.usage["guests"].add_tts_request(text_length, self.config['tts_model'], self.config['tts_prices'])

        except Exception as e:
            logging.exception(e)
            await update.effective_message.reply_text(
                message_thread_id=get_thread_id(update),
                reply_to_message_id=get_reply_to_message_id(self.config, update),
                text=f"{localized_text('tts_fail', self.config['bot_language'])}: {str(e)}",
                parse_mode=constants.ParseMode.MARKDOWN
            )

    await wrap_with_indicator(update, context, _generate, constants.ChatAction.UPLOAD_VOICE)
# tts usage functions:

def add_tts_request(self, text_length, tts_model, tts_prices):
    """Add a text-to-speech request to the usage history and update current costs.

    :param text_length: number of characters that were converted to speech
    :param tts_model: model used for the request, one of ['tts-1', 'tts-1-hd']
    :param tts_prices: prices per 1000 characters, in the same order as the
        models ['tts-1', 'tts-1-hd']
    :raises ValueError: if tts_model is not one of the known models
    """
    tts_models = ['tts-1', 'tts-1-hd']
    if tts_model not in tts_models:
        raise ValueError(f"Unknown TTS model '{tts_model}', expected one of {tts_models}")
    price = tts_prices[tts_models.index(tts_model)]

    # Keep sub-cent precision: rounding a single request to two decimals would
    # record every short text (a few hundred characters) as costing 0.00.
    tts_price = round(text_length * price / 1000, 6)
    self.add_current_costs(tts_price)

    # create the nested history entries on first use, then accumulate per day
    per_model = self.usage['usage_history'].setdefault('tts_characters', {})
    per_day = per_model.setdefault(tts_model, {})
    day_key = str(date.today())
    per_day[day_key] = per_day.get(day_key, 0) + text_length

    # write updated tts usage to user file
    with open(self.user_file, "w") as outfile:
        json.dump(self.usage, outfile)


def get_current_tts_usage(self):
    """Get length of speech generated for today and this month.

    Sums the stored character counts over every model present in the history;
    a missing 'tts_characters' entry is treated as no usage.

    :return: total amount of characters converted to speech per day and per month
    """
    day_key = str(date.today())
    month_key = day_key[:7]  # year-month as string
    per_model = self.usage['usage_history'].get('tts_characters', {})

    characters_day = 0
    characters_month = 0
    for per_day in per_model.values():
        characters_day += per_day.get(day_key, 0)
        characters_month += sum(
            characters for day, characters in per_day.items() if day.startswith(month_key)
        )
    return int(characters_day), int(characters_month)
sum(self.usage['usage_history']['transcription_seconds'].values()) transcription_cost = round(total_transcription_seconds * minute_price / 60, 2) - all_time_cost = token_cost + transcription_cost + image_cost + total_characters = [sum(tts_model.values()) for tts_model in self.usage['usage_history']['tts_characters'].values()] + tts_prices_list = [float(x) for x in tts_prices.split(',')] + tts_cost = round(sum([count * price / 1000 for count, price in zip(total_characters, tts_prices_list)]), 2) + + all_time_cost = token_cost + transcription_cost + image_cost + tts_cost return all_time_cost diff --git a/translations.json b/translations.json index f603efb..63e1245 100644 --- a/translations.json +++ b/translations.json @@ -3,6 +3,7 @@ "help_description":"Show help message", "reset_description":"Reset the conversation. Optionally pass high-level instructions (e.g. /reset You are a helpful assistant)", "image_description":"Generate image from prompt (e.g. /image cat)", + "tts_description":"Generate speech from text (e.g. /tts my house)", "stats_description":"Get your current usage statistics", "resend_description":"Resend the latest message", "chat_description":"Chat with the bot!", @@ -14,6 +15,7 @@ "usage_month":"Usage this month", "stats_tokens":"tokens", "stats_images":"images generated", + "stats_tts":"characters converted to speech", "stats_transcribe":["minutes and", "seconds transcribed"], "stats_total":"💰 For a total amount of $", "stats_budget":"Your remaining budget", @@ -24,7 +26,9 @@ "resend_failed":"You have nothing to resend", "reset_done":"Done!", "image_no_prompt":"Please provide a prompt! (e.g. /image cat)", + "tts_no_prompt":"Please provide text! (e.g. /tts my house)", "image_fail":"Failed to generate image", + "tts_fail":"Failed to generate speech", "media_download_fail":["Failed to download audio file", "Make sure the file is not too large. (max 20MB)"], "media_type_fail":"Unsupported file type", "transcript":"Transcript",