mirror of
https://github.com/aljazceru/chatgpt-telegram-bot.git
synced 2025-12-20 14:14:52 +01:00
added tts generation support
This commit is contained in:
@@ -19,6 +19,7 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
|
||||
# TRANSCRIPTION_PRICE=0.006
|
||||
# ENABLE_QUOTING=true
|
||||
# ENABLE_IMAGE_GENERATION=true
|
||||
# ENABLE_TTS_GENERATION=true
|
||||
# ENABLE_TRANSCRIPTION=true
|
||||
# PROXY=http://localhost:8080
|
||||
# OPENAI_MODEL=gpt-3.5-turbo
|
||||
@@ -37,4 +38,6 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
|
||||
# IMAGE_SIZE=512x512
|
||||
# GROUP_TRIGGER_KEYWORD=""
|
||||
# IGNORE_GROUP_TRANSCRIPTIONS=true
|
||||
# TTS_MODEL="tts-1"
|
||||
# TTS_VOICE="alloy"
|
||||
# BOT_LANGUAGE=en
|
||||
@@ -81,6 +81,7 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
|
||||
| `ENABLE_QUOTING` | Whether to enable message quoting in private chats | `true` |
|
||||
| `ENABLE_IMAGE_GENERATION` | Whether to enable image generation via the `/image` command | `true` |
|
||||
| `ENABLE_TRANSCRIPTION` | Whether to enable transcriptions of audio and video messages | `true` |
|
||||
| `ENABLE_TTS_GENERATION` | Whether to enable text to speech generation via the `/tts` command | `true` |
|
||||
| `PROXY` | Proxy to be used for OpenAI and Telegram bot (e.g. `http://localhost:8080`) | - |
|
||||
| `OPENAI_MODEL` | The OpenAI model to use for generating responses. You can find all available models [here](https://platform.openai.com/docs/models/) | `gpt-3.5-turbo` |
|
||||
| `ASSISTANT_PROMPT` | A system message that sets the tone and controls the behavior of the assistant | `You are a helpful assistant.` |
|
||||
@@ -100,6 +101,8 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
|
||||
| `IGNORE_GROUP_TRANSCRIPTIONS` | If set to true, the bot will not process transcriptions in group chats | `true` |
|
||||
| `BOT_LANGUAGE` | Language of general bot messages. Currently available: `en`, `de`, `ru`, `tr`, `it`, `fi`, `es`, `id`, `nl`, `zh-cn`, `zh-tw`, `vi`, `fa`, `pt-br`, `uk`, `ms`. [Contribute with additional translations](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/219) | `en` |
|
||||
| `WHISPER_PROMPT` | To improve the accuracy of Whisper's transcription service, especially for specific names or terms, you can set up a custom message. [Speech to text - Prompting](https://platform.openai.com/docs/guides/speech-to-text/prompting) | `-` |
|
||||
| `TTS_VOICE` | The Text to Speech voice to use. Allowed values: `alloy`, `echo`, `fable`, `onyx`, `nova`, or `shimmer` | `alloy` |
|
||||
| `TTS_MODEL` | The Text to Speech model to use. Allowed values: `tts-1` or `tts-1-hd` | `tts-1` |
|
||||
|
||||
Check out the [official API reference](https://platform.openai.com/docs/api-reference/chat) for more details.
|
||||
|
||||
|
||||
@@ -50,6 +50,8 @@ def main():
|
||||
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
|
||||
'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true',
|
||||
'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''),
|
||||
'tts_model': os.environ.get('TTS_MODEL', 'tts-1'),
|
||||
'tts_voice': os.environ.get('TTS_VOICE', 'alloy'),
|
||||
}
|
||||
|
||||
if openai_config['enable_functions'] and not functions_available:
|
||||
@@ -70,6 +72,7 @@ def main():
|
||||
'enable_quoting': os.environ.get('ENABLE_QUOTING', 'true').lower() == 'true',
|
||||
'enable_image_generation': os.environ.get('ENABLE_IMAGE_GENERATION', 'true').lower() == 'true',
|
||||
'enable_transcription': os.environ.get('ENABLE_TRANSCRIPTION', 'true').lower() == 'true',
|
||||
'enable_tts_generation': os.environ.get('ENABLE_TTS_GENERATION', 'true').lower() == 'true',
|
||||
'budget_period': os.environ.get('BUDGET_PERIOD', 'monthly').lower(),
|
||||
'user_budgets': os.environ.get('USER_BUDGETS', os.environ.get('MONTHLY_USER_BUDGETS', '*')),
|
||||
'guest_budget': float(os.environ.get('GUEST_BUDGET', os.environ.get('MONTHLY_GUEST_BUDGET', '100.0'))),
|
||||
@@ -81,6 +84,8 @@ def main():
|
||||
'group_trigger_keyword': os.environ.get('GROUP_TRIGGER_KEYWORD', ''),
|
||||
'token_price': float(os.environ.get('TOKEN_PRICE', 0.002)),
|
||||
'image_prices': [float(i) for i in os.environ.get('IMAGE_PRICES', "0.016,0.018,0.02").split(",")],
|
||||
'tts_model': os.environ.get('TTS_MODEL', 'tts-1'),
|
||||
'tts_prices': [float(i) for i in os.environ.get('TTS_PRICES', "0.015,0.030").split(",")],
|
||||
'transcription_price': float(os.environ.get('TRANSCRIPTION_PRICE', 0.006)),
|
||||
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ import openai
|
||||
import requests
|
||||
import json
|
||||
import httpx
|
||||
import tempfile
|
||||
from datetime import date
|
||||
from calendar import monthrange
|
||||
|
||||
@@ -334,6 +335,27 @@ class OpenAIHelper:
|
||||
except Exception as e:
|
||||
raise Exception(f"⚠️ _{localized_text('error', bot_language)}._ ⚠️\n{str(e)}") from e
|
||||
|
||||
async def generate_speech(self, text: str) -> tuple[any, int]:
    """
    Generates an audio from the given text using TTS model.

    The model and voice are read from ``self.config['tts_model']`` and
    ``self.config['tts_voice']``; the audio is requested in ``opus`` format.

    :param text: The text to send to the model
    :return: A named temporary file holding the generated audio, and the length of the text.
        The caller is responsible for closing the file.
    :raises Exception: wrapping any error raised while requesting or storing the audio
    """
    bot_language = self.config['bot_language']
    temp_file = None
    try:
        response = await self.client.audio.speech.create(
            model=self.config['tts_model'],
            voice=self.config['tts_voice'],
            input=text,
            response_format='opus'
        )

        temp_file = tempfile.NamedTemporaryFile()
        # The response writes to the file by name; the open handle is what gets returned.
        response.stream_to_file(temp_file.name)
        return temp_file, len(text)
    except Exception as e:
        # Do not leave the temporary file behind when writing the audio failed.
        if temp_file is not None:
            temp_file.close()
        raise Exception(f"⚠️ _{localized_text('error', bot_language)}._ ⚠️\n{str(e)}") from e
|
||||
|
||||
async def transcribe(self, filename):
|
||||
"""
|
||||
Transcribes the audio file using the Whisper model.
|
||||
|
||||
@@ -46,6 +46,9 @@ class ChatGPTTelegramBot:
|
||||
if self.config.get('enable_image_generation', False):
|
||||
self.commands.append(BotCommand(command='image', description=localized_text('image_description', bot_language)))
|
||||
|
||||
# Use the same config and translation keys as the rest of the TTS feature
# ('enable_tts_generation' / 'tts_description'); otherwise /tts is never listed.
if self.config.get('enable_tts_generation', False):
    self.commands.append(BotCommand(command='tts', description=localized_text('tts_description', bot_language)))
|
||||
|
||||
self.group_commands = [BotCommand(
|
||||
command='chat', description=localized_text('chat_description', bot_language)
|
||||
)] + self.commands
|
||||
@@ -94,6 +97,7 @@ class ChatGPTTelegramBot:
|
||||
images_today, images_month = self.usage[user_id].get_current_image_count()
|
||||
(transcribe_minutes_today, transcribe_seconds_today, transcribe_minutes_month,
|
||||
transcribe_seconds_month) = self.usage[user_id].get_current_transcription_duration()
|
||||
characters_today, characters_month = self.usage[user_id].get_current_tts_usage()
|
||||
current_cost = self.usage[user_id].get_current_cost()
|
||||
|
||||
chat_id = update.effective_chat.id
|
||||
@@ -112,11 +116,16 @@ class ChatGPTTelegramBot:
|
||||
text_today_images = ""
|
||||
if self.config.get('enable_image_generation', False):
|
||||
text_today_images = f"{images_today} {localized_text('stats_images', bot_language)}\n"
|
||||
|
||||
text_today_tts = ""
|
||||
if self.config.get('enable_tts_generation', False):
|
||||
text_today_tts = f"{characters_today} {localized_text('stats_tts', bot_language)}\n"
|
||||
|
||||
text_today = (
|
||||
f"*{localized_text('usage_today', bot_language)}:*\n"
|
||||
f"{tokens_today} {localized_text('stats_tokens', bot_language)}\n"
|
||||
f"{text_today_images}" # Include the image statistics for today if applicable
|
||||
f"{text_today_tts}"
|
||||
f"{transcribe_minutes_today} {localized_text('stats_transcribe', bot_language)[0]} "
|
||||
f"{transcribe_seconds_today} {localized_text('stats_transcribe', bot_language)[1]}\n"
|
||||
f"{localized_text('stats_total', bot_language)}{current_cost['cost_today']:.2f}\n"
|
||||
@@ -126,12 +135,17 @@ class ChatGPTTelegramBot:
|
||||
text_month_images = ""
|
||||
if self.config.get('enable_image_generation', False):
|
||||
text_month_images = f"{images_month} {localized_text('stats_images', bot_language)}\n"
|
||||
|
||||
text_month_tts = ""
|
||||
if self.config.get('enable_tts_generation', False):
|
||||
text_month_tts = f"{characters_month} {localized_text('stats_tts', bot_language)}\n"
|
||||
|
||||
# Check if image generation is enabled and, if so, generate the image statistics for the month
|
||||
text_month = (
|
||||
f"*{localized_text('usage_month', bot_language)}:*\n"
|
||||
f"{tokens_month} {localized_text('stats_tokens', bot_language)}\n"
|
||||
f"{text_month_images}" # Include the image statistics for the month if applicable
|
||||
f"{text_month_tts}"
|
||||
f"{transcribe_minutes_month} {localized_text('stats_transcribe', bot_language)[0]} "
|
||||
f"{transcribe_seconds_month} {localized_text('stats_transcribe', bot_language)[1]}\n"
|
||||
f"{localized_text('stats_total', bot_language)}{current_cost['cost_month']:.2f}"
|
||||
@@ -250,6 +264,52 @@ class ChatGPTTelegramBot:
|
||||
|
||||
await wrap_with_indicator(update, context, _generate, constants.ChatAction.UPLOAD_PHOTO)
|
||||
|
||||
async def tts(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
    """
    Generates speech for the given input using the TTS API and replies with a voice message.

    Returns without doing anything when TTS generation is disabled in the config
    or when ``check_allowed_and_within_budget`` returns a falsy value. When the
    command carries no text, replies with the localized ``tts_no_prompt`` message.
    """
    if not self.config['enable_tts_generation'] \
            or not await self.check_allowed_and_within_budget(update, context):
        return

    tts_query = message_text(update.message)
    if tts_query == '':
        await update.effective_message.reply_text(
            message_thread_id=get_thread_id(update),
            text=localized_text('tts_no_prompt', self.config['bot_language'])
        )
        return

    logging.info(f'New speech generation request received from user {update.message.from_user.name} '
                 f'(id: {update.message.from_user.id})')

    async def _generate():
        try:
            speech_file, text_length = await self.openai.generate_speech(text=tts_query)

            try:
                await update.effective_message.reply_voice(
                    reply_to_message_id=get_reply_to_message_id(self.config, update),
                    voice=speech_file
                )
            finally:
                # Release the temporary audio file even when sending the voice message fails
                speech_file.close()

            # add tts request to users usage tracker
            user_id = update.message.from_user.id
            self.usage[user_id].add_tts_request(text_length, self.config['tts_model'], self.config['tts_prices'])
            # add guest tts request to guest usage tracker
            if str(user_id) not in self.config['allowed_user_ids'].split(',') and 'guests' in self.usage:
                self.usage["guests"].add_tts_request(text_length, self.config['tts_model'], self.config['tts_prices'])

        except Exception as e:
            logging.exception(e)
            await update.effective_message.reply_text(
                message_thread_id=get_thread_id(update),
                reply_to_message_id=get_reply_to_message_id(self.config, update),
                text=f"{localized_text('tts_fail', self.config['bot_language'])}: {str(e)}",
                parse_mode=constants.ParseMode.MARKDOWN
            )

    await wrap_with_indicator(update, context, _generate, constants.ChatAction.UPLOAD_VOICE)
|
||||
|
||||
async def transcribe(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||
"""
|
||||
Transcribe audio messages.
|
||||
@@ -786,6 +846,7 @@ class ChatGPTTelegramBot:
|
||||
application.add_handler(CommandHandler('reset', self.reset))
|
||||
application.add_handler(CommandHandler('help', self.help))
|
||||
application.add_handler(CommandHandler('image', self.image))
|
||||
application.add_handler(CommandHandler('tts', self.tts))
|
||||
application.add_handler(CommandHandler('start', self.help))
|
||||
application.add_handler(CommandHandler('stats', self.stats))
|
||||
application.add_handler(CommandHandler('resend', self.resend))
|
||||
|
||||
@@ -56,6 +56,8 @@ class UsageTracker:
|
||||
if os.path.isfile(self.user_file):
|
||||
with open(self.user_file, "r") as file:
|
||||
self.usage = json.load(file)
|
||||
if 'tts_characters' not in self.usage['usage_history']:
|
||||
self.usage['usage_history']['tts_characters'] = {}
|
||||
else:
|
||||
# ensure directory exists
|
||||
pathlib.Path(logs_dir).mkdir(exist_ok=True)
|
||||
@@ -63,7 +65,7 @@ class UsageTracker:
|
||||
self.usage = {
|
||||
"user_name": user_name,
|
||||
"current_cost": {"day": 0.0, "month": 0.0, "all_time": 0.0, "last_update": str(date.today())},
|
||||
"usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}}
|
||||
"usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}, "tts_characters": {}}
|
||||
}
|
||||
|
||||
# token usage functions:
|
||||
@@ -151,6 +153,57 @@ class UsageTracker:
|
||||
usage_month += sum(images)
|
||||
return usage_day, usage_month
|
||||
|
||||
# tts usage functions:
|
||||
|
||||
def add_tts_request(self, text_length, tts_model, tts_prices):
    """Record a text-to-speech request in the usage history and add its cost.

    :param text_length: number of characters that were converted to speech
    :param tts_model: model used for the request, either 'tts-1' or 'tts-1-hd'
    :param tts_prices: prices per 1000 characters, ordered as ['tts-1', 'tts-1-hd']
    :raises ValueError: if tts_model is not one of the known models
    """
    tts_models = ['tts-1', 'tts-1-hd']
    price = tts_prices[tts_models.index(tts_model)]
    # Keep sub-cent precision: rounding every request to two decimals would
    # book short texts (a few hundred characters) at a cost of 0.
    tts_price = round(text_length * price / 1000, 6)
    self.add_current_costs(tts_price)

    # update usage_history, creating the per-model and per-day entries on first use
    day_key = str(date.today())
    per_model = self.usage['usage_history'].setdefault('tts_characters', {})
    per_day = per_model.setdefault(tts_model, {})
    per_day[day_key] = per_day.get(day_key, 0) + text_length

    # write updated usage to user file
    with open(self.user_file, "w") as outfile:
        json.dump(self.usage, outfile)
|
||||
|
||||
def get_current_tts_usage(self):
    """Get length of speech generated for today and this month.

    Only the models 'tts-1' and 'tts-1-hd' are counted. A usage history
    without a 'tts_characters' entry yields zero for both values.

    :return: total amount of characters converted to speech per day and per month
    """
    tts_models = ('tts-1', 'tts-1-hd')
    day_key = str(date.today())
    month = day_key[:7]  # year-month as string
    history = self.usage["usage_history"].get("tts_characters", {})

    characters_day = 0
    characters_month = 0
    for tts_model in tts_models:
        per_day = history.get(tts_model, {})
        characters_day += per_day.get(day_key, 0)
        characters_month += sum(count for day, count in per_day.items() if day.startswith(month))
    return int(characters_day), int(characters_month)
|
||||
|
||||
|
||||
# transcription usage functions:
|
||||
|
||||
def add_transcription_seconds(self, seconds, minute_price=0.006):
|
||||
@@ -236,13 +289,14 @@ class UsageTracker:
|
||||
cost_all_time = self.usage["current_cost"].get("all_time", self.initialize_all_time_cost())
|
||||
return {"cost_today": cost_day, "cost_month": cost_month, "cost_all_time": cost_all_time}
|
||||
|
||||
def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006):
|
||||
def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006, tts_prices='0.015,0.030'):
|
||||
"""Get total USD amount of all requests in history
|
||||
|
||||
:param tokens_price: price per 1000 tokens, defaults to 0.002
|
||||
:param image_prices: prices for images of sizes ["256x256", "512x512", "1024x1024"],
|
||||
defaults to [0.016, 0.018, 0.02]
|
||||
:param minute_price: price per minute transcription, defaults to 0.006
|
||||
:param character_price: price per character tts per model ['tts-1', 'tts-1-hd'], defaults to [0.015, 0.030]
|
||||
:return: total cost of all requests
|
||||
"""
|
||||
total_tokens = sum(self.usage['usage_history']['chat_tokens'].values())
|
||||
@@ -255,5 +309,9 @@ class UsageTracker:
|
||||
total_transcription_seconds = sum(self.usage['usage_history']['transcription_seconds'].values())
|
||||
transcription_cost = round(total_transcription_seconds * minute_price / 60, 2)
|
||||
|
||||
all_time_cost = token_cost + transcription_cost + image_cost
|
||||
# Price each model's characters by model name. Pairing totals with prices by
# dict order would misprice a history that only contains 'tts-1-hd' or that
# lists the models in a different order than the price list.
tts_prices_list = [float(x) for x in tts_prices.split(',')]
tts_price_by_model = dict(zip(['tts-1', 'tts-1-hd'], tts_prices_list))
total_characters = {
    model: sum(per_day.values())
    for model, per_day in self.usage['usage_history']['tts_characters'].items()
}
tts_cost = round(sum(count * tts_price_by_model.get(model, 0) / 1000
                     for model, count in total_characters.items()), 2)
|
||||
|
||||
all_time_cost = token_cost + transcription_cost + image_cost + tts_cost
|
||||
return all_time_cost
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
"help_description":"Show help message",
|
||||
"reset_description":"Reset the conversation. Optionally pass high-level instructions (e.g. /reset You are a helpful assistant)",
|
||||
"image_description":"Generate image from prompt (e.g. /image cat)",
|
||||
"tts_description":"Generate speech from text (e.g. /tts my house)",
|
||||
"stats_description":"Get your current usage statistics",
|
||||
"resend_description":"Resend the latest message",
|
||||
"chat_description":"Chat with the bot!",
|
||||
@@ -14,6 +15,7 @@
|
||||
"usage_month":"Usage this month",
|
||||
"stats_tokens":"tokens",
|
||||
"stats_images":"images generated",
|
||||
"stats_tts":"characters converted to speech",
|
||||
"stats_transcribe":["minutes and", "seconds transcribed"],
|
||||
"stats_total":"💰 For a total amount of $",
|
||||
"stats_budget":"Your remaining budget",
|
||||
@@ -24,7 +26,9 @@
|
||||
"resend_failed":"You have nothing to resend",
|
||||
"reset_done":"Done!",
|
||||
"image_no_prompt":"Please provide a prompt! (e.g. /image cat)",
|
||||
"tts_no_prompt":"Please provide text! (e.g. /tts my house)",
|
||||
"image_fail":"Failed to generate image",
|
||||
"tts_fail":"Failed to generate speech",
|
||||
"media_download_fail":["Failed to download audio file", "Make sure the file is not too large. (max 20MB)"],
|
||||
"media_type_fail":"Unsupported file type",
|
||||
"transcript":"Transcript",
|
||||
|
||||
Reference in New Issue
Block a user