added support for vision

This commit is contained in:
gilcu3
2023-11-08 17:56:27 +01:00
parent 450b86d5a7
commit 9e7d8701cd
9 changed files with 245 additions and 8 deletions

View File

@@ -17,24 +17,30 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
# TOKEN_PRICE=0.002 # TOKEN_PRICE=0.002
# IMAGE_PRICES=0.016,0.018,0.02 # IMAGE_PRICES=0.016,0.018,0.02
# TRANSCRIPTION_PRICE=0.006 # TRANSCRIPTION_PRICE=0.006
# VISION_TOKEN_PRICE=0.01
# ENABLE_QUOTING=true # ENABLE_QUOTING=true
# ENABLE_IMAGE_GENERATION=true # ENABLE_IMAGE_GENERATION=true
# ENABLE_TRANSCRIPTION=true # ENABLE_TRANSCRIPTION=true
# ENABLE_VISION=true
# PROXY=http://localhost:8080 # PROXY=http://localhost:8080
# OPENAI_MODEL=gpt-3.5-turbo # OPENAI_MODEL=gpt-3.5-turbo
# ASSISTANT_PROMPT="You are a helpful assistant." # ASSISTANT_PROMPT="You are a helpful assistant."
# SHOW_USAGE=false # SHOW_USAGE=false
# STREAM=true # STREAM=true
# MAX_TOKENS=1200 # MAX_TOKENS=1200
# VISION_MAX_TOKENS=300
# MAX_HISTORY_SIZE=15 # MAX_HISTORY_SIZE=15
# MAX_CONVERSATION_AGE_MINUTES=180 # MAX_CONVERSATION_AGE_MINUTES=180
# VOICE_REPLY_WITH_TRANSCRIPT_ONLY=true # VOICE_REPLY_WITH_TRANSCRIPT_ONLY=true
# VOICE_REPLY_PROMPTS="Hi bot;Hey bot;Hi chat;Hey chat" # VOICE_REPLY_PROMPTS="Hi bot;Hey bot;Hi chat;Hey chat"
# VISION_PROMPT="What is in this image"
# N_CHOICES=1 # N_CHOICES=1
# TEMPERATURE=1.0 # TEMPERATURE=1.0
# PRESENCE_PENALTY=0.0 # PRESENCE_PENALTY=0.0
# FREQUENCY_PENALTY=0.0 # FREQUENCY_PENALTY=0.0
# IMAGE_SIZE=512x512 # IMAGE_SIZE=512x512
# VISION_DETAIL="low"
# GROUP_TRIGGER_KEYWORD="" # GROUP_TRIGGER_KEYWORD=""
# IGNORE_GROUP_TRANSCRIPTIONS=true # IGNORE_GROUP_TRANSCRIPTIONS=true
# IGNORE_GROUP_VISION=true
# BOT_LANGUAGE=en # BOT_LANGUAGE=en

View File

@@ -72,6 +72,7 @@ The following parameters are optional and can be set in the `.env` file:
| `TOKEN_PRICE` | $-price per 1000 tokens used to compute cost information in usage statistics. Source: https://openai.com/pricing | `0.002` | | `TOKEN_PRICE` | $-price per 1000 tokens used to compute cost information in usage statistics. Source: https://openai.com/pricing | `0.002` |
| `IMAGE_PRICES` | A comma-separated list with 3 elements of prices for the different image sizes: `256x256`, `512x512` and `1024x1024`. Source: https://openai.com/pricing | `0.016,0.018,0.02` | | `IMAGE_PRICES` | A comma-separated list with 3 elements of prices for the different image sizes: `256x256`, `512x512` and `1024x1024`. Source: https://openai.com/pricing | `0.016,0.018,0.02` |
| `TRANSCRIPTION_PRICE` | USD-price for one minute of audio transcription. Source: https://openai.com/pricing | `0.006` | | `TRANSCRIPTION_PRICE` | USD-price for one minute of audio transcription. Source: https://openai.com/pricing | `0.006` |
| `VISION_TOKEN_PRICE` | USD-price per 1K tokens of image interpretation. Source: https://openai.com/pricing | `0.01` |
Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/184) for possible budget configurations. Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/184) for possible budget configurations.
@@ -81,23 +82,28 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
| `ENABLE_QUOTING` | Whether to enable message quoting in private chats | `true` | | `ENABLE_QUOTING` | Whether to enable message quoting in private chats | `true` |
| `ENABLE_IMAGE_GENERATION` | Whether to enable image generation via the `/image` command | `true` | | `ENABLE_IMAGE_GENERATION` | Whether to enable image generation via the `/image` command | `true` |
| `ENABLE_TRANSCRIPTION` | Whether to enable transcriptions of audio and video messages | `true` | | `ENABLE_TRANSCRIPTION` | Whether to enable transcriptions of audio and video messages | `true` |
| `ENABLE_VISION` | Whether to enable vision capabilities in supported models | `true` |
| `PROXY` | Proxy to be used for OpenAI and Telegram bot (e.g. `http://localhost:8080`) | - | | `PROXY` | Proxy to be used for OpenAI and Telegram bot (e.g. `http://localhost:8080`) | - |
| `OPENAI_MODEL` | The OpenAI model to use for generating responses. You can find all available models [here](https://platform.openai.com/docs/models/) | `gpt-3.5-turbo` | | `OPENAI_MODEL` | The OpenAI model to use for generating responses. You can find all available models [here](https://platform.openai.com/docs/models/) | `gpt-3.5-turbo` |
| `ASSISTANT_PROMPT` | A system message that sets the tone and controls the behavior of the assistant | `You are a helpful assistant.` | | `ASSISTANT_PROMPT` | A system message that sets the tone and controls the behavior of the assistant | `You are a helpful assistant.` |
| `SHOW_USAGE` | Whether to show OpenAI token usage information after each response | `false` | | `SHOW_USAGE` | Whether to show OpenAI token usage information after each response | `false` |
| `STREAM` | Whether to stream responses. **Note**: incompatible, if enabled, with `N_CHOICES` higher than 1 | `true` | | `STREAM` | Whether to stream responses. **Note**: incompatible, if enabled, with `N_CHOICES` higher than 1 | `true` |
| `MAX_TOKENS` | Upper bound on how many tokens the ChatGPT API will return | `1200` for GPT-3, `2400` for GPT-4 | | `MAX_TOKENS` | Upper bound on how many tokens the ChatGPT API will return | `1200` for GPT-3, `2400` for GPT-4 |
| `VISION_MAX_TOKENS` | Upper bound on how many tokens vision models will return | `300` for gpt-4-vision-preview |
| `MAX_HISTORY_SIZE` | Max number of messages to keep in memory, after which the conversation will be summarised to avoid excessive token usage | `15` | | `MAX_HISTORY_SIZE` | Max number of messages to keep in memory, after which the conversation will be summarised to avoid excessive token usage | `15` |
| `MAX_CONVERSATION_AGE_MINUTES` | Maximum number of minutes a conversation should live since the last message, after which the conversation will be reset | `180` | | `MAX_CONVERSATION_AGE_MINUTES` | Maximum number of minutes a conversation should live since the last message, after which the conversation will be reset | `180` |
| `VOICE_REPLY_WITH_TRANSCRIPT_ONLY` | Whether to answer to voice messages with the transcript only or with a ChatGPT response of the transcript | `false` | | `VOICE_REPLY_WITH_TRANSCRIPT_ONLY` | Whether to answer to voice messages with the transcript only or with a ChatGPT response of the transcript | `false` |
| `VOICE_REPLY_PROMPTS` | A semicolon separated list of phrases (i.e. `Hi bot;Hello chat`). If the transcript starts with any of them, it will be treated as a prompt even if `VOICE_REPLY_WITH_TRANSCRIPT_ONLY` is set to `true` | - | | `VOICE_REPLY_PROMPTS` | A semicolon separated list of phrases (i.e. `Hi bot;Hello chat`). If the transcript starts with any of them, it will be treated as a prompt even if `VOICE_REPLY_WITH_TRANSCRIPT_ONLY` is set to `true` | - |
| `VISION_PROMPT` | A phrase (e.g. `What is in this image`). The vision models use it as the prompt to interpret a given image. If the image sent to the bot has a caption, the caption supersedes this parameter | `What is in this image` |
| `N_CHOICES` | Number of answers to generate for each input message. **Note**: setting this to a number higher than 1 will not work properly if `STREAM` is enabled | `1` | | `N_CHOICES` | Number of answers to generate for each input message. **Note**: setting this to a number higher than 1 will not work properly if `STREAM` is enabled | `1` |
| `TEMPERATURE` | Number between 0 and 2. Higher values will make the output more random | `1.0` | | `TEMPERATURE` | Number between 0 and 2. Higher values will make the output more random | `1.0` |
| `PRESENCE_PENALTY` | Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far | `0.0` | | `PRESENCE_PENALTY` | Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far | `0.0` |
| `FREQUENCY_PENALTY` | Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far | `0.0` | | `FREQUENCY_PENALTY` | Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far | `0.0` |
| `IMAGE_SIZE` | The DALL·E generated image size. Allowed values: `256x256`, `512x512` or `1024x1024` | `512x512` | | `IMAGE_SIZE` | The DALL·E generated image size. Allowed values: `256x256`, `512x512` or `1024x1024` | `512x512` |
| `VISION_DETAIL` | The detail parameter for vision models, explained in the [Vision Guide](https://platform.openai.com/docs/guides/vision). Allowed values: `low` or `high` | `low` |
| `GROUP_TRIGGER_KEYWORD` | If set, the bot in group chats will only respond to messages that start with this keyword | - | | `GROUP_TRIGGER_KEYWORD` | If set, the bot in group chats will only respond to messages that start with this keyword | - |
| `IGNORE_GROUP_TRANSCRIPTIONS` | If set to true, the bot will not process transcriptions in group chats | `true` | | `IGNORE_GROUP_TRANSCRIPTIONS` | If set to true, the bot will not process transcriptions in group chats | `true` |
| `IGNORE_GROUP_VISION` | If set to true, the bot will not process vision queries in group chats | `true` |
| `BOT_LANGUAGE` | Language of general bot messages. Currently available: `en`, `de`, `ru`, `tr`, `it`, `fi`, `es`, `id`, `nl`, `zh-cn`, `zh-tw`, `vi`, `fa`, `pt-br`, `uk`, `ms`. [Contribute with additional translations](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/219) | `en` | | `BOT_LANGUAGE` | Language of general bot messages. Currently available: `en`, `de`, `ru`, `tr`, `it`, `fi`, `es`, `id`, `nl`, `zh-cn`, `zh-tw`, `vi`, `fa`, `pt-br`, `uk`, `ms`. [Contribute with additional translations](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/219) | `en` |
| `WHISPER_PROMPT` | To improve the accuracy of Whisper's transcription service, especially for specific names or terms, you can set up a custom message. [Speech to text - Prompting](https://platform.openai.com/docs/guides/speech-to-text/prompting) | `-` | | `WHISPER_PROMPT` | To improve the accuracy of Whisper's transcription service, especially for specific names or terms, you can set up a custom message. [Speech to text - Prompting](https://platform.openai.com/docs/guides/speech-to-text/prompting) | `-` |

View File

@@ -50,6 +50,9 @@ def main():
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'), 'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true', 'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true',
'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''), 'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''),
'vision_prompt': os.environ.get('VISION_PROMPT', 'What is in this image'),
'vision_detail': os.environ.get('VISION_DETAIL', 'low'),
'vision_max_tokens': int(os.environ.get('VISION_MAX_TOKENS', '300')),
} }
if openai_config['enable_functions'] and not functions_available: if openai_config['enable_functions'] and not functions_available:
@@ -70,6 +73,7 @@ def main():
'enable_quoting': os.environ.get('ENABLE_QUOTING', 'true').lower() == 'true', 'enable_quoting': os.environ.get('ENABLE_QUOTING', 'true').lower() == 'true',
'enable_image_generation': os.environ.get('ENABLE_IMAGE_GENERATION', 'true').lower() == 'true', 'enable_image_generation': os.environ.get('ENABLE_IMAGE_GENERATION', 'true').lower() == 'true',
'enable_transcription': os.environ.get('ENABLE_TRANSCRIPTION', 'true').lower() == 'true', 'enable_transcription': os.environ.get('ENABLE_TRANSCRIPTION', 'true').lower() == 'true',
'enable_vision': os.environ.get('ENABLE_VISION', 'true').lower() == 'true',
'budget_period': os.environ.get('BUDGET_PERIOD', 'monthly').lower(), 'budget_period': os.environ.get('BUDGET_PERIOD', 'monthly').lower(),
'user_budgets': os.environ.get('USER_BUDGETS', os.environ.get('MONTHLY_USER_BUDGETS', '*')), 'user_budgets': os.environ.get('USER_BUDGETS', os.environ.get('MONTHLY_USER_BUDGETS', '*')),
'guest_budget': float(os.environ.get('GUEST_BUDGET', os.environ.get('MONTHLY_GUEST_BUDGET', '100.0'))), 'guest_budget': float(os.environ.get('GUEST_BUDGET', os.environ.get('MONTHLY_GUEST_BUDGET', '100.0'))),
@@ -78,9 +82,11 @@ def main():
'voice_reply_transcript': os.environ.get('VOICE_REPLY_WITH_TRANSCRIPT_ONLY', 'false').lower() == 'true', 'voice_reply_transcript': os.environ.get('VOICE_REPLY_WITH_TRANSCRIPT_ONLY', 'false').lower() == 'true',
'voice_reply_prompts': os.environ.get('VOICE_REPLY_PROMPTS', '').split(';'), 'voice_reply_prompts': os.environ.get('VOICE_REPLY_PROMPTS', '').split(';'),
'ignore_group_transcriptions': os.environ.get('IGNORE_GROUP_TRANSCRIPTIONS', 'true').lower() == 'true', 'ignore_group_transcriptions': os.environ.get('IGNORE_GROUP_TRANSCRIPTIONS', 'true').lower() == 'true',
'ignore_group_vision': os.environ.get('IGNORE_GROUP_VISION', 'true').lower() == 'true',
'group_trigger_keyword': os.environ.get('GROUP_TRIGGER_KEYWORD', ''), 'group_trigger_keyword': os.environ.get('GROUP_TRIGGER_KEYWORD', ''),
'token_price': float(os.environ.get('TOKEN_PRICE', 0.002)), 'token_price': float(os.environ.get('TOKEN_PRICE', 0.002)),
'image_prices': [float(i) for i in os.environ.get('IMAGE_PRICES', "0.016,0.018,0.02").split(",")], 'image_prices': [float(i) for i in os.environ.get('IMAGE_PRICES', "0.016,0.018,0.02").split(",")],
'vision_token_price': float(os.environ.get('VISION_TOKEN_PRICE', '0.01')),
'transcription_price': float(os.environ.get('TRANSCRIPTION_PRICE', 0.006)), 'transcription_price': float(os.environ.get('TRANSCRIPTION_PRICE', 0.006)),
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'), 'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
} }

View File

@@ -12,10 +12,11 @@ import json
import httpx import httpx
from datetime import date from datetime import date
from calendar import monthrange from calendar import monthrange
from PIL import Image
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type
from utils import is_direct_result from utils import is_direct_result, encode_image
from plugin_manager import PluginManager from plugin_manager import PluginManager
# Models can be found here: https://platform.openai.com/docs/models/overview # Models can be found here: https://platform.openai.com/docs/models/overview
@@ -23,7 +24,8 @@ GPT_3_MODELS = ("gpt-3.5-turbo", "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613")
GPT_3_16K_MODELS = ("gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613") GPT_3_16K_MODELS = ("gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613")
GPT_4_MODELS = ("gpt-4", "gpt-4-0314", "gpt-4-0613") GPT_4_MODELS = ("gpt-4", "gpt-4-0314", "gpt-4-0613")
GPT_4_32K_MODELS = ("gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613") GPT_4_32K_MODELS = ("gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613")
GPT_ALL_MODELS = GPT_3_MODELS + GPT_3_16K_MODELS + GPT_4_MODELS + GPT_4_32K_MODELS GPT_4_VISION_MODELS = ("gpt-4-vision-preview",)
GPT_ALL_MODELS = GPT_3_MODELS + GPT_3_16K_MODELS + GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS
def default_max_tokens(model: str) -> int: def default_max_tokens(model: str) -> int:
@@ -41,6 +43,8 @@ def default_max_tokens(model: str) -> int:
return base * 4 return base * 4
elif model in GPT_4_32K_MODELS: elif model in GPT_4_32K_MODELS:
return base * 8 return base * 8
elif model in GPT_4_VISION_MODELS:
return 4096
def are_functions_available(model: str) -> bool: def are_functions_available(model: str) -> bool:
@@ -347,6 +351,26 @@ class OpenAIHelper:
logging.exception(e) logging.exception(e)
raise Exception(f"⚠️ _{localized_text('error', self.config['bot_language'])}._ ⚠️\n{str(e)}") from e raise Exception(f"⚠️ _{localized_text('error', self.config['bot_language'])}._ ⚠️\n{str(e)}") from e
async def interpret_image(self, filename, prompt=None):
    """
    Interprets a given PNG image file using the Vision model.

    :param filename: path of the PNG image to send to the model
    :param prompt: question to ask about the image; the configured `vision_prompt`
        is used when this is None
    :return: a tuple of the model's answer and the token count computed for the image
    :raises openai.RateLimitError: re-raised unchanged
    :raises Exception: for any other failure, carrying a localized error message
    """
    try:
        # Count the tokens before calling the API: the count raises for an unsupported
        # model or detail setting, so a misconfiguration fails before a request is sent.
        tokens = self.__count_tokens_vision(filename)
        image = encode_image(filename)
        prompt = self.config['vision_prompt'] if prompt is None else prompt
        message = {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': prompt},
                {
                    'type': 'image_url',
                    'image_url': {
                        # The payload is a PNG file, so label the data URL accordingly.
                        'url': f'data:image/png;base64,{image}',
                        'detail': self.config['vision_detail'],
                    },
                },
            ],
        }
        response = await self.client.chat.completions.create(
            model=self.config['model'],
            messages=[message],
            max_tokens=self.config['vision_max_tokens'],
        )
        return response.choices[0].message.content, tokens

    except openai.RateLimitError:
        raise

    except openai.BadRequestError as e:
        raise Exception(f"⚠️ _{localized_text('openai_invalid', self.config['bot_language'])}._ ⚠️\n{str(e)}") from e

    except Exception as e:
        logging.exception(e)
        raise Exception(f"⚠️ _{localized_text('error', self.config['bot_language'])}._ ⚠️\n{str(e)}") from e
def reset_chat_history(self, chat_id, content=''): def reset_chat_history(self, chat_id, content=''):
""" """
Resets the conversation history. Resets the conversation history.
@@ -410,6 +434,8 @@ class OpenAIHelper:
return base * 2 return base * 2
if self.config['model'] in GPT_4_32K_MODELS: if self.config['model'] in GPT_4_32K_MODELS:
return base * 8 return base * 8
if self.config['model'] in GPT_4_VISION_MODELS:
return base * 31
raise NotImplementedError( raise NotImplementedError(
f"Max tokens for model {self.config['model']} is not implemented yet." f"Max tokens for model {self.config['model']} is not implemented yet."
) )
@@ -430,7 +456,7 @@ class OpenAIHelper:
if model in GPT_3_MODELS + GPT_3_16K_MODELS: if model in GPT_3_MODELS + GPT_3_16K_MODELS:
tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n
tokens_per_name = -1 # if there's a name, the role is omitted tokens_per_name = -1 # if there's a name, the role is omitted
elif model in GPT_4_MODELS + GPT_4_32K_MODELS: elif model in GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS:
tokens_per_message = 3 tokens_per_message = 3
tokens_per_name = 1 tokens_per_name = 1
else: else:
@@ -445,6 +471,35 @@ class OpenAIHelper:
num_tokens += 3 # every reply is primed with <|start|>assistant<|message|> num_tokens += 3 # every reply is primed with <|start|>assistant<|message|>
return num_tokens return num_tokens
def __count_tokens_vision(self, filename) -> int:
    """
    Counts the number of tokens for interpreting an image.

    :param filename: path of the image to interpret
    :return: the number of tokens required
    :raises NotImplementedError: if the configured model is not a vision model or the
        configured `vision_detail` is neither 'low' nor 'high'
    """
    # Only the dimensions are needed; the context manager closes the file handle right away.
    with Image.open(filename) as image:
        w, h = image.size
    model = self.config['model']
    if model not in GPT_4_VISION_MODELS:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}.""")

    # Normalize orientation so that w is the shorter side and h the longer one.
    if w > h:
        w, h = h, w
    # this computation follows https://platform.openai.com/docs/guides/vision and https://openai.com/pricing#gpt-4-turbo
    base_tokens = 85
    detail = self.config['vision_detail']
    if detail == 'low':
        return base_tokens
    elif detail == 'high':
        # Scale down (never up) so the short side fits in 768px and the long side in 2048px.
        f = max(w / 768, h / 2048)
        if f > 1:
            w, h = int(w / f), int(h / f)
        # Number of 512px tiles needed to cover each dimension (ceiling division).
        tw, th = (w + 511) // 512, (h + 511) // 512
        tiles = tw * th
        num_tokens = base_tokens + tiles * 170
        return num_tokens
    else:
        raise NotImplementedError(f"""unknown parameter detail={detail} for model {model}.""")
# No longer works as of July 21st 2023, as OpenAI has removed the billing API # No longer works as of July 21st 2023, as OpenAI has removed the billing API
# def get_billing_current_month(self): # def get_billing_current_month(self):
# """Gets billed usage for current month from OpenAI API. # """Gets billed usage for current month from OpenAI API.

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import asyncio import asyncio
import logging import logging
import os import os
import tempfile
from uuid import uuid4 from uuid import uuid4
from telegram import BotCommandScopeAllGroupChats, Update, constants from telegram import BotCommandScopeAllGroupChats, Update, constants
@@ -13,6 +14,7 @@ from telegram.ext import ApplicationBuilder, CommandHandler, MessageHandler, \
filters, InlineQueryHandler, CallbackQueryHandler, Application, ContextTypes, CallbackContext filters, InlineQueryHandler, CallbackQueryHandler, Application, ContextTypes, CallbackContext
from pydub import AudioSegment from pydub import AudioSegment
from PIL import Image
from utils import is_group_chat, get_thread_id, message_text, wrap_with_indicator, split_into_chunks, \ from utils import is_group_chat, get_thread_id, message_text, wrap_with_indicator, split_into_chunks, \
edit_message_with_retry, get_stream_cutoff_values, is_allowed, get_remaining_budget, is_admin, is_within_budget, \ edit_message_with_retry, get_stream_cutoff_values, is_allowed, get_remaining_budget, is_admin, is_within_budget, \
@@ -94,6 +96,7 @@ class ChatGPTTelegramBot:
images_today, images_month = self.usage[user_id].get_current_image_count() images_today, images_month = self.usage[user_id].get_current_image_count()
(transcribe_minutes_today, transcribe_seconds_today, transcribe_minutes_month, (transcribe_minutes_today, transcribe_seconds_today, transcribe_minutes_month,
transcribe_seconds_month) = self.usage[user_id].get_current_transcription_duration() transcribe_seconds_month) = self.usage[user_id].get_current_transcription_duration()
vision_today, vision_month = self.usage[user_id].get_current_vision_tokens()
current_cost = self.usage[user_id].get_current_cost() current_cost = self.usage[user_id].get_current_cost()
chat_id = update.effective_chat.id chat_id = update.effective_chat.id
@@ -113,10 +116,15 @@ class ChatGPTTelegramBot:
if self.config.get('enable_image_generation', False): if self.config.get('enable_image_generation', False):
text_today_images = f"{images_today} {localized_text('stats_images', bot_language)}\n" text_today_images = f"{images_today} {localized_text('stats_images', bot_language)}\n"
text_today_vision = ""
if self.config.get('enable_vision', False):
text_today_vision = f"{vision_today} {localized_text('stats_vision', bot_language)}\n"
text_today = ( text_today = (
f"*{localized_text('usage_today', bot_language)}:*\n" f"*{localized_text('usage_today', bot_language)}:*\n"
f"{tokens_today} {localized_text('stats_tokens', bot_language)}\n" f"{tokens_today} {localized_text('stats_tokens', bot_language)}\n"
f"{text_today_images}" # Include the image statistics for today if applicable f"{text_today_images}" # Include the image statistics for today if applicable
f"{text_today_vision}"
f"{transcribe_minutes_today} {localized_text('stats_transcribe', bot_language)[0]} " f"{transcribe_minutes_today} {localized_text('stats_transcribe', bot_language)[0]} "
f"{transcribe_seconds_today} {localized_text('stats_transcribe', bot_language)[1]}\n" f"{transcribe_seconds_today} {localized_text('stats_transcribe', bot_language)[1]}\n"
f"{localized_text('stats_total', bot_language)}{current_cost['cost_today']:.2f}\n" f"{localized_text('stats_total', bot_language)}{current_cost['cost_today']:.2f}\n"
@@ -127,11 +135,16 @@ class ChatGPTTelegramBot:
if self.config.get('enable_image_generation', False): if self.config.get('enable_image_generation', False):
text_month_images = f"{images_month} {localized_text('stats_images', bot_language)}\n" text_month_images = f"{images_month} {localized_text('stats_images', bot_language)}\n"
text_month_vision = ""
if self.config.get('enable_vision', False):
text_month_vision = f"{vision_month} {localized_text('stats_vision', bot_language)}\n"
# Check if image generation is enabled and, if so, generate the image statistics for the month # Check if image generation is enabled and, if so, generate the image statistics for the month
text_month = ( text_month = (
f"*{localized_text('usage_month', bot_language)}:*\n" f"*{localized_text('usage_month', bot_language)}:*\n"
f"{tokens_month} {localized_text('stats_tokens', bot_language)}\n" f"{tokens_month} {localized_text('stats_tokens', bot_language)}\n"
f"{text_month_images}" # Include the image statistics for the month if applicable f"{text_month_images}" # Include the image statistics for the month if applicable
f"{text_month_vision}"
f"{transcribe_minutes_month} {localized_text('stats_transcribe', bot_language)[0]} " f"{transcribe_minutes_month} {localized_text('stats_transcribe', bot_language)[0]} "
f"{transcribe_seconds_month} {localized_text('stats_transcribe', bot_language)[1]}\n" f"{transcribe_seconds_month} {localized_text('stats_transcribe', bot_language)[1]}\n"
f"{localized_text('stats_total', bot_language)}{current_cost['cost_month']:.2f}" f"{localized_text('stats_total', bot_language)}{current_cost['cost_month']:.2f}"
@@ -370,6 +383,97 @@ class ChatGPTTelegramBot:
await wrap_with_indicator(update, context, _execute, constants.ChatAction.TYPING) await wrap_with_indicator(update, context, _execute, constants.ChatAction.TYPING)
async def vision(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
    """
    Interpret image using vision model.

    Downloads the attached image, converts it to PNG, asks the OpenAI helper to interpret it
    (with the message caption as prompt, if there is one), records the vision tokens for the
    user and replies with the interpretation. Failures are logged and reported to the chat.
    """
    if not self.config['enable_vision'] or not await self.check_allowed_and_within_budget(update, context):
        return

    if is_group_chat(update) and self.config['ignore_group_vision']:
        logging.info('Vision coming from group chat, ignoring...')
        return

    # This handler is registered for photos and for image documents. Per the
    # python-telegram-bot docs, effective_attachment is a sequence for photos but a single
    # object for documents, so only index it in the former case.
    attachment = update.message.effective_attachment
    image = attachment[-1] if isinstance(attachment, (list, tuple)) else attachment
    prompt = update.message.caption

    async def _execute():
        bot_language = self.config['bot_language']
        # The context managers close both temporary files on every exit path,
        # including the early returns below.
        with tempfile.NamedTemporaryFile() as temp_file, tempfile.NamedTemporaryFile() as temp_file_png:
            try:
                media_file = await context.bot.get_file(image.file_id)
                await media_file.download_to_drive(temp_file.name)
            except Exception as e:
                logging.exception(e)
                await update.effective_message.reply_text(
                    message_thread_id=get_thread_id(update),
                    reply_to_message_id=get_reply_to_message_id(self.config, update),
                    text=(
                        f"{localized_text('media_download_fail', bot_language)[0]}: "
                        f"{str(e)}. {localized_text('media_download_fail', bot_language)[1]}"
                    ),
                    parse_mode=constants.ParseMode.MARKDOWN
                )
                return

            # convert jpg from telegram to png as understood by openai
            try:
                with Image.open(temp_file.name) as original_image:
                    original_image.save(temp_file_png.name, format='PNG')
                logging.info(f'New vision request received from user {update.message.from_user.name} '
                             f'(id: {update.message.from_user.id})')
            except Exception as e:
                logging.exception(e)
                await update.effective_message.reply_text(
                    message_thread_id=get_thread_id(update),
                    reply_to_message_id=get_reply_to_message_id(self.config, update),
                    text=localized_text('media_type_fail', bot_language)
                )
                # Without a converted image there is nothing to send to the model.
                return

            user_id = update.message.from_user.id
            if user_id not in self.usage:
                self.usage[user_id] = UsageTracker(user_id, update.message.from_user.name)

            try:
                interpretation, tokens = await self.openai.interpret_image(temp_file_png.name, prompt=prompt)

                vision_token_price = self.config['vision_token_price']
                self.usage[user_id].add_vision_tokens(tokens, vision_token_price)

                # Users outside the allow-list are additionally accounted to the shared guest tracker.
                allowed_user_ids = self.config['allowed_user_ids'].split(',')
                if str(user_id) not in allowed_user_ids and 'guests' in self.usage:
                    self.usage["guests"].add_vision_tokens(tokens, vision_token_price)

                await update.effective_message.reply_text(
                    message_thread_id=get_thread_id(update),
                    reply_to_message_id=get_reply_to_message_id(self.config, update),
                    text=interpretation,
                    parse_mode=constants.ParseMode.MARKDOWN
                )

            except Exception as e:
                logging.exception(e)
                await update.effective_message.reply_text(
                    message_thread_id=get_thread_id(update),
                    reply_to_message_id=get_reply_to_message_id(self.config, update),
                    text=f"{localized_text('vision_fail', bot_language)}: {str(e)}",
                    parse_mode=constants.ParseMode.MARKDOWN
                )

    await wrap_with_indicator(update, context, _execute, constants.ChatAction.TYPING)
async def prompt(self, update: Update, context: ContextTypes.DEFAULT_TYPE): async def prompt(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
""" """
React to incoming messages and respond accordingly. React to incoming messages and respond accordingly.
@@ -792,6 +896,9 @@ class ChatGPTTelegramBot:
application.add_handler(CommandHandler( application.add_handler(CommandHandler(
'chat', self.prompt, filters=filters.ChatType.GROUP | filters.ChatType.SUPERGROUP) 'chat', self.prompt, filters=filters.ChatType.GROUP | filters.ChatType.SUPERGROUP)
) )
application.add_handler(MessageHandler(
filters.PHOTO | filters.Document.IMAGE,
self.vision))
application.add_handler(MessageHandler( application.add_handler(MessageHandler(
filters.AUDIO | filters.VOICE | filters.Document.AUDIO | filters.AUDIO | filters.VOICE | filters.Document.AUDIO |
filters.VIDEO | filters.VIDEO_NOTE | filters.Document.VIDEO, filters.VIDEO | filters.VIDEO_NOTE | filters.Document.VIDEO,

View File

@@ -56,6 +56,8 @@ class UsageTracker:
if os.path.isfile(self.user_file): if os.path.isfile(self.user_file):
with open(self.user_file, "r") as file: with open(self.user_file, "r") as file:
self.usage = json.load(file) self.usage = json.load(file)
if 'vision_tokens' not in self.usage['usage_history']:
self.usage['usage_history']['vision_tokens'] = {}
else: else:
# ensure directory exists # ensure directory exists
pathlib.Path(logs_dir).mkdir(exist_ok=True) pathlib.Path(logs_dir).mkdir(exist_ok=True)
@@ -63,7 +65,7 @@ class UsageTracker:
self.usage = { self.usage = {
"user_name": user_name, "user_name": user_name,
"current_cost": {"day": 0.0, "month": 0.0, "all_time": 0.0, "last_update": str(date.today())}, "current_cost": {"day": 0.0, "month": 0.0, "all_time": 0.0, "last_update": str(date.today())},
"usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}} "usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}, "vision_tokens":{}}
} }
# token usage functions: # token usage functions:
@@ -151,6 +153,47 @@ class UsageTracker:
usage_month += sum(images) usage_month += sum(images)
return usage_day, usage_month return usage_day, usage_month
# vision usage functions
def add_vision_tokens(self, tokens, vision_token_price=0.01):
    """
    Adds requested vision tokens to a users usage history and updates current cost.

    :param tokens: total tokens used in last request
    :param vision_token_price: price per 1K vision tokens, defaults to 0.01
    """
    today = str(date.today())
    # Keep sub-cent precision: a single request usually costs less than one cent
    # (e.g. 85 tokens at 0.01 per 1K), so rounding to 2 decimals would record it as 0.
    token_price = round(tokens * vision_token_price / 1000, 6)
    self.add_current_costs(token_price)

    # update usage_history: add the tokens to today's entry, creating it if needed
    vision_history = self.usage["usage_history"]["vision_tokens"]
    vision_history[today] = vision_history.get(today, 0) + tokens

    # write updated token usage to user file
    with open(self.user_file, "w") as outfile:
        json.dump(self.usage, outfile)
def get_current_vision_tokens(self):
    """Get vision tokens for today and this month.

    :return: total amount of vision tokens per day and per month
    """
    history = self.usage["usage_history"]["vision_tokens"]
    current_day = str(date.today())
    current_month = current_day[:7]  # year-month as string

    tokens_day = history.get(current_day, 0)
    # History keys are date strings, so a prefix match selects the entries of this month.
    tokens_month = sum(
        count for day, count in history.items() if day.startswith(current_month)
    )
    return tokens_day, tokens_month
# transcription usage functions: # transcription usage functions:
def add_transcription_seconds(self, seconds, minute_price=0.006): def add_transcription_seconds(self, seconds, minute_price=0.006):
@@ -236,13 +279,14 @@ class UsageTracker:
cost_all_time = self.usage["current_cost"].get("all_time", self.initialize_all_time_cost()) cost_all_time = self.usage["current_cost"].get("all_time", self.initialize_all_time_cost())
return {"cost_today": cost_day, "cost_month": cost_month, "cost_all_time": cost_all_time} return {"cost_today": cost_day, "cost_month": cost_month, "cost_all_time": cost_all_time}
def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006): def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006, vision_token_price=0.01):
"""Get total USD amount of all requests in history """Get total USD amount of all requests in history
:param tokens_price: price per 1000 tokens, defaults to 0.002 :param tokens_price: price per 1000 tokens, defaults to 0.002
:param image_prices: prices for images of sizes ["256x256", "512x512", "1024x1024"], :param image_prices: prices for images of sizes ["256x256", "512x512", "1024x1024"],
defaults to [0.016, 0.018, 0.02] defaults to [0.016, 0.018, 0.02]
:param minute_price: price per minute transcription, defaults to 0.006 :param minute_price: price per minute transcription, defaults to 0.006
:param vision_token_price: price per 1k vision token interpretation, defaults to 0.01
:return: total cost of all requests :return: total cost of all requests
""" """
total_tokens = sum(self.usage['usage_history']['chat_tokens'].values()) total_tokens = sum(self.usage['usage_history']['chat_tokens'].values())
@@ -255,5 +299,8 @@ class UsageTracker:
total_transcription_seconds = sum(self.usage['usage_history']['transcription_seconds'].values()) total_transcription_seconds = sum(self.usage['usage_history']['transcription_seconds'].values())
transcription_cost = round(total_transcription_seconds * minute_price / 60, 2) transcription_cost = round(total_transcription_seconds * minute_price / 60, 2)
all_time_cost = token_cost + transcription_cost + image_cost total_vision_tokens = sum(self.usage['usage_history']['vision_tokens'].values())
vision_cost = round(total_vision_tokens * vision_token_price / 1000, 2)
all_time_cost = token_cost + transcription_cost + image_cost + vision_cost
return all_time_cost return all_time_cost

View File

@@ -5,6 +5,7 @@ import itertools
import json import json
import logging import logging
import os import os
import base64
import telegram import telegram
from telegram import Message, MessageEntity, Update, ChatMember, constants from telegram import Message, MessageEntity, Update, ChatMember, constants
@@ -377,3 +378,9 @@ def cleanup_intermediate_files(response: any):
if format == 'path': if format == 'path':
if os.path.exists(value): if os.path.exists(value):
os.remove(value) os.remove(value)
# Function to encode the image
def encode_image(image_path):
    """Return the content of the file at image_path as a base64-encoded string."""
    with open(image_path, "rb") as stream:
        raw_bytes = stream.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

View File

@@ -11,3 +11,4 @@ spotipy~=2.23.0
pytube~=15.0.0 pytube~=15.0.0
gtts~=2.3.2 gtts~=2.3.2
whois~=0.9.27 whois~=0.9.27
Pillow~=10.1.0

View File

@@ -14,6 +14,7 @@
"usage_month":"Usage this month", "usage_month":"Usage this month",
"stats_tokens":"tokens", "stats_tokens":"tokens",
"stats_images":"images generated", "stats_images":"images generated",
"stats_vision":"image tokens interpreted",
"stats_transcribe":["minutes and", "seconds transcribed"], "stats_transcribe":["minutes and", "seconds transcribed"],
"stats_total":"💰 For a total amount of $", "stats_total":"💰 For a total amount of $",
"stats_budget":"Your remaining budget", "stats_budget":"Your remaining budget",
@@ -25,6 +26,7 @@
"reset_done":"Done!", "reset_done":"Done!",
"image_no_prompt":"Please provide a prompt! (e.g. /image cat)", "image_no_prompt":"Please provide a prompt! (e.g. /image cat)",
"image_fail":"Failed to generate image", "image_fail":"Failed to generate image",
"vision_fail":"Failed to interpret image",
"media_download_fail":["Failed to download audio file", "Make sure the file is not too large. (max 20MB)"], "media_download_fail":["Failed to download audio file", "Make sure the file is not too large. (max 20MB)"],
"media_type_fail":"Unsupported file type", "media_type_fail":"Unsupported file type",
"transcript":"Transcript", "transcript":"Transcript",