mirror of
https://github.com/aljazceru/chatgpt-telegram-bot.git
synced 2025-12-23 07:35:06 +01:00
added support for vision
This commit is contained in:
@@ -17,24 +17,30 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
|
|||||||
# TOKEN_PRICE=0.002
|
# TOKEN_PRICE=0.002
|
||||||
# IMAGE_PRICES=0.016,0.018,0.02
|
# IMAGE_PRICES=0.016,0.018,0.02
|
||||||
# TRANSCRIPTION_PRICE=0.006
|
# TRANSCRIPTION_PRICE=0.006
|
||||||
|
# VISION_TOKEN_PRICE=0.01
|
||||||
# ENABLE_QUOTING=true
|
# ENABLE_QUOTING=true
|
||||||
# ENABLE_IMAGE_GENERATION=true
|
# ENABLE_IMAGE_GENERATION=true
|
||||||
# ENABLE_TRANSCRIPTION=true
|
# ENABLE_TRANSCRIPTION=true
|
||||||
|
# ENABLE_VISION=true
|
||||||
# PROXY=http://localhost:8080
|
# PROXY=http://localhost:8080
|
||||||
# OPENAI_MODEL=gpt-3.5-turbo
|
# OPENAI_MODEL=gpt-3.5-turbo
|
||||||
# ASSISTANT_PROMPT="You are a helpful assistant."
|
# ASSISTANT_PROMPT="You are a helpful assistant."
|
||||||
# SHOW_USAGE=false
|
# SHOW_USAGE=false
|
||||||
# STREAM=true
|
# STREAM=true
|
||||||
# MAX_TOKENS=1200
|
# MAX_TOKENS=1200
|
||||||
|
# VISION_MAX_TOKENS=300
|
||||||
# MAX_HISTORY_SIZE=15
|
# MAX_HISTORY_SIZE=15
|
||||||
# MAX_CONVERSATION_AGE_MINUTES=180
|
# MAX_CONVERSATION_AGE_MINUTES=180
|
||||||
# VOICE_REPLY_WITH_TRANSCRIPT_ONLY=true
|
# VOICE_REPLY_WITH_TRANSCRIPT_ONLY=true
|
||||||
# VOICE_REPLY_PROMPTS="Hi bot;Hey bot;Hi chat;Hey chat"
|
# VOICE_REPLY_PROMPTS="Hi bot;Hey bot;Hi chat;Hey chat"
|
||||||
|
# VISION_PROMPT="What is in this image"
|
||||||
# N_CHOICES=1
|
# N_CHOICES=1
|
||||||
# TEMPERATURE=1.0
|
# TEMPERATURE=1.0
|
||||||
# PRESENCE_PENALTY=0.0
|
# PRESENCE_PENALTY=0.0
|
||||||
# FREQUENCY_PENALTY=0.0
|
# FREQUENCY_PENALTY=0.0
|
||||||
# IMAGE_SIZE=512x512
|
# IMAGE_SIZE=512x512
|
||||||
|
# VISION_DETAIL="low"
|
||||||
# GROUP_TRIGGER_KEYWORD=""
|
# GROUP_TRIGGER_KEYWORD=""
|
||||||
# IGNORE_GROUP_TRANSCRIPTIONS=true
|
# IGNORE_GROUP_TRANSCRIPTIONS=true
|
||||||
|
# IGNORE_GROUP_VISION=true
|
||||||
# BOT_LANGUAGE=en
|
# BOT_LANGUAGE=en
|
||||||
@@ -72,6 +72,7 @@ The following parameters are optional and can be set in the `.env` file:
|
|||||||
| `TOKEN_PRICE` | $-price per 1000 tokens used to compute cost information in usage statistics. Source: https://openai.com/pricing | `0.002` |
|
| `TOKEN_PRICE` | $-price per 1000 tokens used to compute cost information in usage statistics. Source: https://openai.com/pricing | `0.002` |
|
||||||
| `IMAGE_PRICES` | A comma-separated list with 3 elements of prices for the different image sizes: `256x256`, `512x512` and `1024x1024`. Source: https://openai.com/pricing | `0.016,0.018,0.02` |
|
| `IMAGE_PRICES` | A comma-separated list with 3 elements of prices for the different image sizes: `256x256`, `512x512` and `1024x1024`. Source: https://openai.com/pricing | `0.016,0.018,0.02` |
|
||||||
| `TRANSCRIPTION_PRICE` | USD-price for one minute of audio transcription. Source: https://openai.com/pricing | `0.006` |
|
| `TRANSCRIPTION_PRICE` | USD-price for one minute of audio transcription. Source: https://openai.com/pricing | `0.006` |
|
||||||
|
| `VISION_TOKEN_PRICE` | USD-price per 1K tokens of image interpretation. Source: https://openai.com/pricing | `0.01` |
|
||||||
|
|
||||||
Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/184) for possible budget configurations.
|
Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/184) for possible budget configurations.
|
||||||
|
|
||||||
@@ -81,23 +82,28 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
|
|||||||
| `ENABLE_QUOTING` | Whether to enable message quoting in private chats | `true` |
|
| `ENABLE_QUOTING` | Whether to enable message quoting in private chats | `true` |
|
||||||
| `ENABLE_IMAGE_GENERATION` | Whether to enable image generation via the `/image` command | `true` |
|
| `ENABLE_IMAGE_GENERATION` | Whether to enable image generation via the `/image` command | `true` |
|
||||||
| `ENABLE_TRANSCRIPTION` | Whether to enable transcriptions of audio and video messages | `true` |
|
| `ENABLE_TRANSCRIPTION` | Whether to enable transcriptions of audio and video messages | `true` |
|
||||||
|
| `ENABLE_VISION` | Whether to enable vision capabilities in supported models | `true` |
|
||||||
| `PROXY` | Proxy to be used for OpenAI and Telegram bot (e.g. `http://localhost:8080`) | - |
|
| `PROXY` | Proxy to be used for OpenAI and Telegram bot (e.g. `http://localhost:8080`) | - |
|
||||||
| `OPENAI_MODEL` | The OpenAI model to use for generating responses. You can find all available models [here](https://platform.openai.com/docs/models/) | `gpt-3.5-turbo` |
|
| `OPENAI_MODEL` | The OpenAI model to use for generating responses. You can find all available models [here](https://platform.openai.com/docs/models/) | `gpt-3.5-turbo` |
|
||||||
| `ASSISTANT_PROMPT` | A system message that sets the tone and controls the behavior of the assistant | `You are a helpful assistant.` |
|
| `ASSISTANT_PROMPT` | A system message that sets the tone and controls the behavior of the assistant | `You are a helpful assistant.` |
|
||||||
| `SHOW_USAGE` | Whether to show OpenAI token usage information after each response | `false` |
|
| `SHOW_USAGE` | Whether to show OpenAI token usage information after each response | `false` |
|
||||||
| `STREAM` | Whether to stream responses. **Note**: incompatible, if enabled, with `N_CHOICES` higher than 1 | `true` |
|
| `STREAM` | Whether to stream responses. **Note**: incompatible, if enabled, with `N_CHOICES` higher than 1 | `true` |
|
||||||
| `MAX_TOKENS` | Upper bound on how many tokens the ChatGPT API will return | `1200` for GPT-3, `2400` for GPT-4 |
|
| `MAX_TOKENS` | Upper bound on how many tokens the ChatGPT API will return | `1200` for GPT-3, `2400` for GPT-4 |
|
||||||
|
| `VISION_MAX_TOKENS` | Upper bound on how many tokens vision models will return | `300` for gpt-4-vision-preview |
|
||||||
| `MAX_HISTORY_SIZE` | Max number of messages to keep in memory, after which the conversation will be summarised to avoid excessive token usage | `15` |
|
| `MAX_HISTORY_SIZE` | Max number of messages to keep in memory, after which the conversation will be summarised to avoid excessive token usage | `15` |
|
||||||
| `MAX_CONVERSATION_AGE_MINUTES` | Maximum number of minutes a conversation should live since the last message, after which the conversation will be reset | `180` |
|
| `MAX_CONVERSATION_AGE_MINUTES` | Maximum number of minutes a conversation should live since the last message, after which the conversation will be reset | `180` |
|
||||||
| `VOICE_REPLY_WITH_TRANSCRIPT_ONLY` | Whether to answer to voice messages with the transcript only or with a ChatGPT response of the transcript | `false` |
|
| `VOICE_REPLY_WITH_TRANSCRIPT_ONLY` | Whether to answer to voice messages with the transcript only or with a ChatGPT response of the transcript | `false` |
|
||||||
| `VOICE_REPLY_PROMPTS` | A semicolon separated list of phrases (i.e. `Hi bot;Hello chat`). If the transcript starts with any of them, it will be treated as a prompt even if `VOICE_REPLY_WITH_TRANSCRIPT_ONLY` is set to `true` | - |
|
| `VOICE_REPLY_PROMPTS` | A semicolon separated list of phrases (i.e. `Hi bot;Hello chat`). If the transcript starts with any of them, it will be treated as a prompt even if `VOICE_REPLY_WITH_TRANSCRIPT_ONLY` is set to `true` | - |
|
||||||
|
| `VISION_PROMPT` | A phrase (e.g. `What is in this image`). The vision models use it as the prompt to interpret a given image. If the image sent to the bot has a caption, the caption supersedes this parameter | `What is in this image` |
|
||||||
| `N_CHOICES` | Number of answers to generate for each input message. **Note**: setting this to a number higher than 1 will not work properly if `STREAM` is enabled | `1` |
|
| `N_CHOICES` | Number of answers to generate for each input message. **Note**: setting this to a number higher than 1 will not work properly if `STREAM` is enabled | `1` |
|
||||||
| `TEMPERATURE` | Number between 0 and 2. Higher values will make the output more random | `1.0` |
|
| `TEMPERATURE` | Number between 0 and 2. Higher values will make the output more random | `1.0` |
|
||||||
| `PRESENCE_PENALTY` | Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far | `0.0` |
|
| `PRESENCE_PENALTY` | Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far | `0.0` |
|
||||||
| `FREQUENCY_PENALTY` | Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far | `0.0` |
|
| `FREQUENCY_PENALTY` | Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far | `0.0` |
|
||||||
| `IMAGE_SIZE` | The DALL·E generated image size. Allowed values: `256x256`, `512x512` or `1024x1024` | `512x512` |
|
| `IMAGE_SIZE` | The DALL·E generated image size. Allowed values: `256x256`, `512x512` or `1024x1024` | `512x512` |
|
||||||
|
| `VISION_DETAIL` | The detail parameter for vision models, explained in the [Vision Guide](https://platform.openai.com/docs/guides/vision). Allowed values: `low` or `high` | `low` |
|
||||||
| `GROUP_TRIGGER_KEYWORD` | If set, the bot in group chats will only respond to messages that start with this keyword | - |
|
| `GROUP_TRIGGER_KEYWORD` | If set, the bot in group chats will only respond to messages that start with this keyword | - |
|
||||||
| `IGNORE_GROUP_TRANSCRIPTIONS` | If set to true, the bot will not process transcriptions in group chats | `true` |
|
| `IGNORE_GROUP_TRANSCRIPTIONS` | If set to true, the bot will not process transcriptions in group chats | `true` |
|
||||||
|
| `IGNORE_GROUP_VISION` | If set to true, the bot will not process vision queries in group chats | `true` |
|
||||||
| `BOT_LANGUAGE` | Language of general bot messages. Currently available: `en`, `de`, `ru`, `tr`, `it`, `fi`, `es`, `id`, `nl`, `zh-cn`, `zh-tw`, `vi`, `fa`, `pt-br`, `uk`, `ms`. [Contribute with additional translations](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/219) | `en` |
|
| `BOT_LANGUAGE` | Language of general bot messages. Currently available: `en`, `de`, `ru`, `tr`, `it`, `fi`, `es`, `id`, `nl`, `zh-cn`, `zh-tw`, `vi`, `fa`, `pt-br`, `uk`, `ms`. [Contribute with additional translations](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/219) | `en` |
|
||||||
| `WHISPER_PROMPT` | To improve the accuracy of Whisper's transcription service, especially for specific names or terms, you can set up a custom message. [Speech to text - Prompting](https://platform.openai.com/docs/guides/speech-to-text/prompting) | `-` |
|
| `WHISPER_PROMPT` | To improve the accuracy of Whisper's transcription service, especially for specific names or terms, you can set up a custom message. [Speech to text - Prompting](https://platform.openai.com/docs/guides/speech-to-text/prompting) | `-` |
|
||||||
|
|
||||||
|
|||||||
@@ -50,6 +50,9 @@ def main():
|
|||||||
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
|
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
|
||||||
'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true',
|
'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true',
|
||||||
'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''),
|
'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''),
|
||||||
|
'vision_prompt': os.environ.get('VISION_PROMPT', 'What is in this image'),
|
||||||
|
'vision_detail': os.environ.get('VISION_DETAIL', 'low'),
|
||||||
|
'vision_max_tokens': int(os.environ.get('VISION_MAX_TOKENS', '300')),
|
||||||
}
|
}
|
||||||
|
|
||||||
if openai_config['enable_functions'] and not functions_available:
|
if openai_config['enable_functions'] and not functions_available:
|
||||||
@@ -70,6 +73,7 @@ def main():
|
|||||||
'enable_quoting': os.environ.get('ENABLE_QUOTING', 'true').lower() == 'true',
|
'enable_quoting': os.environ.get('ENABLE_QUOTING', 'true').lower() == 'true',
|
||||||
'enable_image_generation': os.environ.get('ENABLE_IMAGE_GENERATION', 'true').lower() == 'true',
|
'enable_image_generation': os.environ.get('ENABLE_IMAGE_GENERATION', 'true').lower() == 'true',
|
||||||
'enable_transcription': os.environ.get('ENABLE_TRANSCRIPTION', 'true').lower() == 'true',
|
'enable_transcription': os.environ.get('ENABLE_TRANSCRIPTION', 'true').lower() == 'true',
|
||||||
|
'enable_vision': os.environ.get('ENABLE_VISION', 'true').lower() == 'true',
|
||||||
'budget_period': os.environ.get('BUDGET_PERIOD', 'monthly').lower(),
|
'budget_period': os.environ.get('BUDGET_PERIOD', 'monthly').lower(),
|
||||||
'user_budgets': os.environ.get('USER_BUDGETS', os.environ.get('MONTHLY_USER_BUDGETS', '*')),
|
'user_budgets': os.environ.get('USER_BUDGETS', os.environ.get('MONTHLY_USER_BUDGETS', '*')),
|
||||||
'guest_budget': float(os.environ.get('GUEST_BUDGET', os.environ.get('MONTHLY_GUEST_BUDGET', '100.0'))),
|
'guest_budget': float(os.environ.get('GUEST_BUDGET', os.environ.get('MONTHLY_GUEST_BUDGET', '100.0'))),
|
||||||
@@ -78,9 +82,11 @@ def main():
|
|||||||
'voice_reply_transcript': os.environ.get('VOICE_REPLY_WITH_TRANSCRIPT_ONLY', 'false').lower() == 'true',
|
'voice_reply_transcript': os.environ.get('VOICE_REPLY_WITH_TRANSCRIPT_ONLY', 'false').lower() == 'true',
|
||||||
'voice_reply_prompts': os.environ.get('VOICE_REPLY_PROMPTS', '').split(';'),
|
'voice_reply_prompts': os.environ.get('VOICE_REPLY_PROMPTS', '').split(';'),
|
||||||
'ignore_group_transcriptions': os.environ.get('IGNORE_GROUP_TRANSCRIPTIONS', 'true').lower() == 'true',
|
'ignore_group_transcriptions': os.environ.get('IGNORE_GROUP_TRANSCRIPTIONS', 'true').lower() == 'true',
|
||||||
|
'ignore_group_vision': os.environ.get('IGNORE_GROUP_VISION', 'true').lower() == 'true',
|
||||||
'group_trigger_keyword': os.environ.get('GROUP_TRIGGER_KEYWORD', ''),
|
'group_trigger_keyword': os.environ.get('GROUP_TRIGGER_KEYWORD', ''),
|
||||||
'token_price': float(os.environ.get('TOKEN_PRICE', 0.002)),
|
'token_price': float(os.environ.get('TOKEN_PRICE', 0.002)),
|
||||||
'image_prices': [float(i) for i in os.environ.get('IMAGE_PRICES', "0.016,0.018,0.02").split(",")],
|
'image_prices': [float(i) for i in os.environ.get('IMAGE_PRICES', "0.016,0.018,0.02").split(",")],
|
||||||
|
'vision_token_price': float(os.environ.get('VISION_TOKEN_PRICE', '0.01')),
|
||||||
'transcription_price': float(os.environ.get('TRANSCRIPTION_PRICE', 0.006)),
|
'transcription_price': float(os.environ.get('TRANSCRIPTION_PRICE', 0.006)),
|
||||||
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
|
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,10 +12,11 @@ import json
|
|||||||
import httpx
|
import httpx
|
||||||
from datetime import date
|
from datetime import date
|
||||||
from calendar import monthrange
|
from calendar import monthrange
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type
|
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type
|
||||||
|
|
||||||
from utils import is_direct_result
|
from utils import is_direct_result, encode_image
|
||||||
from plugin_manager import PluginManager
|
from plugin_manager import PluginManager
|
||||||
|
|
||||||
# Models can be found here: https://platform.openai.com/docs/models/overview
|
# Models can be found here: https://platform.openai.com/docs/models/overview
|
||||||
@@ -23,7 +24,8 @@ GPT_3_MODELS = ("gpt-3.5-turbo", "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613")
|
|||||||
GPT_3_16K_MODELS = ("gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613")
|
GPT_3_16K_MODELS = ("gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613")
|
||||||
GPT_4_MODELS = ("gpt-4", "gpt-4-0314", "gpt-4-0613")
|
GPT_4_MODELS = ("gpt-4", "gpt-4-0314", "gpt-4-0613")
|
||||||
GPT_4_32K_MODELS = ("gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613")
|
GPT_4_32K_MODELS = ("gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613")
|
||||||
GPT_ALL_MODELS = GPT_3_MODELS + GPT_3_16K_MODELS + GPT_4_MODELS + GPT_4_32K_MODELS
|
GPT_4_VISION_MODELS = ("gpt-4-vision-preview",)
|
||||||
|
GPT_ALL_MODELS = GPT_3_MODELS + GPT_3_16K_MODELS + GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS
|
||||||
|
|
||||||
|
|
||||||
def default_max_tokens(model: str) -> int:
|
def default_max_tokens(model: str) -> int:
|
||||||
@@ -41,6 +43,8 @@ def default_max_tokens(model: str) -> int:
|
|||||||
return base * 4
|
return base * 4
|
||||||
elif model in GPT_4_32K_MODELS:
|
elif model in GPT_4_32K_MODELS:
|
||||||
return base * 8
|
return base * 8
|
||||||
|
elif model in GPT_4_VISION_MODELS:
|
||||||
|
return 4096
|
||||||
|
|
||||||
|
|
||||||
def are_functions_available(model: str) -> bool:
|
def are_functions_available(model: str) -> bool:
|
||||||
@@ -347,6 +351,26 @@ class OpenAIHelper:
|
|||||||
logging.exception(e)
|
logging.exception(e)
|
||||||
raise Exception(f"⚠️ _{localized_text('error', self.config['bot_language'])}._ ⚠️\n{str(e)}") from e
|
raise Exception(f"⚠️ _{localized_text('error', self.config['bot_language'])}._ ⚠️\n{str(e)}") from e
|
||||||
|
|
||||||
|
async def interpret_image(self, filename, prompt=None):
    """
    Interprets a given PNG image file using the Vision model.

    :param filename: path of the PNG image file to interpret
    :param prompt: text sent along with the image; when None, the configured
        'vision_prompt' is used instead
    :return: a tuple of (text content of the first choice, token count computed
        for the image by __count_tokens_vision)
    :raises openai.RateLimitError: re-raised unchanged
    :raises Exception: any other failure, wrapped in a localized error message
    """
    try:
        image = encode_image(filename)
        if prompt is None:
            prompt = self.config['vision_prompt']

        message = {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': prompt},
                {
                    'type': 'image_url',
                    'image_url': {
                        # The documented input is a PNG file, so the data URL must
                        # declare image/png rather than image/jpeg.
                        'url': f'data:image/png;base64,{image}',
                        'detail': self.config['vision_detail'],
                    },
                },
            ],
        }
        response = await self.client.chat.completions.create(
            model=self.config['model'],
            messages=[message],
            max_tokens=self.config['vision_max_tokens'],
        )
        return response.choices[0].message.content, self.__count_tokens_vision(filename)

    except openai.RateLimitError:
        # Bare raise keeps the original traceback intact.
        raise
    except openai.BadRequestError as e:
        raise Exception(f"⚠️ _{localized_text('openai_invalid', self.config['bot_language'])}._ ⚠️\n{str(e)}") from e
    except Exception as e:
        logging.exception(e)
        raise Exception(f"⚠️ _{localized_text('error', self.config['bot_language'])}._ ⚠️\n{str(e)}") from e
|
||||||
|
|
||||||
def reset_chat_history(self, chat_id, content=''):
|
def reset_chat_history(self, chat_id, content=''):
|
||||||
"""
|
"""
|
||||||
Resets the conversation history.
|
Resets the conversation history.
|
||||||
@@ -410,6 +434,8 @@ class OpenAIHelper:
|
|||||||
return base * 2
|
return base * 2
|
||||||
if self.config['model'] in GPT_4_32K_MODELS:
|
if self.config['model'] in GPT_4_32K_MODELS:
|
||||||
return base * 8
|
return base * 8
|
||||||
|
if self.config['model'] in GPT_4_VISION_MODELS:
|
||||||
|
return base * 31
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
f"Max tokens for model {self.config['model']} is not implemented yet."
|
f"Max tokens for model {self.config['model']} is not implemented yet."
|
||||||
)
|
)
|
||||||
@@ -430,7 +456,7 @@ class OpenAIHelper:
|
|||||||
if model in GPT_3_MODELS + GPT_3_16K_MODELS:
|
if model in GPT_3_MODELS + GPT_3_16K_MODELS:
|
||||||
tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n
|
tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n
|
||||||
tokens_per_name = -1 # if there's a name, the role is omitted
|
tokens_per_name = -1 # if there's a name, the role is omitted
|
||||||
elif model in GPT_4_MODELS + GPT_4_32K_MODELS:
|
elif model in GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS:
|
||||||
tokens_per_message = 3
|
tokens_per_message = 3
|
||||||
tokens_per_name = 1
|
tokens_per_name = 1
|
||||||
else:
|
else:
|
||||||
@@ -445,6 +471,35 @@ class OpenAIHelper:
|
|||||||
num_tokens += 3 # every reply is primed with <|start|>assistant<|message|>
|
num_tokens += 3 # every reply is primed with <|start|>assistant<|message|>
|
||||||
return num_tokens
|
return num_tokens
|
||||||
|
|
||||||
|
def __count_tokens_vision(self, filename) -> int:
    """
    Counts the number of tokens for interpreting an image.
    :param filename: path of the image file to interpret
    :return: the number of tokens required
    :raises NotImplementedError: if the configured model is not a vision model
        or the configured 'vision_detail' is neither 'low' nor 'high'
    """
    # Only the dimensions are needed; the context manager releases the file
    # handle right after they are read.
    with Image.open(filename) as image:
        w, h = image.size

    model = self.config['model']
    if model not in GPT_4_VISION_MODELS:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}.""")

    # Normalize so that w is the shorter side and h the longer one.
    if w > h:
        w, h = h, w

    # this computation follows https://platform.openai.com/docs/guides/vision and https://openai.com/pricing#gpt-4-turbo
    base_tokens = 85
    detail = self.config['vision_detail']
    if detail == 'low':
        return base_tokens
    elif detail == 'high':
        # Shrink (never enlarge) so the short side fits 768 and the long side fits 2048.
        f = max(w / 768, h / 2048)
        if f > 1:
            w, h = int(w / f), int(h / f)
        # Number of 512px tiles needed per side, rounded up.
        tw, th = (w + 511) // 512, (h + 511) // 512
        tiles = tw * th
        num_tokens = base_tokens + tiles * 170
        return num_tokens
    else:
        raise NotImplementedError(f"""unknown parameter detail={detail} for model {model}.""")
|
||||||
|
|
||||||
# No longer works as of July 21st 2023, as OpenAI has removed the billing API
|
# No longer works as of July 21st 2023, as OpenAI has removed the billing API
|
||||||
# def get_billing_current_month(self):
|
# def get_billing_current_month(self):
|
||||||
# """Gets billed usage for current month from OpenAI API.
|
# """Gets billed usage for current month from OpenAI API.
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import tempfile
|
||||||
|
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
from telegram import BotCommandScopeAllGroupChats, Update, constants
|
from telegram import BotCommandScopeAllGroupChats, Update, constants
|
||||||
@@ -13,6 +14,7 @@ from telegram.ext import ApplicationBuilder, CommandHandler, MessageHandler, \
|
|||||||
filters, InlineQueryHandler, CallbackQueryHandler, Application, ContextTypes, CallbackContext
|
filters, InlineQueryHandler, CallbackQueryHandler, Application, ContextTypes, CallbackContext
|
||||||
|
|
||||||
from pydub import AudioSegment
|
from pydub import AudioSegment
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
from utils import is_group_chat, get_thread_id, message_text, wrap_with_indicator, split_into_chunks, \
|
from utils import is_group_chat, get_thread_id, message_text, wrap_with_indicator, split_into_chunks, \
|
||||||
edit_message_with_retry, get_stream_cutoff_values, is_allowed, get_remaining_budget, is_admin, is_within_budget, \
|
edit_message_with_retry, get_stream_cutoff_values, is_allowed, get_remaining_budget, is_admin, is_within_budget, \
|
||||||
@@ -94,6 +96,7 @@ class ChatGPTTelegramBot:
|
|||||||
images_today, images_month = self.usage[user_id].get_current_image_count()
|
images_today, images_month = self.usage[user_id].get_current_image_count()
|
||||||
(transcribe_minutes_today, transcribe_seconds_today, transcribe_minutes_month,
|
(transcribe_minutes_today, transcribe_seconds_today, transcribe_minutes_month,
|
||||||
transcribe_seconds_month) = self.usage[user_id].get_current_transcription_duration()
|
transcribe_seconds_month) = self.usage[user_id].get_current_transcription_duration()
|
||||||
|
vision_today, vision_month = self.usage[user_id].get_current_vision_tokens()
|
||||||
current_cost = self.usage[user_id].get_current_cost()
|
current_cost = self.usage[user_id].get_current_cost()
|
||||||
|
|
||||||
chat_id = update.effective_chat.id
|
chat_id = update.effective_chat.id
|
||||||
@@ -113,10 +116,15 @@ class ChatGPTTelegramBot:
|
|||||||
if self.config.get('enable_image_generation', False):
|
if self.config.get('enable_image_generation', False):
|
||||||
text_today_images = f"{images_today} {localized_text('stats_images', bot_language)}\n"
|
text_today_images = f"{images_today} {localized_text('stats_images', bot_language)}\n"
|
||||||
|
|
||||||
|
text_today_vision = ""
|
||||||
|
if self.config.get('enable_vision', False):
|
||||||
|
text_today_vision = f"{vision_today} {localized_text('stats_vision', bot_language)}\n"
|
||||||
|
|
||||||
text_today = (
|
text_today = (
|
||||||
f"*{localized_text('usage_today', bot_language)}:*\n"
|
f"*{localized_text('usage_today', bot_language)}:*\n"
|
||||||
f"{tokens_today} {localized_text('stats_tokens', bot_language)}\n"
|
f"{tokens_today} {localized_text('stats_tokens', bot_language)}\n"
|
||||||
f"{text_today_images}" # Include the image statistics for today if applicable
|
f"{text_today_images}" # Include the image statistics for today if applicable
|
||||||
|
f"{text_today_vision}"
|
||||||
f"{transcribe_minutes_today} {localized_text('stats_transcribe', bot_language)[0]} "
|
f"{transcribe_minutes_today} {localized_text('stats_transcribe', bot_language)[0]} "
|
||||||
f"{transcribe_seconds_today} {localized_text('stats_transcribe', bot_language)[1]}\n"
|
f"{transcribe_seconds_today} {localized_text('stats_transcribe', bot_language)[1]}\n"
|
||||||
f"{localized_text('stats_total', bot_language)}{current_cost['cost_today']:.2f}\n"
|
f"{localized_text('stats_total', bot_language)}{current_cost['cost_today']:.2f}\n"
|
||||||
@@ -127,11 +135,16 @@ class ChatGPTTelegramBot:
|
|||||||
if self.config.get('enable_image_generation', False):
|
if self.config.get('enable_image_generation', False):
|
||||||
text_month_images = f"{images_month} {localized_text('stats_images', bot_language)}\n"
|
text_month_images = f"{images_month} {localized_text('stats_images', bot_language)}\n"
|
||||||
|
|
||||||
|
text_month_vision = ""
|
||||||
|
if self.config.get('enable_vision', False):
|
||||||
|
text_month_vision = f"{vision_month} {localized_text('stats_vision', bot_language)}\n"
|
||||||
|
|
||||||
# Check if image generation is enabled and, if so, generate the image statistics for the month
|
# Check if image generation is enabled and, if so, generate the image statistics for the month
|
||||||
text_month = (
|
text_month = (
|
||||||
f"*{localized_text('usage_month', bot_language)}:*\n"
|
f"*{localized_text('usage_month', bot_language)}:*\n"
|
||||||
f"{tokens_month} {localized_text('stats_tokens', bot_language)}\n"
|
f"{tokens_month} {localized_text('stats_tokens', bot_language)}\n"
|
||||||
f"{text_month_images}" # Include the image statistics for the month if applicable
|
f"{text_month_images}" # Include the image statistics for the month if applicable
|
||||||
|
f"{text_month_vision}"
|
||||||
f"{transcribe_minutes_month} {localized_text('stats_transcribe', bot_language)[0]} "
|
f"{transcribe_minutes_month} {localized_text('stats_transcribe', bot_language)[0]} "
|
||||||
f"{transcribe_seconds_month} {localized_text('stats_transcribe', bot_language)[1]}\n"
|
f"{transcribe_seconds_month} {localized_text('stats_transcribe', bot_language)[1]}\n"
|
||||||
f"{localized_text('stats_total', bot_language)}{current_cost['cost_month']:.2f}"
|
f"{localized_text('stats_total', bot_language)}{current_cost['cost_month']:.2f}"
|
||||||
@@ -370,6 +383,97 @@ class ChatGPTTelegramBot:
|
|||||||
|
|
||||||
await wrap_with_indicator(update, context, _execute, constants.ChatAction.TYPING)
|
await wrap_with_indicator(update, context, _execute, constants.ChatAction.TYPING)
|
||||||
|
|
||||||
|
async def vision(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
    """
    Interpret image using vision model.

    Downloads the attached image, converts it to PNG, sends it to the vision
    model together with the message caption (if any) and replies with the
    interpretation. Vision token usage is recorded for the user.
    """
    if not self.config['enable_vision'] or not await self.check_allowed_and_within_budget(update, context):
        return

    if is_group_chat(update) and self.config['ignore_group_vision']:
        logging.info('Vision coming from group chat, ignoring...')
        return

    # Photos arrive as a sequence of sizes (the last entry is used, as before);
    # an image sent as a document is a single object and cannot be indexed.
    attachment = update.message.effective_attachment
    image = attachment[-1] if isinstance(attachment, (list, tuple)) else attachment
    prompt = update.message.caption

    async def _execute():
        bot_language = self.config['bot_language']

        # Both temp files are closed on every exit path, including early returns.
        with tempfile.NamedTemporaryFile() as temp_file, tempfile.NamedTemporaryFile() as temp_file_png:
            try:
                media_file = await context.bot.get_file(image.file_id)
                await media_file.download_to_drive(temp_file.name)
            except Exception as e:
                logging.exception(e)
                await update.effective_message.reply_text(
                    message_thread_id=get_thread_id(update),
                    reply_to_message_id=get_reply_to_message_id(self.config, update),
                    text=(
                        f"{localized_text('media_download_fail', bot_language)[0]}: "
                        f"{str(e)}. {localized_text('media_download_fail', bot_language)[1]}"
                    ),
                    parse_mode=constants.ParseMode.MARKDOWN
                )
                return

            # convert jpg from telegram to png as understood by openai
            try:
                with Image.open(temp_file.name) as original_image:
                    original_image.save(temp_file_png.name, format='PNG')
                logging.info(f'New vision request received from user {update.message.from_user.name} '
                             f'(id: {update.message.from_user.id})')
            except Exception as e:
                logging.exception(e)
                await update.effective_message.reply_text(
                    message_thread_id=get_thread_id(update),
                    reply_to_message_id=get_reply_to_message_id(self.config, update),
                    text=localized_text('media_type_fail', bot_language)
                )
                # Without a usable PNG there is nothing to send to the model.
                return

            user_id = update.message.from_user.id
            if user_id not in self.usage:
                self.usage[user_id] = UsageTracker(user_id, update.message.from_user.name)

            try:
                interpretation, tokens = await self.openai.interpret_image(temp_file_png.name, prompt=prompt)

                vision_token_price = self.config['vision_token_price']
                self.usage[user_id].add_vision_tokens(tokens, vision_token_price)

                # Users outside the allow-list are additionally booked on the shared guest tracker.
                allowed_user_ids = self.config['allowed_user_ids'].split(',')
                if str(user_id) not in allowed_user_ids and 'guests' in self.usage:
                    self.usage["guests"].add_vision_tokens(tokens, vision_token_price)

                await update.effective_message.reply_text(
                    message_thread_id=get_thread_id(update),
                    reply_to_message_id=get_reply_to_message_id(self.config, update),
                    text=interpretation,
                    parse_mode=constants.ParseMode.MARKDOWN
                )
            except Exception as e:
                logging.exception(e)
                await update.effective_message.reply_text(
                    message_thread_id=get_thread_id(update),
                    reply_to_message_id=get_reply_to_message_id(self.config, update),
                    text=f"{localized_text('vision_fail', bot_language)}: {str(e)}",
                    parse_mode=constants.ParseMode.MARKDOWN
                )

    await wrap_with_indicator(update, context, _execute, constants.ChatAction.TYPING)
|
||||||
|
|
||||||
async def prompt(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
|
async def prompt(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||||
"""
|
"""
|
||||||
React to incoming messages and respond accordingly.
|
React to incoming messages and respond accordingly.
|
||||||
@@ -792,6 +896,9 @@ class ChatGPTTelegramBot:
|
|||||||
application.add_handler(CommandHandler(
|
application.add_handler(CommandHandler(
|
||||||
'chat', self.prompt, filters=filters.ChatType.GROUP | filters.ChatType.SUPERGROUP)
|
'chat', self.prompt, filters=filters.ChatType.GROUP | filters.ChatType.SUPERGROUP)
|
||||||
)
|
)
|
||||||
|
application.add_handler(MessageHandler(
|
||||||
|
filters.PHOTO | filters.Document.IMAGE,
|
||||||
|
self.vision))
|
||||||
application.add_handler(MessageHandler(
|
application.add_handler(MessageHandler(
|
||||||
filters.AUDIO | filters.VOICE | filters.Document.AUDIO |
|
filters.AUDIO | filters.VOICE | filters.Document.AUDIO |
|
||||||
filters.VIDEO | filters.VIDEO_NOTE | filters.Document.VIDEO,
|
filters.VIDEO | filters.VIDEO_NOTE | filters.Document.VIDEO,
|
||||||
|
|||||||
@@ -56,6 +56,8 @@ class UsageTracker:
|
|||||||
if os.path.isfile(self.user_file):
|
if os.path.isfile(self.user_file):
|
||||||
with open(self.user_file, "r") as file:
|
with open(self.user_file, "r") as file:
|
||||||
self.usage = json.load(file)
|
self.usage = json.load(file)
|
||||||
|
if 'vision_tokens' not in self.usage['usage_history']:
|
||||||
|
self.usage['usage_history']['vision_tokens'] = {}
|
||||||
else:
|
else:
|
||||||
# ensure directory exists
|
# ensure directory exists
|
||||||
pathlib.Path(logs_dir).mkdir(exist_ok=True)
|
pathlib.Path(logs_dir).mkdir(exist_ok=True)
|
||||||
@@ -63,7 +65,7 @@ class UsageTracker:
|
|||||||
self.usage = {
|
self.usage = {
|
||||||
"user_name": user_name,
|
"user_name": user_name,
|
||||||
"current_cost": {"day": 0.0, "month": 0.0, "all_time": 0.0, "last_update": str(date.today())},
|
"current_cost": {"day": 0.0, "month": 0.0, "all_time": 0.0, "last_update": str(date.today())},
|
||||||
"usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}}
|
"usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}, "vision_tokens":{}}
|
||||||
}
|
}
|
||||||
|
|
||||||
# token usage functions:
|
# token usage functions:
|
||||||
@@ -151,6 +153,47 @@ class UsageTracker:
|
|||||||
usage_month += sum(images)
|
usage_month += sum(images)
|
||||||
return usage_day, usage_month
|
return usage_day, usage_month
|
||||||
|
|
||||||
|
|
||||||
|
# vision usage functions
|
||||||
|
def add_vision_tokens(self, tokens, vision_token_price=0.01):
    """
    Adds the vision tokens of a request to the user's usage history and updates the current cost.

    :param tokens: total tokens used in the last vision request
    :param vision_token_price: price per 1000 vision tokens, defaults to 0.01
    """
    # Keep six decimals instead of two: a single vision request usually costs
    # a fraction of a cent (e.g. 400 tokens * 0.01 / 1000 = 0.004), and
    # rounding to whole cents would book every such request as 0.
    token_cost = round(tokens * vision_token_price / 1000, 6)
    self.add_current_costs(token_cost)

    # update usage_history: add the tokens to today's entry, creating it if needed
    today = str(date.today())
    vision_history = self.usage["usage_history"]["vision_tokens"]
    vision_history[today] = vision_history.get(today, 0) + tokens

    # write updated token usage to user file
    with open(self.user_file, "w") as outfile:
        json.dump(self.usage, outfile)
|
||||||
|
|
||||||
|
def get_current_vision_tokens(self):
    """Get vision tokens for today and this month.

    :return: total amount of vision tokens per day and per month
    """
    today = str(date.today())
    month = today[:7]  # year-month as string
    vision_history = self.usage["usage_history"]["vision_tokens"]

    # a day without any recorded vision request counts as zero
    tokens_day = vision_history.get(today, 0)
    # history keys are date strings, so a prefix match selects the current month;
    # the loop uses its own name instead of rebinding `today`
    tokens_month = sum(
        tokens for day, tokens in vision_history.items() if day.startswith(month)
    )
    return tokens_day, tokens_month
|
||||||
|
|
||||||
# transcription usage functions:
|
# transcription usage functions:
|
||||||
|
|
||||||
def add_transcription_seconds(self, seconds, minute_price=0.006):
|
def add_transcription_seconds(self, seconds, minute_price=0.006):
|
||||||
@@ -236,13 +279,14 @@ class UsageTracker:
|
|||||||
cost_all_time = self.usage["current_cost"].get("all_time", self.initialize_all_time_cost())
|
cost_all_time = self.usage["current_cost"].get("all_time", self.initialize_all_time_cost())
|
||||||
return {"cost_today": cost_day, "cost_month": cost_month, "cost_all_time": cost_all_time}
|
return {"cost_today": cost_day, "cost_month": cost_month, "cost_all_time": cost_all_time}
|
||||||
|
|
||||||
def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006):
|
def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006, vision_token_price=0.01):
|
||||||
"""Get total USD amount of all requests in history
|
"""Get total USD amount of all requests in history
|
||||||
|
|
||||||
:param tokens_price: price per 1000 tokens, defaults to 0.002
|
:param tokens_price: price per 1000 tokens, defaults to 0.002
|
||||||
:param image_prices: prices for images of sizes ["256x256", "512x512", "1024x1024"],
|
:param image_prices: prices for images of sizes ["256x256", "512x512", "1024x1024"],
|
||||||
defaults to [0.016, 0.018, 0.02]
|
defaults to [0.016, 0.018, 0.02]
|
||||||
:param minute_price: price per minute transcription, defaults to 0.006
|
:param minute_price: price per minute transcription, defaults to 0.006
|
||||||
|
:param vision_token_price: price per 1k vision token interpretation, defaults to 0.01
|
||||||
:return: total cost of all requests
|
:return: total cost of all requests
|
||||||
"""
|
"""
|
||||||
total_tokens = sum(self.usage['usage_history']['chat_tokens'].values())
|
total_tokens = sum(self.usage['usage_history']['chat_tokens'].values())
|
||||||
@@ -255,5 +299,8 @@ class UsageTracker:
|
|||||||
total_transcription_seconds = sum(self.usage['usage_history']['transcription_seconds'].values())
|
total_transcription_seconds = sum(self.usage['usage_history']['transcription_seconds'].values())
|
||||||
transcription_cost = round(total_transcription_seconds * minute_price / 60, 2)
|
transcription_cost = round(total_transcription_seconds * minute_price / 60, 2)
|
||||||
|
|
||||||
all_time_cost = token_cost + transcription_cost + image_cost
|
total_vision_tokens = sum(self.usage['usage_history']['vision_tokens'].values())
|
||||||
|
vision_cost = round(total_vision_tokens * vision_token_price / 1000, 2)
|
||||||
|
|
||||||
|
all_time_cost = token_cost + transcription_cost + image_cost + vision_cost
|
||||||
return all_time_cost
|
return all_time_cost
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import itertools
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import base64
|
||||||
|
|
||||||
import telegram
|
import telegram
|
||||||
from telegram import Message, MessageEntity, Update, ChatMember, constants
|
from telegram import Message, MessageEntity, Update, ChatMember, constants
|
||||||
@@ -377,3 +378,9 @@ def cleanup_intermediate_files(response: any):
|
|||||||
if format == 'path':
|
if format == 'path':
|
||||||
if os.path.exists(value):
|
if os.path.exists(value):
|
||||||
os.remove(value)
|
os.remove(value)
|
||||||
|
|
||||||
|
|
||||||
|
# Function to encode the image
|
||||||
|
def encode_image(image_path):
    """
    Read the file at image_path and return its contents as a base64 string.

    :param image_path: path of the file to read (opened in binary mode)
    :return: base64 encoding of the file's bytes, decoded to a UTF-8 str
    """
    with open(image_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
|
||||||
@@ -11,3 +11,4 @@ spotipy~=2.23.0
|
|||||||
pytube~=15.0.0
|
pytube~=15.0.0
|
||||||
gtts~=2.3.2
|
gtts~=2.3.2
|
||||||
whois~=0.9.27
|
whois~=0.9.27
|
||||||
|
Pillow~=10.1.0
|
||||||
@@ -14,6 +14,7 @@
|
|||||||
"usage_month":"Usage this month",
|
"usage_month":"Usage this month",
|
||||||
"stats_tokens":"tokens",
|
"stats_tokens":"tokens",
|
||||||
"stats_images":"images generated",
|
"stats_images":"images generated",
|
||||||
|
"stats_vision":"image tokens interpreted",
|
||||||
"stats_transcribe":["minutes and", "seconds transcribed"],
|
"stats_transcribe":["minutes and", "seconds transcribed"],
|
||||||
"stats_total":"💰 For a total amount of $",
|
"stats_total":"💰 For a total amount of $",
|
||||||
"stats_budget":"Your remaining budget",
|
"stats_budget":"Your remaining budget",
|
||||||
@@ -25,6 +26,7 @@
|
|||||||
"reset_done":"Done!",
|
"reset_done":"Done!",
|
||||||
"image_no_prompt":"Please provide a prompt! (e.g. /image cat)",
|
"image_no_prompt":"Please provide a prompt! (e.g. /image cat)",
|
||||||
"image_fail":"Failed to generate image",
|
"image_fail":"Failed to generate image",
|
||||||
|
"vision_fail":"Failed to interpret image",
|
||||||
"media_download_fail":["Failed to download audio file", "Make sure the file is not too large. (max 20MB)"],
|
"media_download_fail":["Failed to download audio file", "Make sure the file is not too large. (max 20MB)"],
|
||||||
"media_type_fail":"Unsupported file type",
|
"media_type_fail":"Unsupported file type",
|
||||||
"transcript":"Transcript",
|
"transcript":"Transcript",
|
||||||
|
|||||||
Reference in New Issue
Block a user