added support for vision

This commit is contained in:
gilcu3
2023-11-08 17:56:27 +01:00
parent 450b86d5a7
commit 9e7d8701cd
9 changed files with 245 additions and 8 deletions

View File

@@ -17,24 +17,30 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
# TOKEN_PRICE=0.002
# IMAGE_PRICES=0.016,0.018,0.02
# TRANSCRIPTION_PRICE=0.006
# VISION_TOKEN_PRICE=0.01
# ENABLE_QUOTING=true
# ENABLE_IMAGE_GENERATION=true
# ENABLE_TRANSCRIPTION=true
# ENABLE_VISION=true
# PROXY=http://localhost:8080
# OPENAI_MODEL=gpt-3.5-turbo
# ASSISTANT_PROMPT="You are a helpful assistant."
# SHOW_USAGE=false
# STREAM=true
# MAX_TOKENS=1200
# VISION_MAX_TOKENS=300
# MAX_HISTORY_SIZE=15
# MAX_CONVERSATION_AGE_MINUTES=180
# VOICE_REPLY_WITH_TRANSCRIPT_ONLY=true
# VOICE_REPLY_PROMPTS="Hi bot;Hey bot;Hi chat;Hey chat"
# VISION_PROMPT="What is in this image"
# N_CHOICES=1
# TEMPERATURE=1.0
# PRESENCE_PENALTY=0.0
# FREQUENCY_PENALTY=0.0
# IMAGE_SIZE=512x512
# VISION_DETAIL="low"
# GROUP_TRIGGER_KEYWORD=""
# IGNORE_GROUP_TRANSCRIPTIONS=true
# IGNORE_GROUP_VISION=true
# BOT_LANGUAGE=en

View File

@@ -72,6 +72,7 @@ The following parameters are optional and can be set in the `.env` file:
| `TOKEN_PRICE` | USD price per 1000 tokens, used to compute cost information in usage statistics. Source: https://openai.com/pricing | `0.002` |
| `IMAGE_PRICES` | A comma-separated list of 3 prices for the image sizes `256x256`, `512x512` and `1024x1024`. Source: https://openai.com/pricing | `0.016,0.018,0.02` |
| `TRANSCRIPTION_PRICE` | USD-price for one minute of audio transcription. Source: https://openai.com/pricing | `0.006` |
| `VISION_TOKEN_PRICE` | USD-price per 1K tokens of image interpretation. Source: https://openai.com/pricing | `0.01` |
Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/184) for possible budget configurations.
@@ -81,23 +82,28 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
| `ENABLE_QUOTING` | Whether to enable message quoting in private chats | `true` |
| `ENABLE_IMAGE_GENERATION` | Whether to enable image generation via the `/image` command | `true` |
| `ENABLE_TRANSCRIPTION` | Whether to enable transcriptions of audio and video messages | `true` |
| `ENABLE_VISION` | Whether to enable vision capabilities in supported models | `true` |
| `PROXY` | Proxy to be used for OpenAI and Telegram bot (e.g. `http://localhost:8080`) | - |
| `OPENAI_MODEL` | The OpenAI model to use for generating responses. You can find all available models [here](https://platform.openai.com/docs/models/) | `gpt-3.5-turbo` |
| `ASSISTANT_PROMPT` | A system message that sets the tone and controls the behavior of the assistant | `You are a helpful assistant.` |
| `SHOW_USAGE` | Whether to show OpenAI token usage information after each response | `false` |
| `STREAM` | Whether to stream responses. **Note**: if enabled, incompatible with `N_CHOICES` higher than 1 | `true` |
| `MAX_TOKENS` | Upper bound on how many tokens the ChatGPT API will return | `1200` for GPT-3, `2400` for GPT-4 |
| `VISION_MAX_TOKENS` | Upper bound on how many tokens vision models will return | `300` for gpt-4-vision-preview |
| `MAX_HISTORY_SIZE` | Max number of messages to keep in memory, after which the conversation will be summarised to avoid excessive token usage | `15` |
| `MAX_CONVERSATION_AGE_MINUTES` | Maximum number of minutes a conversation should live since the last message, after which the conversation will be reset | `180` |
| `VOICE_REPLY_WITH_TRANSCRIPT_ONLY` | Whether to answer to voice messages with the transcript only or with a ChatGPT response of the transcript | `false` |
| `VOICE_REPLY_PROMPTS` | A semicolon-separated list of phrases (e.g. `Hi bot;Hello chat`). If the transcript starts with any of them, it will be treated as a prompt even if `VOICE_REPLY_WITH_TRANSCRIPT_ONLY` is set to `true` | - |
| `VISION_PROMPT` | A phrase (e.g. `What is in this image`). The vision models use it as the prompt to interpret a given image. If the image sent to the bot has a caption, the caption supersedes this parameter | `What is in this image` |
| `N_CHOICES` | Number of answers to generate for each input message. **Note**: setting this to a number higher than 1 will not work properly if `STREAM` is enabled | `1` |
| `TEMPERATURE` | Number between 0 and 2. Higher values will make the output more random | `1.0` |
| `PRESENCE_PENALTY` | Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far | `0.0` |
| `FREQUENCY_PENALTY` | Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far | `0.0` |
| `IMAGE_SIZE` | The DALL·E generated image size. Allowed values: `256x256`, `512x512` or `1024x1024` | `512x512` |
| `VISION_DETAIL` | The detail parameter for vision models, explained in the [Vision Guide](https://platform.openai.com/docs/guides/vision). Allowed values: `low` or `high` | `low` |
| `GROUP_TRIGGER_KEYWORD` | If set, the bot in group chats will only respond to messages that start with this keyword | - |
| `IGNORE_GROUP_TRANSCRIPTIONS` | If set to true, the bot will not process transcriptions in group chats | `true` |
| `IGNORE_GROUP_VISION` | If set to true, the bot will not process vision queries in group chats | `true` |
| `BOT_LANGUAGE` | Language of general bot messages. Currently available: `en`, `de`, `ru`, `tr`, `it`, `fi`, `es`, `id`, `nl`, `zh-cn`, `zh-tw`, `vi`, `fa`, `pt-br`, `uk`, `ms`. [Contribute with additional translations](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/219) | `en` |
| `WHISPER_PROMPT` | To improve the accuracy of Whisper's transcription service, especially for specific names or terms, you can set up a custom message. [Speech to text - Prompting](https://platform.openai.com/docs/guides/speech-to-text/prompting) | `-` |
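For reference, a vision-enabled deployment could combine the new options in the `.env` file like this (an illustrative sketch, not part of the commit; values are examples only):

ENABLE_VISION=true
OPENAI_MODEL=gpt-4-vision-preview
VISION_DETAIL=low
VISION_MAX_TOKENS=300
VISION_PROMPT="Describe this image"
VISION_TOKEN_PRICE=0.01
IGNORE_GROUP_VISION=true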

View File

@@ -50,6 +50,9 @@ def main():
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true',
'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''),
'vision_prompt': os.environ.get('VISION_PROMPT', 'What is in this image'),
'vision_detail': os.environ.get('VISION_DETAIL', 'low'),
'vision_max_tokens': int(os.environ.get('VISION_MAX_TOKENS', '300')),
}
if openai_config['enable_functions'] and not functions_available:
@@ -70,6 +73,7 @@ def main():
'enable_quoting': os.environ.get('ENABLE_QUOTING', 'true').lower() == 'true',
'enable_image_generation': os.environ.get('ENABLE_IMAGE_GENERATION', 'true').lower() == 'true',
'enable_transcription': os.environ.get('ENABLE_TRANSCRIPTION', 'true').lower() == 'true',
'enable_vision': os.environ.get('ENABLE_VISION', 'true').lower() == 'true',
'budget_period': os.environ.get('BUDGET_PERIOD', 'monthly').lower(),
'user_budgets': os.environ.get('USER_BUDGETS', os.environ.get('MONTHLY_USER_BUDGETS', '*')),
'guest_budget': float(os.environ.get('GUEST_BUDGET', os.environ.get('MONTHLY_GUEST_BUDGET', '100.0'))),
@@ -78,9 +82,11 @@ def main():
'voice_reply_transcript': os.environ.get('VOICE_REPLY_WITH_TRANSCRIPT_ONLY', 'false').lower() == 'true',
'voice_reply_prompts': os.environ.get('VOICE_REPLY_PROMPTS', '').split(';'),
'ignore_group_transcriptions': os.environ.get('IGNORE_GROUP_TRANSCRIPTIONS', 'true').lower() == 'true',
'ignore_group_vision': os.environ.get('IGNORE_GROUP_VISION', 'true').lower() == 'true',
'group_trigger_keyword': os.environ.get('GROUP_TRIGGER_KEYWORD', ''),
'token_price': float(os.environ.get('TOKEN_PRICE', 0.002)),
'image_prices': [float(i) for i in os.environ.get('IMAGE_PRICES', "0.016,0.018,0.02").split(",")],
'vision_token_price': float(os.environ.get('VISION_TOKEN_PRICE', '0.01')),
'transcription_price': float(os.environ.get('TRANSCRIPTION_PRICE', 0.006)),
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
}

View File

@@ -12,10 +12,11 @@ import json
import httpx
from datetime import date
from calendar import monthrange
from PIL import Image
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type
from utils import is_direct_result, encode_image
from plugin_manager import PluginManager
# Models can be found here: https://platform.openai.com/docs/models/overview
@@ -23,7 +24,8 @@ GPT_3_MODELS = ("gpt-3.5-turbo", "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613")
GPT_3_16K_MODELS = ("gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613")
GPT_4_MODELS = ("gpt-4", "gpt-4-0314", "gpt-4-0613")
GPT_4_32K_MODELS = ("gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613")
GPT_4_VISION_MODELS = ("gpt-4-vision-preview",)
GPT_ALL_MODELS = GPT_3_MODELS + GPT_3_16K_MODELS + GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS
def default_max_tokens(model: str) -> int:
@@ -41,6 +43,8 @@ def default_max_tokens(model: str) -> int:
return base * 4
elif model in GPT_4_32K_MODELS:
return base * 8
elif model in GPT_4_VISION_MODELS:
return 4096
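A quick sanity check of the new branch (a hedged sketch; assumes the module is importable as `openai_helper`):

from openai_helper import default_max_tokens

# vision models get a fixed 4096-token ceiling instead of a base-derived one
assert default_max_tokens('gpt-4-vision-preview') == 4096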
def are_functions_available(model: str) -> bool:
@@ -347,6 +351,26 @@ class OpenAIHelper:
logging.exception(e)
raise Exception(f"⚠️ _{localized_text('error', self.config['bot_language'])}._ ⚠️\n{str(e)}") from e
async def interpret_image(self, filename, prompt=None):
"""
Interprets a given PNG image file using the Vision model.
"""
try:
image = encode_image(filename)
prompt = self.config['vision_prompt'] if prompt is None else prompt
# the image was converted to PNG upstream, so declare the matching MIME type
message = {'role': 'user', 'content': [
    {'type': 'text', 'text': prompt},
    {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{image}', 'detail': self.config['vision_detail']}}
]}
response = await self.client.chat.completions.create(
    model=self.config['model'],
    messages=[message],
    max_tokens=self.config['vision_max_tokens']
)
return response.choices[0].message.content, self.__count_tokens_vision(filename)
except openai.RateLimitError as e:
raise e
except openai.BadRequestError as e:
raise Exception(f"⚠️ _{localized_text('openai_invalid', self.config['bot_language'])}._ ⚠️\n{str(e)}") from e
except Exception as e:
logging.exception(e)
raise Exception(f"⚠️ _{localized_text('error', self.config['bot_language'])}._ ⚠️\n{str(e)}") from e
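A usage sketch for the new coroutine (hypothetical: `helper` is an `OpenAIHelper` configured with `model='gpt-4-vision-preview'`, and `photo.png` is a placeholder path):

# inside an async context
interpretation, tokens = await helper.interpret_image('photo.png', prompt='What is shown here?')
print(interpretation)  # the model's textual description of the image
print(tokens)          # locally estimated token usage from __count_tokens_vision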
def reset_chat_history(self, chat_id, content=''):
"""
Resets the conversation history.
@@ -410,6 +434,8 @@ class OpenAIHelper:
return base * 2
if self.config['model'] in GPT_4_32K_MODELS:
return base * 8
if self.config['model'] in GPT_4_VISION_MODELS:
return base * 31
raise NotImplementedError(
f"Max tokens for model {self.config['model']} is not implemented yet."
)
@@ -430,7 +456,7 @@ class OpenAIHelper:
if model in GPT_3_MODELS + GPT_3_16K_MODELS:
tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n
tokens_per_name = -1 # if there's a name, the role is omitted
elif model in GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS:
tokens_per_message = 3
tokens_per_name = 1
else:
@@ -445,6 +471,35 @@ class OpenAIHelper:
num_tokens += 3 # every reply is primed with <|start|>assistant<|message|>
return num_tokens
def __count_tokens_vision(self, filename) -> int:
"""
Counts the number of tokens for interpreting an image.
:param filename: path of the image file to interpret
:return: the number of tokens required
"""
image = Image.open(filename)
model = self.config['model']
if model not in GPT_4_VISION_MODELS:
raise NotImplementedError(f"__count_tokens_vision() is not implemented for model {model}.")
w, h = image.size
if w > h:
    w, h = h, w  # ensure w is the shorter side
# this computation follows https://platform.openai.com/docs/guides/vision and https://openai.com/pricing#gpt-4-turbo
base_tokens = 85
detail = self.config['vision_detail']
if detail == 'low':
return base_tokens
elif detail == 'high':
f = max(w / 768, h / 2048)
if f > 1:
w, h = int(w / f), int(h / f)
tw, th = (w + 511) // 512, (h + 511) // 512
tiles = tw * th
num_tokens = base_tokens + tiles * 170
return num_tokens
else:
raise NotImplementedError(f"Unknown parameter detail={detail} for model {model}.")
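Worked example of the arithmetic above, for a hypothetical 1024x1536 image with detail='high':

# f = max(1024 / 768, 1536 / 2048) = 1.333..., so the image scales to 768x1152
# tiles: tw = (768 + 511) // 512 = 2, th = (1152 + 511) // 512 = 3, i.e. 6 tiles
# tokens = 85 + 6 * 170 = 1105; with detail='low' the same image costs a flat 85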
# No longer works as of July 21st 2023, as OpenAI has removed the billing API
# def get_billing_current_month(self):
# """Gets billed usage for current month from OpenAI API.

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import asyncio
import logging
import os
import tempfile
from uuid import uuid4
from telegram import BotCommandScopeAllGroupChats, Update, constants
@@ -13,6 +14,7 @@ from telegram.ext import ApplicationBuilder, CommandHandler, MessageHandler, \
filters, InlineQueryHandler, CallbackQueryHandler, Application, ContextTypes, CallbackContext
from pydub import AudioSegment
from PIL import Image
from utils import is_group_chat, get_thread_id, message_text, wrap_with_indicator, split_into_chunks, \
edit_message_with_retry, get_stream_cutoff_values, is_allowed, get_remaining_budget, is_admin, is_within_budget, \
@@ -94,6 +96,7 @@ class ChatGPTTelegramBot:
images_today, images_month = self.usage[user_id].get_current_image_count()
(transcribe_minutes_today, transcribe_seconds_today, transcribe_minutes_month,
transcribe_seconds_month) = self.usage[user_id].get_current_transcription_duration()
vision_today, vision_month = self.usage[user_id].get_current_vision_tokens()
current_cost = self.usage[user_id].get_current_cost()
chat_id = update.effective_chat.id
@@ -113,10 +116,15 @@ class ChatGPTTelegramBot:
if self.config.get('enable_image_generation', False):
text_today_images = f"{images_today} {localized_text('stats_images', bot_language)}\n"
text_today_vision = ""
if self.config.get('enable_vision', False):
text_today_vision = f"{vision_today} {localized_text('stats_vision', bot_language)}\n"
text_today = (
f"*{localized_text('usage_today', bot_language)}:*\n"
f"{tokens_today} {localized_text('stats_tokens', bot_language)}\n"
f"{text_today_images}" # Include the image statistics for today if applicable
f"{text_today_vision}"
f"{transcribe_minutes_today} {localized_text('stats_transcribe', bot_language)[0]} "
f"{transcribe_seconds_today} {localized_text('stats_transcribe', bot_language)[1]}\n"
f"{localized_text('stats_total', bot_language)}{current_cost['cost_today']:.2f}\n"
@@ -127,11 +135,16 @@ class ChatGPTTelegramBot:
if self.config.get('enable_image_generation', False):
text_month_images = f"{images_month} {localized_text('stats_images', bot_language)}\n"
text_month_vision = ""
if self.config.get('enable_vision', False):
text_month_vision = f"{vision_month} {localized_text('stats_vision', bot_language)}\n"
# Check if image generation is enabled and, if so, generate the image statistics for the month
text_month = (
f"*{localized_text('usage_month', bot_language)}:*\n"
f"{tokens_month} {localized_text('stats_tokens', bot_language)}\n"
f"{text_month_images}" # Include the image statistics for the month if applicable
f"{text_month_vision}"
f"{transcribe_minutes_month} {localized_text('stats_transcribe', bot_language)[0]} "
f"{transcribe_seconds_month} {localized_text('stats_transcribe', bot_language)[1]}\n"
f"{localized_text('stats_total', bot_language)}{current_cost['cost_month']:.2f}"
@@ -370,6 +383,97 @@ class ChatGPTTelegramBot:
await wrap_with_indicator(update, context, _execute, constants.ChatAction.TYPING)
async def vision(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
"""
Interpret image using vision model.
"""
if not self.config['enable_vision'] or not await self.check_allowed_and_within_budget(update, context):
return
if is_group_chat(update) and self.config['ignore_group_vision']:
logging.info('Vision request coming from group chat, ignoring...')
return
chat_id = update.effective_chat.id
image = update.message.effective_attachment[-1]
prompt = update.message.caption
temp_file = tempfile.NamedTemporaryFile()
async def _execute():
bot_language = self.config['bot_language']
try:
media_file = await context.bot.get_file(image.file_id)
await media_file.download_to_drive(temp_file.name)
except Exception as e:
logging.exception(e)
await update.effective_message.reply_text(
message_thread_id=get_thread_id(update),
reply_to_message_id=get_reply_to_message_id(self.config, update),
text=(
f"{localized_text('media_download_fail', bot_language)[0]}: "
f"{str(e)}. {localized_text('media_download_fail', bot_language)[1]}"
),
parse_mode=constants.ParseMode.MARKDOWN
)
temp_file.close()  # release the downloaded file before bailing out
return
# convert the jpg from Telegram to a png as understood by OpenAI
temp_file_png = tempfile.NamedTemporaryFile()
try:
original_image = Image.open(temp_file.name)
original_image.save(temp_file_png.name, format='PNG')
logging.info(f'New vision request received from user {update.message.from_user.name} '
f'(id: {update.message.from_user.id})')
except Exception as e:
    logging.exception(e)
    await update.effective_message.reply_text(
        message_thread_id=get_thread_id(update),
        reply_to_message_id=get_reply_to_message_id(self.config, update),
        text=localized_text('media_type_fail', bot_language)
    )
    temp_file.close()
    temp_file_png.close()
    return  # the file could not be converted, so there is nothing to interpret
user_id = update.message.from_user.id
if user_id not in self.usage:
self.usage[user_id] = UsageTracker(user_id, update.message.from_user.name)
try:
interpretation, tokens = await self.openai.interpret_image(temp_file_png.name, prompt=prompt)
vision_token_price = self.config['vision_token_price']
self.usage[user_id].add_vision_tokens(tokens, vision_token_price)
allowed_user_ids = self.config['allowed_user_ids'].split(',')
if str(user_id) not in allowed_user_ids and 'guests' in self.usage:
self.usage["guests"].add_vision_tokens(tokens, vision_token_price)
await update.effective_message.reply_text(
message_thread_id=get_thread_id(update),
reply_to_message_id=get_reply_to_message_id(self.config, update),
text=interpretation,
parse_mode=constants.ParseMode.MARKDOWN
)
except Exception as e:
logging.exception(e)
await update.effective_message.reply_text(
message_thread_id=get_thread_id(update),
reply_to_message_id=get_reply_to_message_id(self.config, update),
text=f"{localized_text('vision_fail', bot_language)}: {str(e)}",
parse_mode=constants.ParseMode.MARKDOWN
)
finally:
temp_file.close()
temp_file_png.close()
await wrap_with_indicator(update, context, _execute, constants.ChatAction.TYPING)
async def prompt(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
"""
React to incoming messages and respond accordingly.
@@ -792,6 +896,9 @@ class ChatGPTTelegramBot:
application.add_handler(CommandHandler(
'chat', self.prompt, filters=filters.ChatType.GROUP | filters.ChatType.SUPERGROUP)
)
application.add_handler(MessageHandler(
filters.PHOTO | filters.Document.IMAGE,
self.vision))
application.add_handler(MessageHandler(
filters.AUDIO | filters.VOICE | filters.Document.AUDIO |
filters.VIDEO | filters.VIDEO_NOTE | filters.Document.VIDEO,

View File

@@ -56,6 +56,8 @@ class UsageTracker:
if os.path.isfile(self.user_file):
with open(self.user_file, "r") as file:
self.usage = json.load(file)
if 'vision_tokens' not in self.usage['usage_history']:
self.usage['usage_history']['vision_tokens'] = {}
else:
# ensure directory exists
pathlib.Path(logs_dir).mkdir(exist_ok=True)
@@ -63,7 +65,7 @@ class UsageTracker:
self.usage = {
"user_name": user_name,
"current_cost": {"day": 0.0, "month": 0.0, "all_time": 0.0, "last_update": str(date.today())},
"usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}}
"usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}, "vision_tokens":{}}
}
# token usage functions:
@@ -151,6 +153,47 @@ class UsageTracker:
usage_month += sum(images)
return usage_day, usage_month
# vision usage functions
def add_vision_tokens(self, tokens, vision_token_price=0.01):
"""
Adds requested vision tokens to a user's usage history and updates current cost.
:param tokens: total tokens used in last request
:param vision_token_price: price per 1K vision tokens, defaults to 0.01
"""
today = date.today()
token_price = round(tokens * vision_token_price / 1000, 2)
self.add_current_costs(token_price)
# update usage_history
if str(today) in self.usage["usage_history"]["vision_tokens"]:
# add requested tokens to existing date
self.usage["usage_history"]["vision_tokens"][str(today)] += tokens
else:
# create new entry for current date
self.usage["usage_history"]["vision_tokens"][str(today)] = tokens
# write updated token usage to user file
with open(self.user_file, "w") as outfile:
json.dump(self.usage, outfile)
def get_current_vision_tokens(self):
"""Get vision tokens for today and this month.
:return: total amount of vision tokens for the current day and month
"""
today = date.today()
if str(today) in self.usage["usage_history"]["vision_tokens"]:
tokens_day = self.usage["usage_history"]["vision_tokens"][str(today)]
else:
tokens_day = 0
month = str(today)[:7] # year-month as string
tokens_month = 0
for day, tokens in self.usage["usage_history"]["vision_tokens"].items():  # avoid shadowing `today`
    if day.startswith(month):
        tokens_month += tokens
return tokens_day, tokens_month
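A short usage sketch of the two new tracker methods (the user id, name and token count are made up for illustration):

tracker = UsageTracker('123456', 'example_user')
tracker.add_vision_tokens(1105, vision_token_price=0.01)  # adds round(1105 * 0.01 / 1000, 2) = $0.01
tokens_today, tokens_month = tracker.get_current_vision_tokens()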
# transcription usage functions:
def add_transcription_seconds(self, seconds, minute_price=0.006):
@@ -236,13 +279,14 @@ class UsageTracker:
cost_all_time = self.usage["current_cost"].get("all_time", self.initialize_all_time_cost())
return {"cost_today": cost_day, "cost_month": cost_month, "cost_all_time": cost_all_time}
def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006, vision_token_price=0.01):
"""Get total USD amount of all requests in history
:param tokens_price: price per 1000 tokens, defaults to 0.002
:param image_prices: prices for images of sizes ["256x256", "512x512", "1024x1024"],
defaults to [0.016, 0.018, 0.02]
:param minute_price: price per minute transcription, defaults to 0.006
:param vision_token_price: price per 1K vision tokens, defaults to 0.01
:return: total cost of all requests
"""
total_tokens = sum(self.usage['usage_history']['chat_tokens'].values())
@@ -255,5 +299,8 @@ class UsageTracker:
total_transcription_seconds = sum(self.usage['usage_history']['transcription_seconds'].values())
transcription_cost = round(total_transcription_seconds * minute_price / 60, 2)
total_vision_tokens = sum(self.usage['usage_history']['vision_tokens'].values())
vision_cost = round(total_vision_tokens * vision_token_price / 1000, 2)
all_time_cost = token_cost + transcription_cost + image_cost + vision_cost
return all_time_cost
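Worked example of the new vision term (hypothetical history): 50,000 accumulated vision tokens at the default $0.01 per 1K yield round(50000 * 0.01 / 1000, 2) = $0.50 added to the all-time total.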

View File

@@ -5,6 +5,7 @@ import itertools
import json
import logging
import os
import base64
import telegram
from telegram import Message, MessageEntity, Update, ChatMember, constants
@@ -377,3 +378,9 @@ def cleanup_intermediate_files(response: any):
if format == 'path':
if os.path.exists(value):
os.remove(value)
# Encode an image file as a base64 string for the OpenAI vision API
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
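A usage sketch ('photo.png' is a placeholder path):

encoded = encode_image('photo.png')
data_url = f'data:image/png;base64,{encoded}'  # the form interpret_image embeds in the API message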

View File

@@ -11,3 +11,4 @@ spotipy~=2.23.0
pytube~=15.0.0
gtts~=2.3.2
whois~=0.9.27
Pillow~=10.1.0

View File

@@ -14,6 +14,7 @@
"usage_month":"Usage this month",
"stats_tokens":"tokens",
"stats_images":"images generated",
"stats_vision":"image tokens interpreted",
"stats_transcribe":["minutes and", "seconds transcribed"],
"stats_total":"💰 For a total amount of $",
"stats_budget":"Your remaining budget",
@@ -25,6 +26,7 @@
"reset_done":"Done!",
"image_no_prompt":"Please provide a prompt! (e.g. /image cat)",
"image_fail":"Failed to generate image",
"vision_fail":"Failed to interpret image",
"media_download_fail":["Failed to download audio file", "Make sure the file is not too large. (max 20MB)"],
"media_type_fail":"Unsupported file type",
"transcript":"Transcript",