feat: allow read tool to handle images (#3052)

2025-12-21 09:44:21 +01:00 · 2025-10-09 09:05:11 -05:00
parent eb4b5721cd
commit 225adc46ba
7 changed files with 159 additions and 83 deletions
--- a/packages/opencode/src/provider/models.ts
+++ b/packages/opencode/src/provider/models.ts
@@ -28,6 +28,12 @@ export namespace ModelsDev {
        context: z.number(),
        output: z.number(),
      }),
      modalities: z
        .object({
          input: z.array(z.enum(["text", "audio", "image", "video", "pdf"])),
          output: z.array(z.enum(["text", "audio", "image", "video", "pdf"])),
        })
        .optional(),
      experimental: z.boolean().optional(),
      options: z.record(z.string(), z.any()),
      provider: z.object({ npm: z.string() }).optional(),
--- a/packages/opencode/src/provider/provider.ts
+++ b/packages/opencode/src/provider/provider.ts
@@ -279,6 +279,11 @@ export namespace Provider {
              context: 0,
              output: 0,
            },
          modalities: model.modalities ??
            existing?.modalities ?? {
              input: ["text"],
              output: ["text"],
            },
          provider: model.provider ?? existing?.provider,
        }
        parsed.models[modelID] = parsedModel
--- a/packages/opencode/src/session/message-v2.ts
+++ b/packages/opencode/src/session/message-v2.ts
@@ -17,71 +17,6 @@ export namespace MessageV2 {
    }),
  )
  /**
   * Schema for a tool call in the "pending" state. It carries only the
   * `status` discriminator; no input, timing, or output fields are declared.
   * NOTE(review): the `ref` metadata presumably names this schema in generated
   * API/SDK output — confirm against the schema generator.
   */
  export const ToolStatePending = z
    .object({
      status: z.literal("pending"),
    })
    .meta({
      ref: "ToolStatePending",
    })
  /** Static type inferred from the ToolStatePending schema. */
  export type ToolStatePending = z.infer<typeof ToolStatePending>
  /**
   * Schema for a tool call in the "running" state.
   * `title` and `metadata` are optional here; only `status`, `input` and
   * `time.start` must be present.
   */
  export const ToolStateRunning = z
    .object({
      status: z.literal("running"),
      // Accepts any value; the completed/error schemas require a string-keyed record instead.
      input: z.any(),
      title: z.string().optional(),
      metadata: z.record(z.string(), z.any()).optional(),
      time: z.object({
        // NOTE(review): presumably an epoch-millisecond timestamp — confirm at the call site.
        start: z.number(),
      }),
    })
    .meta({
      ref: "ToolStateRunning",
    })
  /** Static type inferred from the ToolStateRunning schema. */
  export type ToolStateRunning = z.infer<typeof ToolStateRunning>
  /**
   * Schema for a tool call in the "completed" state.
   * Unlike the running state, `title` and `metadata` are required, `input`
   * must be a string-keyed record, and the textual `output` is mandatory.
   */
  export const ToolStateCompleted = z
    .object({
      status: z.literal("completed"),
      input: z.record(z.string(), z.any()),
      // Tool result as plain text.
      output: z.string(),
      title: z.string(),
      metadata: z.record(z.string(), z.any()),
      time: z.object({
        // NOTE(review): start/end are presumably epoch milliseconds — confirm at the call site.
        start: z.number(),
        end: z.number(),
        // Optional number; its meaning is not visible in this hunk — TODO confirm.
        compacted: z.number().optional(),
      }),
    })
    .meta({
      ref: "ToolStateCompleted",
    })
  /** Static type inferred from the ToolStateCompleted schema. */
  export type ToolStateCompleted = z.infer<typeof ToolStateCompleted>
  /**
   * Schema for a tool call in the "error" state.
   * Records the call's `input`, the `error` text, optional `metadata`, and
   * both start and end times.
   */
  export const ToolStateError = z
    .object({
      status: z.literal("error"),
      input: z.record(z.string(), z.any()),
      // Error description as a plain string (not a structured error object).
      error: z.string(),
      metadata: z.record(z.string(), z.any()).optional(),
      time: z.object({
        // NOTE(review): presumably epoch milliseconds — confirm at the call site.
        start: z.number(),
        end: z.number(),
      }),
    })
    .meta({
      ref: "ToolStateError",
    })
  /** Static type inferred from the ToolStateError schema. */
  export type ToolStateError = z.infer<typeof ToolStateError>
  /**
   * Union of the four tool-call state schemas, discriminated on the `status`
   * literal ("pending", "running", "completed" or "error").
   */
  export const ToolState = z
    .discriminatedUnion("status", [ToolStatePending, ToolStateRunning, ToolStateCompleted, ToolStateError])
    .meta({
      ref: "ToolState",
    })
  const PartBase = z.object({
    id: z.string(),
    sessionID: z.string(),
@@ -134,17 +69,6 @@ export namespace MessageV2 {
  })
  export type ReasoningPart = z.infer<typeof ReasoningPart>
  /**
   * Message part describing one tool invocation. Extends PartBase with the
   * "tool" type tag, the call and tool identifiers, and the call's ToolState.
   */
  export const ToolPart = PartBase.extend({
    type: z.literal("tool"),
    // NOTE(review): presumably the provider-issued tool call id — confirm against the producer.
    callID: z.string(),
    // Tool name; the visible conversion code in this file builds a `tool-${tool}` part type from it.
    tool: z.string(),
    state: ToolState,
    // Optional free-form record; the visible conversion code forwards it as `callProviderMetadata`.
    metadata: z.record(z.string(), z.any()).optional(),
  }).meta({
    ref: "ToolPart",
  })
  /** Static type inferred from the ToolPart schema. */
  export type ToolPart = z.infer<typeof ToolPart>
  const FilePartSourceBase = z.object({
    text: z
      .object({
@@ -228,6 +152,83 @@ export namespace MessageV2 {
  })
  export type StepFinishPart = z.infer<typeof StepFinishPart>
  /**
   * Schema for a tool call in the "pending" state. It carries only the
   * `status` discriminator; no input, timing, or output fields are declared.
   * NOTE(review): the `ref` metadata presumably names this schema in generated
   * API/SDK output — confirm against the schema generator.
   */
  export const ToolStatePending = z
    .object({
      status: z.literal("pending"),
    })
    .meta({
      ref: "ToolStatePending",
    })
  /** Static type inferred from the ToolStatePending schema. */
  export type ToolStatePending = z.infer<typeof ToolStatePending>
  /**
   * Schema for a tool call in the "running" state.
   * `title` and `metadata` are optional here; only `status`, `input` and
   * `time.start` must be present.
   */
  export const ToolStateRunning = z
    .object({
      status: z.literal("running"),
      // Accepts any value; the completed/error schemas require a string-keyed record instead.
      input: z.any(),
      title: z.string().optional(),
      metadata: z.record(z.string(), z.any()).optional(),
      time: z.object({
        // NOTE(review): presumably an epoch-millisecond timestamp — confirm at the call site.
        start: z.number(),
      }),
    })
    .meta({
      ref: "ToolStateRunning",
    })
  /** Static type inferred from the ToolStateRunning schema. */
  export type ToolStateRunning = z.infer<typeof ToolStateRunning>
  /**
   * Schema for a tool call in the "completed" state.
   * Unlike the running state, `title` and `metadata` are required, `input`
   * must be a string-keyed record, and the textual `output` is mandatory.
   * A completed call may additionally carry file attachments.
   */
  export const ToolStateCompleted = z
    .object({
      status: z.literal("completed"),
      input: z.record(z.string(), z.any()),
      // Tool result as plain text.
      output: z.string(),
      title: z.string(),
      metadata: z.record(z.string(), z.any()),
      time: z.object({
        // NOTE(review): start/end are presumably epoch milliseconds — confirm at the call site.
        start: z.number(),
        end: z.number(),
        // Optional number; its meaning is not visible in this hunk — TODO confirm.
        compacted: z.number().optional(),
      }),
      // Optional list of file parts returned alongside `output`. FilePart is
      // declared elsewhere in this file, so this schema has to be evaluated after it.
      attachments: FilePart.array().optional(),
    })
    .meta({
      ref: "ToolStateCompleted",
    })
  /** Static type inferred from the ToolStateCompleted schema. */
  export type ToolStateCompleted = z.infer<typeof ToolStateCompleted>
  /**
   * Schema for a tool call in the "error" state.
   * Records the call's `input`, the `error` text, optional `metadata`, and
   * both start and end times.
   */
  export const ToolStateError = z
    .object({
      status: z.literal("error"),
      input: z.record(z.string(), z.any()),
      // Error description as a plain string (not a structured error object).
      error: z.string(),
      metadata: z.record(z.string(), z.any()).optional(),
      time: z.object({
        // NOTE(review): presumably epoch milliseconds — confirm at the call site.
        start: z.number(),
        end: z.number(),
      }),
    })
    .meta({
      ref: "ToolStateError",
    })
  /** Static type inferred from the ToolStateError schema. */
  export type ToolStateError = z.infer<typeof ToolStateError>
  /**
   * Union of the four tool-call state schemas, discriminated on the `status`
   * literal ("pending", "running", "completed" or "error").
   */
  export const ToolState = z
    .discriminatedUnion("status", [ToolStatePending, ToolStateRunning, ToolStateCompleted, ToolStateError])
    .meta({
      ref: "ToolState",
    })
  /**
   * Message part describing one tool invocation. Extends PartBase with the
   * "tool" type tag, the call and tool identifiers, and the call's ToolState.
   */
  export const ToolPart = PartBase.extend({
    type: z.literal("tool"),
    // NOTE(review): presumably the provider-issued tool call id — confirm against the producer.
    callID: z.string(),
    // Tool name; the visible conversion code in this file builds a `tool-${tool}` part type from it.
    tool: z.string(),
    state: ToolState,
    // Optional free-form record; the visible conversion code forwards it as `callProviderMetadata`.
    metadata: z.record(z.string(), z.any()).optional(),
  }).meta({
    ref: "ToolPart",
  })
  /** Static type inferred from the ToolPart schema. */
  export type ToolPart = z.infer<typeof ToolPart>
  const Base = z.object({
    id: z.string(),
    sessionID: z.string(),
@@ -531,7 +532,25 @@ export namespace MessageV2 {
                },
              ]
            if (part.type === "tool") {
-              if (part.state.status === "completed")
+              if (part.state.status === "completed") {
                if (part.state.attachments?.length) {
                  result.push({
                    id: Identifier.ascending("message"),
                    role: "user",
                    parts: [
                      {
                        type: "text",
                        text: `Tool ${part.tool} returned an attachment:`,
                      },
                      ...part.state.attachments.map((attachment) => ({
                        type: "file" as const,
                        url: attachment.url,
                        mediaType: attachment.mime,
                        filename: attachment.filename,
                      })),
                    ],
                  })
                }
                return [
                  {
                    type: ("tool-" + part.tool) as `tool-${string}`,
@@ -542,6 +561,7 @@ export namespace MessageV2 {
                    callProviderMetadata: part.metadata,
                  },
                ]
              }
              if (part.state.status === "error")
                return [
                  {
--- a/packages/opencode/src/session/prompt.ts
+++ b/packages/opencode/src/session/prompt.ts
@@ -457,6 +457,10 @@ export namespace SessionPrompt {
            abort: options.abortSignal!,
            messageID: input.processor.message.id,
            callID: options.toolCallId,
            extra: {
              modelID: input.modelID,
              providerID: input.providerID,
            },
            agent: input.agent.name,
            metadata: async (val) => {
              const match = input.processor.partFromToolCall(options.toolCallId)
@@ -989,6 +993,7 @@ export namespace SessionPrompt {
                        start: match.state.time.start,
                        end: Date.now(),
                      },
                      attachments: value.output.attachments,
                    },
                  })
                  delete toolcalls[value.toolCallId]
--- a/packages/opencode/src/tool/read.ts
+++ b/packages/opencode/src/tool/read.ts
@@ -7,6 +7,8 @@ import { FileTime } from "../file/time"
 import DESCRIPTION from "./read.txt"
 import { Filesystem } from "../util/filesystem"
 import { Instance } from "../project/instance"
 import { Provider } from "../provider/provider"
 import { Identifier } from "../id/id"
 const DEFAULT_READ_LIMIT = 2000
 const MAX_LINE_LENGTH = 2000
@@ -23,6 +25,8 @@ export const ReadTool = Tool.define("read", {
    if (!path.isAbsolute(filepath)) {
      filepath = path.join(process.cwd(), filepath)
    }
    const title = path.relative(Instance.worktree, filepath)
    if (!ctx.extra?.["bypassCwdCheck"] && !Filesystem.contains(Instance.directory, filepath)) {
      throw new Error(`File ${filepath} is not in the current working directory`)
    }
@@ -48,12 +52,45 @@ export const ReadTool = Tool.define("read", {
      throw new Error(`File not found: ${filepath}`)
    }
    const limit = params.limit ?? DEFAULT_READ_LIMIT
    const offset = params.offset || 0
    const isImage = isImageFile(filepath)
-    if (isImage) throw new Error(`This is an image file of type: ${isImage}\nUse a different tool to process images`)
+    const supportsImages = await (async () => {
      if (!ctx.extra?.["providerID"] || !ctx.extra?.["modelID"]) return false
      const providerID = ctx.extra["providerID"] as string
      const modelID = ctx.extra["modelID"] as string
      const model = await Provider.getModel(providerID, modelID).catch(() => undefined)
      if (!model) return false
      return model.info.modalities?.input?.includes("image") ?? false
    })()
    if (isImage) {
      if (!supportsImages) {
        throw new Error(`Failed to read image: ${filepath}, model may not be able to read images`)
      }
      const mime = file.type
      const msg = "Image read successfully"
      return {
        title,
        output: msg,
        metadata: {
          preview: msg,
        },
        attachments: [
          {
            id: Identifier.ascending("part"),
            sessionID: ctx.sessionID,
            messageID: ctx.messageID,
            type: "file",
            mime,
            url: `data:${mime};base64,${Buffer.from(await file.bytes()).toString("base64")}`,
          },
        ],
      }
    }
    const isBinary = await isBinaryFile(filepath, file)
    if (isBinary) throw new Error(`Cannot read binary file: ${filepath}`)
    const limit = params.limit ?? DEFAULT_READ_LIMIT
    const offset = params.offset || 0
    const lines = await file.text().then((text) => text.split("\n"))
    const raw = lines.slice(offset, offset + limit).map((line) => {
      return line.length > MAX_LINE_LENGTH ? line.substring(0, MAX_LINE_LENGTH) + "..." : line
@@ -76,7 +113,7 @@ export const ReadTool = Tool.define("read", {
    FileTime.read(ctx.sessionID, filepath)
    return {
-      title: path.relative(Instance.worktree, filepath),
+      title,
      output,
      metadata: {
        preview,
--- a/packages/opencode/src/tool/read.txt
+++ b/packages/opencode/src/tool/read.txt
@@ -7,6 +7,6 @@ Usage:
 - You can optionally specify a line offset and limit (especially handy for long files), but it's recommended to read the whole file by not providing these parameters
 - Any lines longer than 2000 characters will be truncated
 - Results are returned using cat -n format, with line numbers starting at 1
 - This tool cannot read binary files, including images
 - You have the capability to call multiple tools in a single response. It is always better to speculatively read multiple files as a batch that are potentially useful.
 - If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents.
 - You can read image files using this tool when the current model accepts image input; other binary files cannot be read.
--- a/packages/opencode/src/tool/tool.ts
+++ b/packages/opencode/src/tool/tool.ts
@@ -1,9 +1,11 @@
 import z from "zod/v4"
 import type { MessageV2 } from "../session/message-v2"
 export namespace Tool {
  /**
   * Open-ended bag of tool metadata: any string key mapped to any value.
   * Used as the constraint and default for the `M` type parameter of
   * `Tool.Context`.
   */
  interface Metadata {
    [key: string]: any
  }
  export type Context<M extends Metadata = Metadata> = {
    sessionID: string
    messageID: string
@@ -25,6 +27,7 @@ export namespace Tool {
        title: string
        metadata: M
        output: string
        attachments?: MessageV2.FilePart[]
      }>
    }>
  }