Skip to content

GPT Vision

The GPT Vision middleware lets you interpret images using the OpenAI API. It is useful when you want to describe an image or generate a caption for it.

The OpenAI library comes bundled with message-kit.

import OpenAI from "openai";
 
// Module-level OpenAI client; its API key is read from the OPENAI_API_KEY environment variable.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
 
/**
 * Guesses an image's MIME type from its leading "magic" bytes.
 *
 * Recognizes PNG, GIF and WebP signatures. Anything else falls back to
 * `image/jpeg`, which is the type that was previously assumed for every input.
 *
 * @param bytes - Raw image bytes.
 * @returns The MIME type to advertise in the data URL.
 */
function detectImageMimeType(bytes: Uint8Array): string {
  // Out-of-range reads yield `undefined`, so short inputs simply fail to match.
  const hasSignature = (signature: readonly number[], offset = 0): boolean =>
    signature.every((byte, i) => bytes[offset + i] === byte);

  if (hasSignature([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a])) {
    return "image/png";
  }
  // "GIF8" prefix covers both GIF87a and GIF89a.
  if (hasSignature([0x47, 0x49, 0x46, 0x38])) {
    return "image/gif";
  }
  // WebP layout: "RIFF", a 4-byte size field, then "WEBP".
  if (
    hasSignature([0x52, 0x49, 0x46, 0x46]) &&
    hasSignature([0x57, 0x45, 0x42, 0x50], 8)
  ) {
    return "image/webp";
  }
  return "image/jpeg";
}

/**
 * Sends an image to the `gpt-4o` chat completions endpoint and returns the
 * model's textual answer.
 *
 * The image is embedded in the request as a base64 data URL whose MIME type is
 * detected from the image bytes.
 *
 * @param imageData - Raw bytes of the image; must not be empty.
 * @param systemPrompt - Instruction for the model; must contain non-whitespace text.
 * @returns The content of the first completion choice, or `null` when the
 *   response carries no choice or no content.
 * @throws Error if `imageData` is empty or `systemPrompt` is blank.
 * @throws Whatever the OpenAI client throws; the error is logged and rethrown.
 */
export async function vision(
  imageData: Uint8Array,
  systemPrompt: string,
): Promise<string | null> {
  if (!imageData?.length) {
    throw new Error("imageData is required and must not be empty");
  }
  if (!systemPrompt?.trim()) {
    throw new Error("systemPrompt is required and must not be empty");
  }

  const base64Image = Buffer.from(imageData).toString("base64");
  const dataUrl = `data:${detectImageMimeType(imageData)};base64,${base64Image}`;

  // Derive the message type from the client itself so the payload is checked
  // by the compiler instead of being cast to `any`.
  type ChatMessages = Parameters<
    typeof openai.chat.completions.create
  >[0]["messages"];

  // Build a fresh, self-contained message list for each vision request.
  // NOTE(review): the prompt is sent both as the system message and as the
  // user's text part; kept as-is to preserve behavior — confirm this is intended.
  const visionMessages: ChatMessages = [
    {
      role: "system",
      content: systemPrompt,
    },
    {
      role: "user",
      content: [
        { type: "text", text: systemPrompt },
        {
          type: "image_url",
          image_url: {
            url: dataUrl,
          },
        },
      ],
    },
  ];

  try {
    const response = await openai.chat.completions.create({
      model: "gpt-4o",
      messages: visionMessages,
    });
    // An empty `choices` array yields null rather than a TypeError.
    return response.choices[0]?.message.content ?? null;
  } catch (error) {
    console.error("Failed to interpret image with OpenAI:", error);
    throw error;
  }
}