Auto-generate alt text with GPT-4 Vision

View Markdown

Accessibility compliance (WCAG 2.1) requires descriptive alt text on every meaningful image (purely decorative images should carry empty alt text instead). Doing this manually across thousands of product images or video thumbnails doesn’t scale. This guide shows how to pair Ittybit’s image processing with GPT-4 Vision to generate alt text automatically — Ittybit handles frame extraction and resizing, GPT-4 Vision handles the description.

Extract a frame from video

Use kind: "image" with a video URL to pull a single frame. The start option sets the timestamp in seconds.

// Ittybit API key, read once from the environment when the module loads.
const { ITTYBIT_API_KEY } = process.env;

/**
 * Queue an Ittybit image job that extracts a single frame from a video.
 *
 * @param videoUrl - URL of the source video, sent as the job's `input`.
 * @param timestamp - Position of the frame in seconds, sent as `options.start`.
 * @returns The parsed JSON body of the job-creation response.
 * @throws Error when the API answers with a non-2xx status.
 */
async function extractFrame(videoUrl: string, timestamp: number) {
  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      kind: "image",
      input: videoUrl,
      options: {
        start: timestamp,
        format: "webp",
        width: 1024,
      },
    }),
  });

  // fetch() only rejects on network failure. Surface HTTP errors (bad key,
  // invalid input, rate limit) here instead of handing the caller an error
  // payload that it would mistake for a queued task.
  if (!res.ok) {
    const detail = await res.text();
    throw new Error(
      `Ittybit frame extraction failed: ${res.status} ${res.statusText} ${detail}`.trim()
    );
  }

  return res.json();
}

// Queue a job for the frame 3.5 seconds into the demo video.
const task = await extractFrame("https://example.com/product-demo.mp4", 3.5);
// { id: "task_abc123", status: "queued", ... }
import os
import requests

# Ittybit API key, read once at import time. Indexing os.environ (rather than
# calling .get) raises KeyError immediately if the variable is not set.
ITTYBIT_API_KEY: str = os.environ["ITTYBIT_API_KEY"]


def extract_frame(video_url: str, timestamp: float) -> dict:
    """Queue an Ittybit image job that extracts one frame from a video.

    Args:
        video_url: URL of the source video, sent as the job's ``input``.
        timestamp: Position of the frame in seconds, sent as ``options.start``.

    Returns:
        The parsed JSON body of the job-creation response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not respond within 30 seconds.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json={
            "kind": "image",
            "input": video_url,
            "options": {
                "start": timestamp,
                "format": "webp",
                "width": 1024,
            },
        },
        # requests applies no timeout by default; without one a stalled
        # connection would block this call indefinitely.
        timeout=30,
    )
    # Surface HTTP errors here instead of returning an error payload that the
    # caller would mistake for a queued task.
    res.raise_for_status()
    return res.json()


# Queue a job for the frame 3.5 seconds into the demo video.
task = extract_frame(
    video_url="https://example.com/product-demo.mp4",
    timestamp=3.5,
)
# {"id": "task_abc123", "status": "queued", ...}

Resize an existing image

For images you already have, use kind: "image" with the image URL. Resizing to 1024px wide keeps the file small enough for fast GPT-4 Vision inference without losing important detail.

/**
 * Queue an Ittybit image job that re-encodes an existing image as a
 * 1024px-wide, medium-quality WebP.
 *
 * @param imageUrl - URL of the source image, sent as the job's `input`.
 * @returns The parsed JSON body of the job-creation response.
 * @throws Error when the API answers with a non-2xx status.
 */
async function resizeImage(imageUrl: string) {
  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      kind: "image",
      input: imageUrl,
      options: {
        format: "webp",
        width: 1024,
        quality: "medium",
      },
    }),
  });

  // fetch() only rejects on network failure. Surface HTTP errors here rather
  // than handing the caller an error payload that it would mistake for a task.
  if (!res.ok) {
    const detail = await res.text();
    throw new Error(
      `Ittybit resize request failed: ${res.status} ${res.statusText} ${detail}`.trim()
    );
  }

  return res.json();
}
def resize_image(image_url: str) -> dict:
    """Queue an Ittybit image job that re-encodes an image as 1024px-wide WebP.

    Args:
        image_url: URL of the source image, sent as the job's ``input``.

    Returns:
        The parsed JSON body of the job-creation response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not respond within 30 seconds.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json={
            "kind": "image",
            "input": image_url,
            "options": {
                "format": "webp",
                "width": 1024,
                "quality": "medium",
            },
        },
        # requests applies no timeout by default; without one a stalled
        # connection would block this call indefinitely.
        timeout=30,
    )
    # Surface HTTP errors here instead of returning an error payload that the
    # caller would mistake for a queued task.
    res.raise_for_status()
    return res.json()

Poll for the result

Tasks run asynchronously. Poll until the status reaches succeeded or failed; a succeeded task includes the output URL.

/**
 * Poll an Ittybit job until it reaches a terminal status.
 *
 * @param taskId - ID returned when the job was created.
 * @param options - Optional polling controls. `intervalMs` is the delay
 *   between polls (default 2000). `timeoutMs` is the overall budget before
 *   giving up (default 300000, i.e. five minutes).
 * @returns The task payload once its status is "succeeded" or "failed".
 * @throws Error when a poll returns a non-2xx status or the budget runs out.
 */
async function waitForTask(
  taskId: string,
  options: { intervalMs?: number; timeoutMs?: number } = {}
): Promise<any> {
  const { intervalMs = 2000, timeoutMs = 5 * 60 * 1000 } = options;
  const deadline = Date.now() + timeoutMs;

  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${ITTYBIT_API_KEY}`,
      },
    });

    // An error response is not a task and may never show a terminal status,
    // so fail now instead of polling it forever.
    if (!res.ok) {
      throw new Error(
        `Ittybit task lookup failed for ${taskId}: ${res.status} ${res.statusText}`
      );
    }

    const task = await res.json();

    if (task.status === "succeeded" || task.status === "failed") {
      return task;
    }

    // Bound the total wait so a stuck job cannot hang the caller.
    if (Date.now() >= deadline) {
      throw new Error(`Timed out after ${timeoutMs} ms waiting for task ${taskId}`);
    }

    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
}

const completed = await waitForTask(task.id);

// waitForTask also resolves for failed tasks; stop here rather than reading
// an output that may not exist.
if (completed.status !== "succeeded") {
  throw new Error(`Ittybit task ${task.id} ended with status "${completed.status}"`);
}

const imageUrl = completed.output.url;
import time


def wait_for_task(task_id: str, interval: float = 2.0, timeout: float = 300.0) -> dict:
    """Poll an Ittybit job until it reaches a terminal status.

    Args:
        task_id: ID returned when the job was created.
        interval: Seconds to sleep between polls.
        timeout: Overall budget in seconds before giving up.

    Returns:
        The task payload once its status is "succeeded" or "failed".

    Raises:
        requests.HTTPError: If a poll answers with a 4xx/5xx status.
        requests.Timeout: If a single poll gets no response within 30 seconds.
        TimeoutError: If the task is still not finished when ``timeout`` elapses.
    """
    deadline = time.monotonic() + timeout

    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            # requests applies no timeout by default; bound each poll.
            timeout=30,
        )
        # An error response is not a task and may lack a "status" key, so fail
        # here with the HTTP error instead of a confusing KeyError below.
        res.raise_for_status()
        task = res.json()

        if task["status"] in ("succeeded", "failed"):
            return task

        # Bound the total wait so a stuck job cannot hang the caller.
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out after {timeout} s waiting for task {task_id}"
            )

        time.sleep(interval)


completed = wait_for_task(task["id"])

# wait_for_task also returns failed tasks; stop here rather than reading an
# output that may not exist.
if completed["status"] != "succeeded":
    raise RuntimeError(
        f"Ittybit task {task['id']} ended with status {completed['status']!r}"
    )

image_url = completed["output"]["url"]

Generate alt text with GPT-4 Vision

Send the processed image URL to GPT-4 Vision (the examples use the gpt-4o model) with a system prompt tuned for concise, accessible descriptions.

import OpenAI from "openai";

// Shared OpenAI client used by generateAltText. No options are passed, so the
// SDK's defaults apply.
// NOTE(review): presumably the API key comes from OPENAI_API_KEY — confirm.
const openai = new OpenAI();

/**
 * Ask the model for a short alt-text description of an image.
 *
 * @param imageUrl - URL of the image to describe, sent as an `image_url` part.
 * @returns The text of the first completion choice, or "" when the response
 *   has no choices or the choice has no content.
 */
async function generateAltText(imageUrl: string): Promise<string> {
  const response = await openai.chat.completions.create({
    model: "gpt-4o",
    // Alt text is one or two sentences, so a small cap is enough.
    max_tokens: 100,
    messages: [
      {
        role: "system",
        content:
          "You write alt text for images. Be concise (1-2 sentences). " +
          "Describe what is visible, not what it means. " +
          "Do not start with 'Image of' or 'Photo of'.",
      },
      {
        role: "user",
        content: [
          {
            type: "image_url",
            image_url: { url: imageUrl },
          },
        ],
      },
    ],
  });

  // Guard the index as well as the nullable content: an empty `choices` array
  // yields "" instead of a TypeError on `undefined.message`.
  return response.choices[0]?.message.content ?? "";
}

// Describe the processed image and print the generated alt text.
const altText = await generateAltText(imageUrl);
console.log(altText);
// "A hand holding a matte black wireless earbud case against a wooden desk."
from openai import OpenAI

# Shared OpenAI client used by generate_alt_text. No arguments are passed, so
# the SDK's defaults apply.
# NOTE(review): presumably the API key comes from OPENAI_API_KEY — confirm.
client = OpenAI()


def generate_alt_text(image_url: str) -> str:
    """Ask the model for a short alt-text description of the image at image_url.

    Returns the text of the first completion choice, or "" when that choice
    carries no content.
    """
    system_prompt = (
        "You write alt text for images. Be concise (1-2 sentences). "
        "Describe what is visible, not what it means. "
        "Do not start with 'Image of' or 'Photo of'."
    )
    # The user turn carries only the image; the instructions live in the
    # system prompt.
    image_part = {"type": "image_url", "image_url": {"url": image_url}}
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": [image_part]},
    ]

    completion = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=100,
        messages=conversation,
    )

    description = completion.choices[0].message.content
    return description if description else ""


# Describe the processed image and print the generated alt text.
alt_text = generate_alt_text(image_url=image_url)
print(alt_text)
# "A hand holding a matte black wireless earbud case against a wooden desk."

Batch processing

For a catalog of images, run the full pipeline in parallel — resize via Ittybit, then describe via GPT-4 Vision.

/** One catalog image to run through the alt-text pipeline. */
interface ImageRecord {
  /** Identifier echoed back alongside the generated alt text. */
  id: string;
  /** URL of the original image, used as the input of the resize job. */
  url: string;
}

/**
 * Run one image through the full pipeline: resize with Ittybit, then describe
 * the resized output with GPT-4 Vision.
 *
 * @param image - Catalog record whose `url` is resized and described.
 * @returns The image ID paired with its alt text, or `null` when the resize
 *   task failed or any step threw. Failures are logged and never rethrown.
 */
async function processOne(
  image: ImageRecord
): Promise<{ id: string; alt: string } | null> {
  try {
    // Resize with Ittybit
    const task = await resizeImage(image.url);
    const completed = await waitForTask(task.id);

    if (completed.status === "failed") {
      console.error(`Task failed for ${image.id}`);
      return null;
    }

    // Describe with GPT-4 Vision
    const altText = await generateAltText(completed.output.url);

    return { id: image.id, alt: altText };
  } catch (err: unknown) {
    // A thrown error (network failure, malformed response, OpenAI error) would
    // otherwise reject the caller's Promise.all and discard every other result
    // in the batch. Report it the same way as a failed task instead.
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`Processing failed for ${image.id}: ${reason}`);
    return null;
  }
}

// Catalog images to describe.
const images: ImageRecord[] = [
  { id: "prod_001", url: "https://example.com/images/shoe-front.jpg" },
  { id: "prod_002", url: "https://example.com/images/shoe-side.jpg" },
  { id: "prod_003", url: "https://example.com/images/shoe-sole.jpg" },
];

// Process up to 5 at a time
const CONCURRENCY = 5;
const results: ({ id: string; alt: string } | null)[] = [];

// Walk the catalog in fixed-size chunks. Each chunk runs in parallel and must
// finish before the next one starts.
for (let start = 0; start < images.length; start += CONCURRENCY) {
  const chunk = images.slice(start, start + CONCURRENCY);
  const chunkResults = await Promise.all(
    chunk.map((image) => processOne(image))
  );
  for (const outcome of chunkResults) {
    results.push(outcome);
  }
}

// Print one line per image that produced alt text; null entries are skipped.
results.forEach((entry) => {
  if (entry) console.log(`${entry.id}: ${entry.alt}`);
});
from concurrent.futures import ThreadPoolExecutor


def process_one(image: dict) -> dict | None:
    """Run one image through the pipeline: resize, then describe.

    Args:
        image: Record with an "id" and the "url" of the original image.

    Returns:
        ``{"id": ..., "alt": ...}`` on success, or ``None`` when the resize
        task failed or any step raised. Failures are printed, never re-raised.
    """
    try:
        # Resize with Ittybit
        task = resize_image(image["url"])
        completed = wait_for_task(task["id"])

        if completed["status"] == "failed":
            print(f"Task failed for {image['id']}")
            return None

        # Describe with GPT-4 Vision
        alt_text = generate_alt_text(completed["output"]["url"])
    except Exception as exc:
        # An exception raised in a worker would otherwise surface while the
        # caller consumes pool.map() and lose every remaining result. Report
        # it the same way as a failed task instead.
        print(f"Processing failed for {image.get('id')}: {exc}")
        return None

    return {"id": image["id"], "alt": alt_text}


# Catalog images to describe.
images = [
    {"id": "prod_001", "url": "https://example.com/images/shoe-front.jpg"},
    {"id": "prod_002", "url": "https://example.com/images/shoe-side.jpg"},
    {"id": "prod_003", "url": "https://example.com/images/shoe-sole.jpg"},
]

# At most 5 images are in flight at once; map() yields results in input order.
with ThreadPoolExecutor(max_workers=5) as executor:
    results = [outcome for outcome in executor.map(process_one, images)]

# Print one line per image that produced alt text; None entries are skipped.
for entry in filter(None, results):
    print(f"{entry['id']}: {entry['alt']}")

See also