# Auto-generate alt text with GPT-4 Vision

Use Ittybit for frame extraction and GPT-4 Vision for accessible image descriptions

Accessibility compliance (WCAG 2.1) requires a text alternative for every meaningful image. Writing alt text by hand across thousands of product images or video thumbnails doesn't scale. This guide shows how to pair Ittybit's image processing with GPT-4 Vision to generate alt text automatically -- Ittybit handles frame extraction and resizing, GPT-4 Vision handles the description.

## Extract a frame from video

Use `kind: "image"` with a video URL to pull a single frame. The `start` option sets the timestamp in seconds.

<CodeGroup labels={["TypeScript", "Python"]}>
```typescript
// Ittybit API key, read once from the environment.
// It is `undefined` when the ITTYBIT_API_KEY variable is not set.
const { ITTYBIT_API_KEY } = process.env;

/**
 * Creates an Ittybit image job that pulls a single frame out of a video.
 *
 * @param videoUrl - URL of the source video, sent as the job `input`.
 * @param timestamp - Position of the frame in seconds, sent as `options.start`.
 * @returns The parsed JSON body of the `POST /jobs` response.
 * @throws Error when the API responds with a non-2xx status.
 */
async function extractFrame(videoUrl: string, timestamp: number) {
  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      kind: "image",
      input: videoUrl,
      options: {
        start: timestamp,
        format: "webp",
        width: 1024,
      },
    }),
  });

  // Fail here instead of handing the caller an error payload that has no
  // task `id` to poll.
  if (!res.ok) {
    throw new Error(
      `Ittybit job creation failed: ${res.status} ${res.statusText}`
    );
  }

  return res.json();
}

// Grab the frame 3.5 seconds into the demo video.
const task = await extractFrame("https://example.com/product-demo.mp4", 3.5);
// Example response: { id: "task_abc123", status: "queued", ... }

```

```python

import os

import requests

# Ittybit API key, read from the environment; raises KeyError when it is unset.
ITTYBIT_API_KEY = os.environ["ITTYBIT_API_KEY"]

def extract_frame(video_url: str, timestamp: float) -> dict:
    """Create an Ittybit image job that extracts one frame from a video.

    Args:
        video_url: URL of the source video, sent as the job ``input``.
        timestamp: Position of the frame in seconds, sent as ``options.start``.

    Returns:
        The decoded JSON body of the ``POST /jobs`` response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json={
            "kind": "image",
            "input": video_url,
            "options": {
                "start": timestamp,
                "format": "webp",
                "width": 1024,
            },
        },
        timeout=30,  # requests waits indefinitely unless a timeout is given
    )
    # Surface API errors here instead of returning an error payload that has
    # no task "id" to poll.
    res.raise_for_status()
    return res.json()

# Grab the frame 3.5 seconds into the demo video.
task = extract_frame(
    video_url="https://example.com/product-demo.mp4",
    timestamp=3.5,
)
# Example response: {"id": "task_abc123", "status": "queued", ...}
```

</CodeGroup>

## Resize an existing image

For images you already have, use `kind: "image"` with the image URL. Resizing to 1024px wide keeps the file small enough for fast GPT-4 Vision inference without losing important detail.

<CodeGroup labels={["TypeScript", "Python"]}>
```typescript
/**
 * Creates an Ittybit image job that re-encodes an existing image as a
 * 1024px-wide WebP.
 *
 * @param imageUrl - URL of the source image, sent as the job `input`.
 * @returns The parsed JSON body of the `POST /jobs` response.
 * @throws Error when the API responds with a non-2xx status.
 */
async function resizeImage(imageUrl: string) {
  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      kind: "image",
      input: imageUrl,
      options: {
        format: "webp",
        width: 1024,
        quality: "medium",
      },
    }),
  });

  // Fail here instead of handing the caller an error payload that has no
  // task `id` to poll.
  if (!res.ok) {
    throw new Error(
      `Ittybit job creation failed: ${res.status} ${res.statusText}`
    );
  }

  return res.json();
}
```

```python
def resize_image(image_url: str) -> dict:
    """Create an Ittybit image job that re-encodes an image as 1024px-wide WebP.

    Args:
        image_url: URL of the source image, sent as the job ``input``.

    Returns:
        The decoded JSON body of the ``POST /jobs`` response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json={
            "kind": "image",
            "input": image_url,
            "options": {
                "format": "webp",
                "width": 1024,
                "quality": "medium",
            },
        },
        timeout=30,  # requests waits indefinitely unless a timeout is given
    )
    # Surface API errors here instead of returning an error payload that has
    # no task "id" to poll.
    res.raise_for_status()
    return res.json()
```

</CodeGroup>

## Poll for the result

Tasks run asynchronously. Poll until the status reaches `succeeded` or `failed`; a task that succeeded carries the output URL.

<CodeGroup labels={["TypeScript", "Python"]}>
```typescript
/**
 * Polls an Ittybit job until it reaches a terminal status.
 *
 * @param taskId - The `id` returned when the job was created.
 * @param intervalMs - Delay between polls in milliseconds (default 2000).
 * @param timeoutMs - Upper bound on the total wait in milliseconds
 *   (default five minutes).
 * @returns The job object once its status is `succeeded` or `failed`.
 * @throws Error if a poll returns a non-2xx response, or if the job is still
 *   not finished when the timeout is reached.
 */
async function waitForTask(
  taskId: string,
  intervalMs = 2000,
  timeoutMs = 5 * 60 * 1000
): Promise<any> {
  const deadline = Date.now() + timeoutMs;

  for (;;) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${ITTYBIT_API_KEY}`,
      },
    });

    // An error response has no usable `status` field, so polling on would
    // never terminate.
    if (!res.ok) {
      throw new Error(
        `Ittybit job lookup failed: ${res.status} ${res.statusText}`
      );
    }

    const task = await res.json();

    if (task.status === "succeeded" || task.status === "failed") {
      return task;
    }

    // Give up rather than loop forever on a job that never finishes.
    if (Date.now() + intervalMs > deadline) {
      throw new Error(
        `Timed out after ${timeoutMs} ms waiting for Ittybit task ${taskId} ` +
          `(last status: ${task.status})`
      );
    }

    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
}

const completed = await waitForTask(task.id);

// waitForTask also returns when the status is "failed"; only a succeeded
// task should be read for its output URL.
if (completed.status !== "succeeded") {
  throw new Error(
    `Ittybit task ${task.id} ended with status "${completed.status}"`
  );
}

const imageUrl = completed.output.url;

```

```python

import time


def wait_for_task(task_id: str, interval: float = 2.0, timeout: float = 300.0) -> dict:
    """Poll an Ittybit job until it reaches a terminal status.

    Args:
        task_id: The ``id`` returned when the job was created.
        interval: Seconds to sleep between polls.
        timeout: Upper bound, in seconds, on the total wait.

    Returns:
        The job dict once its status is ``succeeded`` or ``failed``.

    Raises:
        requests.HTTPError: If a poll answers with a 4xx/5xx status.
        TimeoutError: If the job is still unfinished when ``timeout`` is reached.
    """
    deadline = time.monotonic() + timeout

    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            timeout=30,  # requests waits indefinitely unless a timeout is given
        )
        # An error response has no usable "status", so polling on would
        # never terminate.
        res.raise_for_status()
        task = res.json()

        if task["status"] in ("succeeded", "failed"):
            return task

        # Give up rather than loop forever on a job that never finishes.
        if time.monotonic() + interval > deadline:
            raise TimeoutError(
                f"Timed out after {timeout}s waiting for Ittybit task {task_id} "
                f"(last status: {task['status']})"
            )

        time.sleep(interval)

completed = wait_for_task(task["id"])

# wait_for_task also returns when the status is "failed"; only a succeeded
# task should be read for its output URL.
if completed["status"] != "succeeded":
    raise RuntimeError(
        f"Ittybit task {task['id']} ended with status {completed['status']!r}"
    )

image_url = completed["output"]["url"]
```

</CodeGroup>

## Generate alt text with GPT-4 Vision

Send the processed image URL to GPT-4 Vision with a system prompt tuned for concise, accessible descriptions. The examples use the `gpt-4o` model, which accepts image inputs.

<CodeGroup labels={["TypeScript", "Python"]}>
```typescript

import OpenAI from "openai";

// Shared client for the requests below. No key is passed in, so the SDK uses
// its default configuration (the OPENAI_API_KEY environment variable).
const openai = new OpenAI();

/**
 * Asks the chat completions API for a short description of one image.
 *
 * @param imageUrl - URL of the image to describe, sent as an `image_url` part.
 * @returns The first choice's message content, or `""` when it is null.
 */
async function generateAltText(imageUrl: string): Promise<string> {
  const systemPrompt = [
    "You write alt text for images. Be concise (1-2 sentences). ",
    "Describe what is visible, not what it means. ",
    "Do not start with 'Image of' or 'Photo of'.",
  ].join("");

  const { choices } = await openai.chat.completions.create({
    model: "gpt-4o",
    max_tokens: 100,
    messages: [
      { role: "system", content: systemPrompt },
      {
        role: "user",
        content: [{ type: "image_url", image_url: { url: imageUrl } }],
      },
    ],
  });

  const description = choices[0].message.content;
  return description ?? "";
}

// Describe the image produced by the completed Ittybit task.
const altText = await generateAltText(imageUrl);
console.log(altText);
// Example output:
// "A hand holding a matte black wireless earbud case against a wooden desk."

```

```python
from openai import OpenAI

# Shared client reused by the calls below. No API key is passed explicitly,
# so it presumably comes from the environment (e.g. OPENAI_API_KEY) -- confirm.
client = OpenAI()

def generate_alt_text(image_url: str) -> str:
    """Ask the chat completions API for a short description of one image.

    Args:
        image_url: URL of the image to describe, sent as an ``image_url`` part.

    Returns:
        The first choice's message content, or ``""`` when it is empty or None.
    """
    system_prompt = "".join(
        [
            "You write alt text for images. Be concise (1-2 sentences). ",
            "Describe what is visible, not what it means. ",
            "Do not start with 'Image of' or 'Photo of'.",
        ]
    )
    system_message = {"role": "system", "content": system_prompt}
    user_message = {
        "role": "user",
        "content": [{"type": "image_url", "image_url": {"url": image_url}}],
    }

    response = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=100,
        messages=[system_message, user_message],
    )

    description = response.choices[0].message.content
    return description if description else ""

# Describe the image produced by the completed Ittybit task.
alt_text = generate_alt_text(image_url)
print(alt_text)
# Example output:
# "A hand holding a matte black wireless earbud case against a wooden desk."
```

</CodeGroup>

## Batch processing

For a catalog of images, run the full pipeline on several images at once -- resize via Ittybit, then describe via GPT-4 Vision. Both examples cap concurrency at five images.

<CodeGroup labels={["TypeScript", "Python"]}>
```typescript
/** One catalog image to generate alt text for. */
interface ImageRecord {
  /** Identifier echoed back alongside the generated alt text. */
  id: string;
  /** Source image URL passed to the resize step. */
  url: string;
}

/**
 * Runs the whole pipeline for one image: resize with Ittybit, wait for the
 * job, then describe the result with GPT-4 Vision.
 *
 * @param image - The catalog image to process.
 * @returns The image id paired with its alt text, or `null` when the Ittybit
 *   task failed or any step threw.
 */
async function processOne(
  image: ImageRecord
): Promise<{ id: string; alt: string } | null> {
  try {
    // Resize with Ittybit
    const task = await resizeImage(image.url);
    const completed = await waitForTask(task.id);

    if (completed.status === "failed") {
      console.error(`Task failed for ${image.id}`);
      return null;
    }

    // Describe with GPT-4 Vision
    const altText = await generateAltText(completed.output.url);

    return { id: image.id, alt: altText };
  } catch (err: unknown) {
    // A rejection here would reject the caller's Promise.all and throw away
    // the results of every other image in the batch, so report and move on.
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`Processing failed for ${image.id}: ${reason}`);
    return null;
  }
}

// Catalog entries to describe.
const images: ImageRecord[] = [
  { id: "prod_001", url: "https://example.com/images/shoe-front.jpg" },
  { id: "prod_002", url: "https://example.com/images/shoe-side.jpg" },
  { id: "prod_003", url: "https://example.com/images/shoe-sole.jpg" },
];

// Size of each chunk; every image in a chunk is processed concurrently.
const CONCURRENCY = 5;
const results: ({ id: string; alt: string } | null)[] = [];

// Work through the catalog one chunk at a time, waiting for the whole chunk
// to settle before starting the next.
for (let offset = 0; offset < images.length; offset += CONCURRENCY) {
  const chunk = images.slice(offset, offset + CONCURRENCY);
  const settled = await Promise.all(chunk.map(processOne));
  results.push(...settled);
}

// Print one line per image that produced alt text; null entries are skipped.
results.forEach((entry) => {
  if (entry) console.log(`${entry.id}: ${entry.alt}`);
});

```

```python
from concurrent.futures import ThreadPoolExecutor

def process_one(image: dict) -> dict | None:
    """Run the whole pipeline for one image.

    Resizes the image with Ittybit, waits for the job, then describes the
    result with GPT-4 Vision.

    Args:
        image: A dict with an ``id`` and a source ``url``.

    Returns:
        ``{"id": ..., "alt": ...}``, or ``None`` when the Ittybit task failed
        or any step raised.
    """
    try:
        # Resize with Ittybit
        task = resize_image(image["url"])
        completed = wait_for_task(task["id"])

        if completed["status"] == "failed":
            print(f"Task failed for {image['id']}")
            return None

        # Describe with GPT-4 Vision
        alt_text = generate_alt_text(completed["output"]["url"])

        return {"id": image["id"], "alt": alt_text}
    except Exception as exc:
        # An exception escaping a worker is re-raised while the caller iterates
        # pool.map, discarding the remaining results; report and move on.
        print(f"Processing failed for {image['id']}: {exc}")
        return None

# Catalog entries to describe.
images = [
    {"id": "prod_001", "url": "https://example.com/images/shoe-front.jpg"},
    {"id": "prod_002", "url": "https://example.com/images/shoe-side.jpg"},
    {"id": "prod_003", "url": "https://example.com/images/shoe-sole.jpg"},
]

# Up to five images are processed at once; results keep the input order.
with ThreadPoolExecutor(max_workers=5) as pool:
    results = [outcome for outcome in pool.map(process_one, images)]

# Print one line per image that produced alt text; None entries are skipped.
for entry in filter(None, results):
    print(f"{entry['id']}: {entry['alt']}")
```

</CodeGroup>

## See also

- [API `POST /jobs`](/api/create-job) with `kind: "image"` -- extract frames and resize images
- [Extract thumbnails from video](/guides/extract-thumbnails-from-video) -- frame extraction without AI
- [Responsive images](/guides/generate-responsive-image-sizes) -- generate multiple sizes for srcset
- [OpenAI Vision docs](https://platform.openai.com/docs/guides/vision)