# Auto-generate captions with Gemini and Ittybit

Use Gemini for caption generation and Ittybit for video processing in an async pipeline.

Most captioning workflows force you to pick between accuracy and speed. Gemini's multimodal understanding gives you both -- it listens to the extracted audio track and returns timestamped captions that you convert to production-ready WebVTT, all without training a custom model. Ittybit handles the media pipeline: it extracts the audio track, then burns the captions into the final video.

## Extract audio with Ittybit

Start by pulling the audio track from the source video. Gemini works best with audio input for transcription, and extracting it avoids sending the full video file.

<CodeGroup labels={["TypeScript", "Python"]}>
```typescript
const videoUrl = "https://example.com/videos/interview.mp4";

// Create an Ittybit job that extracts the audio track as MP3.
const res = await fetch("https://api.ittybit.com/jobs", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    input: videoUrl,
    kind: "audio",
    options: {
      format: "mp3",
    },
  }),
});
// Fail fast on a non-2xx response instead of reading `id` from an error body.
if (!res.ok) {
  throw new Error(`Audio extraction request failed: HTTP ${res.status}`);
}
const audioTask = await res.json();

console.log("Audio extraction task:", audioTask.id);
// status: "queued" -> "processing" -> "succeeded"

````

```python

video_url = "https://example.com/videos/interview.mp4"

# Create an Ittybit job that extracts the audio track as MP3.
res = requests.post(
    "https://api.ittybit.com/jobs",
    headers={"Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}"},
    json={
        "input": video_url,
        "kind": "audio",
        "options": {
            "format": "mp3",
        },
    },
)
# Fail fast on a non-2xx response instead of reading "id" from an error body.
res.raise_for_status()
audio_task = res.json()

print("Audio extraction task:", audio_task["id"])
# status: "queued" -> "processing" -> "succeeded"
````

</CodeGroup>

Poll until the audio extraction completes:

<CodeGroup labels={["TypeScript", "Python"]}>
```typescript
/**
 * Polls an Ittybit job until it reports "succeeded" or "failed".
 *
 * @param taskId - ID returned when the job was created.
 * @param intervalMs - Delay between polls in milliseconds (default 2000).
 * @param timeoutMs - Maximum total wait in milliseconds (default: no limit).
 * @returns The job payload once its `status` is "succeeded".
 * @throws If a poll request returns a non-2xx status, the job reports
 *   "failed", or `timeoutMs` would be exceeded.
 */
async function waitForTask(
  taskId: string,
  intervalMs = 2000,
  timeoutMs = Infinity
): Promise<any> {
  const deadline = Date.now() + timeoutMs;

  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      },
    });
    // An error body has no `status` field to match, so without this check a
    // bad key or unknown ID would keep the loop polling forever.
    if (!res.ok) {
      throw new Error(`Polling task ${taskId} failed: HTTP ${res.status}`);
    }
    const task = await res.json();

    if (task.status === "succeeded") return task;
    if (task.status === "failed") throw new Error(`Task failed: ${task.error}`);

    // Stop before sleeping if the next poll would land past the deadline.
    if (Date.now() + intervalMs > deadline) {
      throw new Error(`Timed out waiting for task ${taskId}`);
    }
    await new Promise((r) => setTimeout(r, intervalMs));
  }
}

// Wait for the extraction job to finish, then keep the URL of the resulting MP3.
const completedAudio = await waitForTask(audioTask.id);
const { output_url: audioUrl } = completedAudio;

````

```python

def wait_for_task(
    task_id: str, interval: float = 2, timeout: float = float("inf")
) -> dict:
    """Poll an Ittybit job until it reports "succeeded" or "failed".

    Args:
        task_id: ID returned when the job was created.
        interval: Seconds to sleep between polls (default 2).
        timeout: Maximum total seconds to wait (default: no limit).

    Returns:
        The job payload once its "status" is "succeeded".

    Raises:
        TimeoutError: If ``timeout`` would be exceeded.
        Exception: If the job reports "failed". Non-2xx poll responses raise
            whatever ``raise_for_status()`` raises.
    """
    deadline = time.monotonic() + timeout

    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}"},
        )
        # Surface HTTP errors directly rather than as a KeyError on "status".
        res.raise_for_status()
        task = res.json()

        if task["status"] == "succeeded":
            return task
        if task["status"] == "failed":
            raise Exception(f"Task failed: {task.get('error')}")

        # Stop before sleeping if the next poll would land past the deadline.
        if time.monotonic() + interval > deadline:
            raise TimeoutError(f"Timed out waiting for task {task_id}")
        time.sleep(interval)

# Wait for the extraction job to finish, then keep the URL of the resulting MP3.
completed_audio = wait_for_task(task_id=audio_task["id"])
audio_url = completed_audio["output_url"]
````

</CodeGroup>

## Send audio to Gemini for captioning

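Upload the extracted audio to Gemini and ask it to transcribe with timestamps. Gemini returns structured caption data that you can convert directly to WebVTT. The TypeScript snippets use `GoogleGenerativeAI` from `@google/generative-ai` and `GoogleAIFileManager` from `@google/generative-ai/server`; the Python snippets use the `google-generativeai` package imported as `genai`.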

<CodeGroup labels={["TypeScript", "Python"]}>
```typescript

const genai = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
const fileManager = new GoogleAIFileManager(process.env.GEMINI_API_KEY!);

// uploadFile() takes a local path or Buffer, not a remote URL, so download
// the extracted MP3 into memory first.
const audioDownload = await fetch(audioUrl);
if (!audioDownload.ok) {
  throw new Error(`Audio download failed: HTTP ${audioDownload.status}`);
}
const audioBytes = Buffer.from(await audioDownload.arrayBuffer());

// Upload the extracted audio
const audioFile = await fileManager.uploadFile(audioBytes, {
  mimeType: "audio/mp3",
  displayName: "interview-audio",
});

// Request a JSON response so the reply parses without stripping Markdown fences.
const model = genai.getGenerativeModel({
  model: "gemini-2.0-flash",
  generationConfig: { responseMimeType: "application/json" },
});

const result = await model.generateContent([
  {
    fileData: {
      mimeType: audioFile.file.mimeType,
      fileUri: audioFile.file.uri,
    },
  },
  {
    text: `Transcribe this audio with precise timestamps.
Return ONLY a JSON array of caption segments, no other text.
Each segment should be 1-2 sentences and follow natural speech breaks.

Format:
[
{"start": "00:00:01.000", "end": "00:00:04.500", "text": "Caption text here."},
...
]

Use HH:MM:SS.mmm timestamp format.`,
  },
]);

const responseText = result.response.text();
const captions = JSON.parse(responseText);
// Guard against a well-formed but unexpected reply (e.g. an object or string).
if (!Array.isArray(captions)) {
  throw new Error("Gemini did not return a JSON array of caption segments");
}

console.log(`Generated ${captions.length} caption segments`);

````

```python

import io

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# upload_file() takes a local path or file object, not a remote URL, so
# download the extracted MP3 into memory first.
audio_download = requests.get(audio_url)
audio_download.raise_for_status()

# Upload the extracted audio
audio_file = genai.upload_file(
    io.BytesIO(audio_download.content), mime_type="audio/mp3"
)

# Request a JSON response so the reply parses without stripping Markdown fences.
model = genai.GenerativeModel(
    "gemini-2.0-flash",
    generation_config={"response_mime_type": "application/json"},
)

result = model.generate_content([
    audio_file,
    """Transcribe this audio with precise timestamps.
Return ONLY a JSON array of caption segments, no other text.
Each segment should be 1-2 sentences and follow natural speech breaks.

Format:
[
  {"start": "00:00:01.000", "end": "00:00:04.500", "text": "Caption text here."},
  ...
]

Use HH:MM:SS.mmm timestamp format.""",
])

captions = json.loads(result.text)
# Guard against a well-formed but unexpected reply (e.g. an object or string).
if not isinstance(captions, list):
    raise ValueError("Gemini did not return a JSON array of caption segments")

print(f"Generated {len(captions)} caption segments")
````

</CodeGroup>

## Convert to WebVTT

Turn Gemini's JSON output into a WebVTT file that any video player can use.

<CodeGroup labels={["TypeScript", "Python"]}>
```typescript
/**
 * Serializes caption segments into a WebVTT document.
 *
 * Each segment becomes a numbered cue. Cue text is made safe for WebVTT:
 * `&`, `<` and `>` are escaped (which also neutralizes a literal `-->`),
 * and blank or whitespace-only lines are dropped because a blank line
 * would end the cue early.
 *
 * @param captions - Segments with `start`/`end` timestamps (HH:MM:SS.mmm) and `text`.
 * @returns The WebVTT file contents; just the `WEBVTT` header when `captions` is empty.
 */
function toWebVTT(
  captions: { start: string; end: string; text: string }[]
): string {
  const cues = captions.map((cap, i) => {
    const text = cap.text
      // Escape `&` first so the entities added below are not double-escaped.
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => line.length > 0)
      .join("\n");

    return `${i + 1}\n${cap.start} --> ${cap.end}\n${text}\n\n`;
  });

  return "WEBVTT\n\n" + cues.join("");
}

// Build the WebVTT document from the caption segments.
const webvtt = toWebVTT(captions);

console.log(webvtt);
// Example output:
// WEBVTT
//
// 1
// 00:00:01.000 --> 00:00:04.500
// Welcome to the show, today we're talking about media APIs.
//
// 2
// 00:00:04.500 --> 00:00:08.200
// Our guest has been building video infrastructure for ten years.
// ...

````

```python
def to_webvtt(captions: list[dict]) -> str:
    """Serialize caption segments into a WebVTT document.

    Each segment becomes a numbered cue. Cue text is made safe for WebVTT:
    ``&``, ``<`` and ``>`` are escaped (which also neutralizes a literal
    ``-->``), and blank or whitespace-only lines are dropped because a blank
    line would end the cue early.

    Args:
        captions: Dicts with "start" and "end" timestamps (HH:MM:SS.mmm)
            and "text".

    Returns:
        The WebVTT file contents; just the ``WEBVTT`` header when
        ``captions`` is empty.
    """
    parts = ["WEBVTT\n\n"]

    for i, cap in enumerate(captions, 1):
        # Escape "&" first so the entities added next are not double-escaped.
        escaped = (
            cap["text"]
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )
        stripped = (line.strip() for line in escaped.splitlines())
        text = "\n".join(line for line in stripped if line)
        parts.append(f"{i}\n{cap['start']} --> {cap['end']}\n{text}\n\n")

    return "".join(parts)

# Build the WebVTT document from the caption segments.
webvtt = to_webvtt(captions)

print(webvtt)
# Example output:
# WEBVTT
#
# 1
# 00:00:01.000 --> 00:00:04.500
# Welcome to the show, today we're talking about media APIs.
#
# 2
# 00:00:04.500 --> 00:00:08.200
# Our guest has been building video infrastructure for ten years.
# ...
````

</CodeGroup>

## Burn captions into the video

Use Ittybit to produce a final video with the captions embedded. Pass the WebVTT content as a subtitle track in the video task.

<CodeGroup labels={["TypeScript", "Python"]}>
```typescript
// Create a video job on the original source, passing the WebVTT text in
// `options.subtitles`.
const videoRes = await fetch("https://api.ittybit.com/jobs", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    input: videoUrl,
    kind: "video",
    options: {
      format: "mp4",
      subtitles: webvtt,
    },
  }),
});
// Fail fast on a non-2xx response instead of reading `id` from an error body.
if (!videoRes.ok) {
  throw new Error(`Captioned video request failed: HTTP ${videoRes.status}`);
}
const videoTask = await videoRes.json();

const completed = await waitForTask(videoTask.id);

console.log("Captioned video:", completed.output_url);

````

```python
# Create a video job on the original source, passing the WebVTT text in
# options["subtitles"].
res = requests.post(
    "https://api.ittybit.com/jobs",
    headers={"Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}"},
    json={
        "input": video_url,
        "kind": "video",
        "options": {
            "format": "mp4",
            "subtitles": webvtt,
        },
    },
)
# Fail fast on a non-2xx response instead of reading "id" from an error body.
res.raise_for_status()
video_task = res.json()

completed = wait_for_task(video_task["id"])

print("Captioned video:", completed["output_url"])
````

</CodeGroup>

## Full pipeline

Here's the complete flow as a single function -- extract audio, generate captions with Gemini, and produce the captioned video.

<CodeGroup labels={["TypeScript", "Python"]}>
```typescript

// One Gemini API key backs both clients: `genai` generates content and
// `fileManager` uploads the audio file.
const geminiApiKey = process.env.GEMINI_API_KEY!;
const genai = new GoogleGenerativeAI(geminiApiKey);
const fileManager = new GoogleAIFileManager(geminiApiKey);

/**
 * Polls an Ittybit job until it reports "succeeded" or "failed".
 *
 * @param taskId - ID returned when the job was created.
 * @param intervalMs - Delay between polls in milliseconds (default 2000).
 * @param timeoutMs - Maximum total wait in milliseconds (default: no limit).
 * @returns The job payload once its `status` is "succeeded".
 * @throws If a poll request returns a non-2xx status, the job reports
 *   "failed", or `timeoutMs` would be exceeded.
 */
async function waitForTask(
  taskId: string,
  intervalMs = 2000,
  timeoutMs = Infinity
): Promise<any> {
  const deadline = Date.now() + timeoutMs;

  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      },
    });
    // An error body has no `status` field to match, so without this check a
    // bad key or unknown ID would keep the loop polling forever.
    if (!res.ok) {
      throw new Error(`Polling task ${taskId} failed: HTTP ${res.status}`);
    }
    const task = await res.json();

    if (task.status === "succeeded") return task;
    if (task.status === "failed") throw new Error(`Task failed: ${task.error}`);

    // Stop before sleeping if the next poll would land past the deadline.
    if (Date.now() + intervalMs > deadline) {
      throw new Error(`Timed out waiting for task ${taskId}`);
    }
    await new Promise((r) => setTimeout(r, intervalMs));
  }
}

/**
 * Serializes caption segments into a WebVTT document.
 *
 * Each segment becomes a numbered cue. Cue text is made safe for WebVTT:
 * `&`, `<` and `>` are escaped (which also neutralizes a literal `-->`),
 * and blank or whitespace-only lines are dropped because a blank line
 * would end the cue early.
 *
 * @param captions - Segments with `start`/`end` timestamps (HH:MM:SS.mmm) and `text`.
 * @returns The WebVTT file contents; just the `WEBVTT` header when `captions` is empty.
 */
function toWebVTT(
  captions: { start: string; end: string; text: string }[]
): string {
  const cues = captions.map((cap, i) => {
    const text = cap.text
      // Escape `&` first so the entities added below are not double-escaped.
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => line.length > 0)
      .join("\n");

    return `${i + 1}\n${cap.start} --> ${cap.end}\n${text}\n\n`;
  });

  return "WEBVTT\n\n" + cues.join("");
}

/**
 * Runs the whole pipeline for one video: extract the audio with Ittybit,
 * transcribe it with Gemini, then create a video job with the captions.
 *
 * @param videoUrl - URL of the source video.
 * @returns The `output_url` of the finished captioned-video job.
 * @throws If any HTTP request returns a non-2xx status, an Ittybit job fails,
 *   or Gemini's reply is not a JSON array.
 */
async function autoCaptions(videoUrl: string): Promise<string> {
  const ittybitHeaders = {
    Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
    "Content-Type": "application/json",
  };

  // 1. Extract audio
  const audioRes = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: ittybitHeaders,
    body: JSON.stringify({
      input: videoUrl,
      kind: "audio",
      options: { format: "mp3" },
    }),
  });
  if (!audioRes.ok) {
    throw new Error(`Audio extraction request failed: HTTP ${audioRes.status}`);
  }
  const audioTask = await audioRes.json();
  const completedAudio = await waitForTask(audioTask.id);

  // 2. Send to Gemini for transcription
  // uploadFile() takes a local path or Buffer, not a remote URL, so download
  // the extracted MP3 into memory first.
  const audioDownload = await fetch(completedAudio.output_url);
  if (!audioDownload.ok) {
    throw new Error(`Audio download failed: HTTP ${audioDownload.status}`);
  }
  const audioBytes = Buffer.from(await audioDownload.arrayBuffer());

  const audioFile = await fileManager.uploadFile(audioBytes, {
    mimeType: "audio/mp3",
    displayName: "extracted-audio",
  });

  // Request a JSON response so the reply parses without stripping Markdown fences.
  const model = genai.getGenerativeModel({
    model: "gemini-2.0-flash",
    generationConfig: { responseMimeType: "application/json" },
  });
  const result = await model.generateContent([
    {
      fileData: {
        mimeType: audioFile.file.mimeType,
        fileUri: audioFile.file.uri,
      },
    },
    {
      text: `Transcribe this audio with precise timestamps.
Return ONLY a JSON array of caption segments, no other text.
Each segment should be 1-2 sentences and follow natural speech breaks.

Format:
[{"start": "00:00:01.000", "end": "00:00:04.500", "text": "Caption text here."}]

Use HH:MM:SS.mmm timestamp format.`,
    },
  ]);

  const captions = JSON.parse(result.response.text());
  // Guard against a well-formed but unexpected reply (e.g. an object or string).
  if (!Array.isArray(captions)) {
    throw new Error("Gemini did not return a JSON array of caption segments");
  }
  const webvtt = toWebVTT(captions);

  // 3. Burn captions into video
  const videoRes = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: ittybitHeaders,
    body: JSON.stringify({
      input: videoUrl,
      kind: "video",
      options: { format: "mp4", subtitles: webvtt },
    }),
  });
  if (!videoRes.ok) {
    throw new Error(`Captioned video request failed: HTTP ${videoRes.status}`);
  }
  const videoTask = await videoRes.json();
  const completedVideo = await waitForTask(videoTask.id);

  return completedVideo.output_url;
}

// Usage
const sourceVideoUrl = "https://example.com/videos/interview.mp4";
const outputUrl = await autoCaptions(sourceVideoUrl);
console.log("Captioned video ready:", outputUrl);

````

```python

# Configure the Gemini SDK with the key from the environment; the lookup
# raises KeyError if GEMINI_API_KEY is unset.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
# Read the Ittybit key once so the requests below can reuse it.
ITTYBIT_API_KEY = os.environ["ITTYBIT_API_KEY"]

def wait_for_task(
    task_id: str, interval: float = 2, timeout: float = float("inf")
) -> dict:
    """Poll an Ittybit job until it reports "succeeded" or "failed".

    Args:
        task_id: ID returned when the job was created.
        interval: Seconds to sleep between polls (default 2).
        timeout: Maximum total seconds to wait (default: no limit).

    Returns:
        The job payload once its "status" is "succeeded".

    Raises:
        TimeoutError: If ``timeout`` would be exceeded.
        Exception: If the job reports "failed". Non-2xx poll responses raise
            whatever ``raise_for_status()`` raises.
    """
    deadline = time.monotonic() + timeout

    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        )
        # Surface HTTP errors directly rather than as a KeyError on "status".
        res.raise_for_status()
        task = res.json()

        if task["status"] == "succeeded":
            return task
        if task["status"] == "failed":
            raise Exception(f"Task failed: {task.get('error')}")

        # Stop before sleeping if the next poll would land past the deadline.
        if time.monotonic() + interval > deadline:
            raise TimeoutError(f"Timed out waiting for task {task_id}")
        time.sleep(interval)

def to_webvtt(captions: list[dict]) -> str:
    """Serialize caption segments into a WebVTT document.

    Each segment becomes a numbered cue. Cue text is made safe for WebVTT:
    ``&``, ``<`` and ``>`` are escaped (which also neutralizes a literal
    ``-->``), and blank or whitespace-only lines are dropped because a blank
    line would end the cue early.

    Args:
        captions: Dicts with "start" and "end" timestamps (HH:MM:SS.mmm)
            and "text".

    Returns:
        The WebVTT file contents; just the ``WEBVTT`` header when
        ``captions`` is empty.
    """
    parts = ["WEBVTT\n\n"]
    for i, cap in enumerate(captions, 1):
        # Escape "&" first so the entities added next are not double-escaped.
        escaped = (
            cap["text"]
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )
        stripped = (line.strip() for line in escaped.splitlines())
        text = "\n".join(line for line in stripped if line)
        parts.append(f"{i}\n{cap['start']} --> {cap['end']}\n{text}\n\n")
    return "".join(parts)

def auto_captions(video_url: str) -> str:
    """Run the whole pipeline for one video.

    Extracts the audio with Ittybit, transcribes it with Gemini, then creates
    a video job with the captions.

    Args:
        video_url: URL of the source video.

    Returns:
        The "output_url" of the finished captioned-video job.

    Raises:
        ValueError: If Gemini's reply is not a JSON array.
        Exception: Propagated from failed HTTP requests or failed jobs.
    """
    import io  # local import: only needed to wrap the downloaded audio bytes

    # 1. Extract audio
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json={
            "input": video_url,
            "kind": "audio",
            "options": {"format": "mp3"},
        },
    )
    # Fail fast on a non-2xx response instead of reading "id" from an error body.
    res.raise_for_status()
    audio_task = res.json()
    completed_audio = wait_for_task(audio_task["id"])

    # 2. Send to Gemini for transcription
    # upload_file() takes a local path or file object, not a remote URL, so
    # download the extracted MP3 into memory first.
    audio_download = requests.get(completed_audio["output_url"])
    audio_download.raise_for_status()
    audio_file = genai.upload_file(
        io.BytesIO(audio_download.content), mime_type="audio/mp3"
    )

    # Request a JSON response so the reply parses without stripping Markdown fences.
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"},
    )
    result = model.generate_content([
        audio_file,
        """Transcribe this audio with precise timestamps.
Return ONLY a JSON array of caption segments, no other text.
Each segment should be 1-2 sentences and follow natural speech breaks.

Format:
[{"start": "00:00:01.000", "end": "00:00:04.500", "text": "Caption text here."}]

Use HH:MM:SS.mmm timestamp format.""",
    ])

    captions = json.loads(result.text)
    # Guard against a well-formed but unexpected reply (e.g. an object or string).
    if not isinstance(captions, list):
        raise ValueError("Gemini did not return a JSON array of caption segments")
    webvtt = to_webvtt(captions)

    # 3. Burn captions into video
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json={
            "input": video_url,
            "kind": "video",
            "options": {"format": "mp4", "subtitles": webvtt},
        },
    )
    res.raise_for_status()
    video_task = res.json()
    completed_video = wait_for_task(video_task["id"])

    return completed_video["output_url"]

# Usage
source_video_url = "https://example.com/videos/interview.mp4"
output_url = auto_captions(source_video_url)
print("Captioned video ready:", output_url)
````

</CodeGroup>

## See also

- [Extract audio from video](/guides/extract-audio-from-video) -- audio extraction options and formats
- [Video editing with Gemini](/guides/video-editing-with-gemini) -- AI-powered trimming and editing
- [HLS streaming](/guides/create-hls-streams) -- deliver captioned video via adaptive streaming