Auto-generate captions with Gemini and Ittybit

View Markdown

Most captioning workflows force you to pick between accuracy and speed. Gemini’s multimodal understanding gives you both — it listens to the audio track and returns timestamped caption segments that you convert into production-ready WebVTT, with no custom model to train. Ittybit handles the media pipeline: it extracts the audio track, then burns the captions into the final video.

Extract audio with Ittybit

Start by pulling the audio track from the source video. Gemini works best with audio input for transcription, and extracting it avoids sending the full video file.

// Source video to caption; the burn-in step later reuses this URL as its input.
const videoUrl = "https://example.com/videos/interview.mp4";

// Create an Ittybit job that extracts the audio track as MP3.
const res = await fetch("https://api.ittybit.com/jobs", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    input: videoUrl,
    kind: "audio",
    options: {
      format: "mp3",
    },
  }),
});

// Fail fast on HTTP errors instead of carrying an error payload (with no
// usable `id`) into the polling step.
if (!res.ok) {
  throw new Error(
    `Audio extraction request failed: ${res.status} ${res.statusText}`
  );
}
const audioTask = await res.json();

console.log("Audio extraction task:", audioTask.id);
// status: "queued" -> "processing" -> "succeeded"
import os
import requests

# Source video to caption; the burn-in step later reuses this URL as its input.
video_url = "https://example.com/videos/interview.mp4"

# Create an Ittybit job that extracts the audio track as MP3.
res = requests.post(
    "https://api.ittybit.com/jobs",
    headers={"Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}"},
    json={
        "input": video_url,
        "kind": "audio",
        "options": {
            "format": "mp3",
        },
    },
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# Fail fast on HTTP errors instead of hitting a KeyError on "id" below.
res.raise_for_status()
audio_task = res.json()

print("Audio extraction task:", audio_task["id"])
# status: "queued" -> "processing" -> "succeeded"

Poll until the audio extraction completes:

/**
 * Polls an Ittybit job every two seconds until it succeeds or fails.
 *
 * @param taskId - The `id` returned when the job was created.
 * @returns The job payload once its `status` is `"succeeded"`.
 * @throws Error if a poll request returns a non-2xx response, or if the job
 *   reports `status: "failed"`.
 */
async function waitForTask(taskId: string): Promise<any> {
  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      },
    });

    // An HTTP error body may carry no `status` field, in which case neither
    // branch below would fire and the loop would never terminate.
    if (!res.ok) {
      throw new Error(
        `Polling task ${taskId} failed: ${res.status} ${res.statusText}`
      );
    }
    const task = await res.json();

    if (task.status === "succeeded") return task;
    if (task.status === "failed") throw new Error(`Task failed: ${task.error}`);

    // Any other status: wait before asking again.
    await new Promise((r) => setTimeout(r, 2000));
  }
}

// Wait for the extraction job to finish, then read `output_url` from the
// finished task; the Gemini step uses it as the audio source.
const completedAudio = await waitForTask(audioTask.id);
const audioUrl = completedAudio.output_url;
import time


def wait_for_task(task_id: str) -> dict:
    """Poll an Ittybit job every two seconds until it succeeds or fails.

    Args:
        task_id: The ``id`` returned when the job was created.

    Returns:
        The job payload once its ``status`` is ``"succeeded"``.

    Raises:
        requests.HTTPError: If a poll request returns a 4xx/5xx response.
        RuntimeError: If the job reports ``status == "failed"``.
    """
    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}"},
            timeout=30,  # requests waits indefinitely unless a timeout is given
        )
        # Surface HTTP errors directly rather than as a KeyError on "status".
        res.raise_for_status()
        task = res.json()

        if task["status"] == "succeeded":
            return task
        if task["status"] == "failed":
            # .get() so a missing "error" field cannot mask the failure itself.
            raise RuntimeError(f"Task failed: {task.get('error')}")

        # Any other status: wait before asking again.
        time.sleep(2)


# Wait for the extraction job to finish, then read "output_url" from the
# finished task; the Gemini step uses it as the audio source.
completed_audio = wait_for_task(audio_task["id"])
audio_url = completed_audio["output_url"]

Send audio to Gemini for captioning

Upload the extracted audio to Gemini and ask it to transcribe with timestamps. Gemini returns structured caption data that you can convert directly to WebVTT.

import { GoogleGenerativeAI } from "@google/generative-ai";
import { GoogleAIFileManager } from "@google/generative-ai/server";

// Fail fast with a clear message when the key is missing, rather than letting
// `undefined` reach the SDK constructors.
const geminiApiKey = process.env.GEMINI_API_KEY;
if (!geminiApiKey) {
  throw new Error("GEMINI_API_KEY environment variable is not set");
}

const genai = new GoogleGenerativeAI(geminiApiKey);
const fileManager = new GoogleAIFileManager(geminiApiKey);

// `uploadFile` reads from a local path (or, in recent SDK versions, a Buffer);
// it does not fetch remote URLs, so download the extracted audio first.
const audioDownload = await fetch(audioUrl);
if (!audioDownload.ok) {
  throw new Error(
    `Audio download failed: ${audioDownload.status} ${audioDownload.statusText}`
  );
}
const audioBytes = Buffer.from(await audioDownload.arrayBuffer());

// Upload the extracted audio
const audioFile = await fileManager.uploadFile(audioBytes, {
  mimeType: "audio/mp3",
  displayName: "interview-audio",
});

// Ask for JSON output so the reply can be parsed directly instead of arriving
// wrapped in Markdown code fences.
const model = genai.getGenerativeModel({
  model: "gemini-2.0-flash",
  generationConfig: { responseMimeType: "application/json" },
});

const result = await model.generateContent([
  {
    fileData: {
      mimeType: audioFile.file.mimeType,
      fileUri: audioFile.file.uri,
    },
  },
  {
    text: `Transcribe this audio with precise timestamps.
Return ONLY a JSON array of caption segments, no other text.
Each segment should be 1-2 sentences and follow natural speech breaks.

Format:
[
{"start": "00:00:01.000", "end": "00:00:04.500", "text": "Caption text here."},
...
]

Use HH:MM:SS.mmm timestamp format.`,
  },
]);

const responseText = result.response.text();

// Model output is untrusted: confirm it is an array before treating it as one.
const parsed: unknown = JSON.parse(responseText);
if (!Array.isArray(parsed)) {
  throw new Error("Expected a JSON array of caption segments from Gemini");
}
const captions = parsed as { start: string; end: string; text: string }[];

console.log(`Generated ${captions.length} caption segments`);
import io
import json

import google.generativeai as genai

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# upload_file reads from a local path or a file-like object; it does not fetch
# remote URLs, so download the extracted audio first.
audio_res = requests.get(audio_url, timeout=60)
audio_res.raise_for_status()

# Upload the extracted audio
audio_file = genai.upload_file(io.BytesIO(audio_res.content), mime_type="audio/mp3")

# Ask for JSON output so the reply can be parsed directly instead of arriving
# wrapped in Markdown code fences.
model = genai.GenerativeModel(
    "gemini-2.0-flash",
    generation_config={"response_mime_type": "application/json"},
)

result = model.generate_content([
    audio_file,
    """Transcribe this audio with precise timestamps.
Return ONLY a JSON array of caption segments, no other text.
Each segment should be 1-2 sentences and follow natural speech breaks.

Format:
[
  {"start": "00:00:01.000", "end": "00:00:04.500", "text": "Caption text here."},
  ...
]

Use HH:MM:SS.mmm timestamp format.""",
])

captions = json.loads(result.text)

print(f"Generated {len(captions)} caption segments")

Convert to WebVTT

Turn Gemini’s JSON output into a WebVTT file that any video player can use.

/**
 * Serialises caption segments as a WebVTT document.
 *
 * Each segment becomes one cue: a 1-based cue number, a `start --> end`
 * timing line, and the caption text, followed by a blank line.
 *
 * @param captions - Segments whose `start`, `end` and `text` are written out
 *   verbatim.
 * @returns The `WEBVTT` header followed by one cue per segment.
 */
function toWebVTT(
  captions: { start: string; end: string; text: string }[]
): string {
  const cues = captions.map(
    ({ start, end, text }, index) =>
      `${index + 1}\n${start} --> ${end}\n${text}\n\n`
  );

  return ["WEBVTT\n\n", ...cues].join("");
}

// Convert Gemini's caption segments to WebVTT and print the result.
const webvtt = toWebVTT(captions);

console.log(webvtt);
// Example output:
// WEBVTT
//
// 1
// 00:00:01.000 --> 00:00:04.500
// Welcome to the show, today we're talking about media APIs.
//
// 2
// 00:00:04.500 --> 00:00:08.200
// Our guest has been building video infrastructure for ten years.
// ...
def to_webvtt(captions: list[dict]) -> str:
    """Serialise caption segments as a WebVTT document.

    Each segment becomes one cue: a 1-based cue number, a ``start --> end``
    timing line, and the caption text, followed by a blank line.

    Args:
        captions: Dicts with ``"start"``, ``"end"`` and ``"text"`` keys, whose
            values are written out verbatim.

    Returns:
        The ``WEBVTT`` header followed by one cue per segment.
    """
    cues = [
        f"{number}\n{segment['start']} --> {segment['end']}\n{segment['text']}\n\n"
        for number, segment in enumerate(captions, start=1)
    ]
    return "WEBVTT\n\n" + "".join(cues)


# Convert Gemini's caption segments to WebVTT and print the result.
webvtt = to_webvtt(captions)

print(webvtt)
# Example output:
# WEBVTT
#
# 1
# 00:00:01.000 --> 00:00:04.500
# Welcome to the show, today we're talking about media APIs.
#
# 2
# 00:00:04.500 --> 00:00:08.200
# Our guest has been building video infrastructure for ten years.
# ...

Burn captions into the video

Use Ittybit to produce a final video with the captions embedded. Pass the WebVTT content as a subtitle track in the video task.

// Create a video job that re-encodes the source with the WebVTT captions
// passed as `options.subtitles`.
const videoRes = await fetch("https://api.ittybit.com/jobs", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    input: videoUrl,
    kind: "video",
    options: {
      format: "mp4",
      subtitles: webvtt,
    },
  }),
});

// Fail fast on HTTP errors instead of polling a job that was never created.
if (!videoRes.ok) {
  throw new Error(
    `Captioned video request failed: ${videoRes.status} ${videoRes.statusText}`
  );
}
const videoTask = await videoRes.json();

const completed = await waitForTask(videoTask.id);

console.log("Captioned video:", completed.output_url);
# Create a video job that re-encodes the source with the WebVTT captions
# passed as options["subtitles"].
res = requests.post(
    "https://api.ittybit.com/jobs",
    headers={"Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}"},
    json={
        "input": video_url,
        "kind": "video",
        "options": {
            "format": "mp4",
            "subtitles": webvtt,
        },
    },
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# Fail fast on HTTP errors instead of hitting a KeyError on "id" below.
res.raise_for_status()
video_task = res.json()

completed = wait_for_task(video_task["id"])

print("Captioned video:", completed["output_url"])

Full pipeline

Here’s the complete flow as a single function — extract audio, generate captions with Gemini, and produce the captioned video.

import { GoogleGenerativeAI } from "@google/generative-ai";
import { GoogleAIFileManager } from "@google/generative-ai/server";

// Fail fast with a clear message when the key is missing, rather than letting
// `undefined` reach the SDK constructors.
const geminiApiKey = process.env.GEMINI_API_KEY;
if (!geminiApiKey) {
  throw new Error("GEMINI_API_KEY environment variable is not set");
}

const genai = new GoogleGenerativeAI(geminiApiKey);
const fileManager = new GoogleAIFileManager(geminiApiKey);

/**
 * Polls an Ittybit job every two seconds until it succeeds or fails.
 *
 * @param taskId - The `id` returned when the job was created.
 * @returns The job payload once its `status` is `"succeeded"`.
 * @throws Error if a poll request returns a non-2xx response, or if the job
 *   reports `status: "failed"`.
 */
async function waitForTask(taskId: string): Promise<any> {
  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      },
    });

    // An HTTP error body may carry no `status` field, in which case neither
    // branch below would fire and the loop would never terminate.
    if (!res.ok) {
      throw new Error(
        `Polling task ${taskId} failed: ${res.status} ${res.statusText}`
      );
    }
    const task = await res.json();

    if (task.status === "succeeded") return task;
    if (task.status === "failed") throw new Error(`Task failed: ${task.error}`);

    // Any other status: wait before asking again.
    await new Promise((r) => setTimeout(r, 2000));
  }
}

/**
 * Builds a WebVTT document from caption segments.
 *
 * Writes the `WEBVTT` header, then one cue per segment: a 1-based cue number,
 * the `start --> end` timing line, the text, and a trailing blank line.
 *
 * @param captions - Segments whose fields are emitted verbatim.
 * @returns The complete WebVTT text.
 */
function toWebVTT(
  captions: { start: string; end: string; text: string }[]
): string {
  return captions.reduce(
    (doc, cue, position) =>
      doc + `${position + 1}\n${cue.start} --> ${cue.end}\n${cue.text}\n\n`,
    "WEBVTT\n\n"
  );
}

/**
 * Creates an Ittybit job and returns the parsed response body.
 *
 * @param payload - JSON body for `POST /jobs`.
 * @throws Error if the API responds with a non-2xx status.
 */
async function submitIttybitJob(
  payload: Record<string, unknown>
): Promise<any> {
  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });
  if (!res.ok) {
    throw new Error(
      `Ittybit job request failed: ${res.status} ${res.statusText}`
    );
  }
  return res.json();
}

/**
 * Runs the whole captioning pipeline for one video: extracts its audio with
 * Ittybit, asks Gemini for timestamped caption segments, converts them to
 * WebVTT, and submits a video job with those subtitles.
 *
 * @param videoUrl - URL of the source video.
 * @returns The `output_url` of the finished captioned-video job.
 * @throws Error if any HTTP request fails, a job reports failure, or Gemini's
 *   reply is not a JSON array.
 */
async function autoCaptions(videoUrl: string): Promise<string> {
  // 1. Extract audio
  const audioTask = await submitIttybitJob({
    input: videoUrl,
    kind: "audio",
    options: { format: "mp3" },
  });
  const completedAudio = await waitForTask(audioTask.id);

  // 2. Send to Gemini for transcription
  // `uploadFile` reads from a local path (or, in recent SDK versions, a
  // Buffer); it does not fetch remote URLs, so download the audio first.
  const audioDownload = await fetch(completedAudio.output_url);
  if (!audioDownload.ok) {
    throw new Error(
      `Audio download failed: ${audioDownload.status} ${audioDownload.statusText}`
    );
  }
  const audioBytes = Buffer.from(await audioDownload.arrayBuffer());

  const audioFile = await fileManager.uploadFile(audioBytes, {
    mimeType: "audio/mp3",
    displayName: "extracted-audio",
  });

  // Ask for JSON output so the reply can be parsed directly instead of
  // arriving wrapped in Markdown code fences.
  const model = genai.getGenerativeModel({
    model: "gemini-2.0-flash",
    generationConfig: { responseMimeType: "application/json" },
  });
  const result = await model.generateContent([
    {
      fileData: {
        mimeType: audioFile.file.mimeType,
        fileUri: audioFile.file.uri,
      },
    },
    {
      text: `Transcribe this audio with precise timestamps.
Return ONLY a JSON array of caption segments, no other text.
Each segment should be 1-2 sentences and follow natural speech breaks.

Format:
[{"start": "00:00:01.000", "end": "00:00:04.500", "text": "Caption text here."}]

Use HH:MM:SS.mmm timestamp format.`,
    },
  ]);

  // Model output is untrusted: confirm it is an array before treating it as one.
  const parsed: unknown = JSON.parse(result.response.text());
  if (!Array.isArray(parsed)) {
    throw new Error("Expected a JSON array of caption segments from Gemini");
  }
  const webvtt = toWebVTT(
    parsed as { start: string; end: string; text: string }[]
  );

  // 3. Burn captions into video
  const videoTask = await submitIttybitJob({
    input: videoUrl,
    kind: "video",
    options: { format: "mp4", subtitles: webvtt },
  });
  const completedVideo = await waitForTask(videoTask.id);

  return completedVideo.output_url;
}

// Usage: run the pipeline for one video and log the URL that autoCaptions
// resolves with.
const outputUrl = await autoCaptions(
"https://example.com/videos/interview.mp4"
);
console.log("Captioned video ready:", outputUrl);
import json
import os
import time

import google.generativeai as genai
import requests

# Configure the Gemini SDK and read the Ittybit key once at import time.
# Both lookups raise KeyError if the environment variable is not set.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
ITTYBIT_API_KEY = os.environ["ITTYBIT_API_KEY"]


def wait_for_task(task_id: str) -> dict:
    """Poll an Ittybit job every two seconds until it succeeds or fails.

    Args:
        task_id: The ``id`` returned when the job was created.

    Returns:
        The job payload once its ``status`` is ``"succeeded"``.

    Raises:
        requests.HTTPError: If a poll request returns a 4xx/5xx response.
        RuntimeError: If the job reports ``status == "failed"``.
    """
    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            timeout=30,  # requests waits indefinitely unless a timeout is given
        )
        # Surface HTTP errors directly rather than as a KeyError on "status".
        res.raise_for_status()
        task = res.json()

        if task["status"] == "succeeded":
            return task
        if task["status"] == "failed":
            # .get() so a missing "error" field cannot mask the failure itself.
            raise RuntimeError(f"Task failed: {task.get('error')}")

        # Any other status: wait before asking again.
        time.sleep(2)


def to_webvtt(captions: list[dict]) -> str:
    """Build a WebVTT document from caption segments.

    Writes the ``WEBVTT`` header, then one cue per segment: a 1-based cue
    number, the ``start --> end`` timing line, the text, and a blank line.

    Args:
        captions: Dicts with ``"start"``, ``"end"`` and ``"text"`` keys, whose
            values are emitted verbatim.

    Returns:
        The complete WebVTT text.
    """
    parts = ["WEBVTT\n\n"]
    for position, cue in enumerate(captions, start=1):
        parts.append(f"{position}\n{cue['start']} --> {cue['end']}\n{cue['text']}\n\n")
    return "".join(parts)


def _create_ittybit_job(payload: dict) -> dict:
    """Create an Ittybit job and return the parsed response body.

    Raises:
        requests.HTTPError: If the API responds with a 4xx/5xx status.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json=payload,
        timeout=30,  # requests waits indefinitely unless a timeout is given
    )
    res.raise_for_status()
    return res.json()


def auto_captions(video_url: str) -> str:
    """Run the whole captioning pipeline for one video.

    Extracts the audio with Ittybit, asks Gemini for timestamped caption
    segments, converts them to WebVTT, and submits a video job with those
    subtitles.

    Args:
        video_url: URL of the source video.

    Returns:
        The ``output_url`` of the finished captioned-video job.

    Raises:
        requests.HTTPError: If an Ittybit request or the audio download fails.
    """
    import io  # local import: only needed to wrap the downloaded audio bytes

    # 1. Extract audio
    audio_task = _create_ittybit_job(
        {
            "input": video_url,
            "kind": "audio",
            "options": {"format": "mp3"},
        }
    )
    completed_audio = wait_for_task(audio_task["id"])

    # 2. Send to Gemini for transcription
    # upload_file reads from a local path or a file-like object; it does not
    # fetch remote URLs, so download the extracted audio first.
    audio_res = requests.get(completed_audio["output_url"], timeout=60)
    audio_res.raise_for_status()
    audio_file = genai.upload_file(
        io.BytesIO(audio_res.content), mime_type="audio/mp3"
    )

    # Ask for JSON output so the reply can be parsed directly instead of
    # arriving wrapped in Markdown code fences.
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"},
    )
    result = model.generate_content([
        audio_file,
        """Transcribe this audio with precise timestamps.
Return ONLY a JSON array of caption segments, no other text.
Each segment should be 1-2 sentences and follow natural speech breaks.

Format:
[{"start": "00:00:01.000", "end": "00:00:04.500", "text": "Caption text here."}]

Use HH:MM:SS.mmm timestamp format.""",
    ])

    captions = json.loads(result.text)
    webvtt = to_webvtt(captions)

    # 3. Burn captions into video
    video_task = _create_ittybit_job(
        {
            "input": video_url,
            "kind": "video",
            "options": {"format": "mp4", "subtitles": webvtt},
        }
    )
    completed_video = wait_for_task(video_task["id"])

    return completed_video["output_url"]


# Usage: run the pipeline for one video and print the URL that auto_captions
# returns.
output_url = auto_captions("https://example.com/videos/interview.mp4")
print("Captioned video ready:", output_url)

See also