Subtitle pipeline with Trigger.dev and Ittybit
When a video lands in your system, you often need subtitles — for accessibility, SEO, or because most social feeds autoplay on mute. This guide builds a four-step pipeline with Trigger.dev and Ittybit: extract the audio track, transcribe it, generate a WebVTT file, and store the subtitles alongside the video. Each step is independently retryable, so a transient Whisper timeout won’t force the pipeline to re-extract audio that was already extracted successfully.
Install dependencies
npm install @trigger.dev/sdk openai
Set your environment variables:
TRIGGER_SECRET_KEY=tr_dev_...
ITTYBIT_API_KEY=your_ittybit_api_key
OPENAI_API_KEY=sk-...
Ittybit helper
A thin wrapper around the Ittybit Task API. Used by the pipeline steps to create tasks and check status.
// Base URL of the Ittybit API; the helpers below append paths such as `/tasks` to it.
const ITTYBIT_API = 'https://api.ittybit.com';
/**
 * Creates a task through the Ittybit Task API.
 *
 * Sends `body` as JSON in a `POST {ITTYBIT_API}/tasks` request, authenticated
 * with the `ITTYBIT_API_KEY` environment variable as a bearer token.
 *
 * @param body - Task definition: the input URL, the task kind, and optional
 *   kind-specific options.
 * @returns The parsed JSON response, typed as the task's id, status, and
 *   (optionally) output URL.
 * @throws Error when the API answers with a non-2xx status.
 */
async function createIttybitTask(body: {
  input: string;
  kind: string;
  options?: Record<string, unknown>;
}) {
  const endpoint = `${ITTYBIT_API}/tasks`;
  const headers = {
    Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
    'Content-Type': 'application/json',
  };

  const response = await fetch(endpoint, {
    method: 'POST',
    headers,
    body: JSON.stringify(body),
  });

  if (response.ok) {
    return response.json() as Promise<{
      id: string;
      status: string;
      output_url?: string;
    }>;
  }

  throw new Error(`Ittybit API error: ${response.status}`);
}
/**
 * Polls an Ittybit task until it reaches a terminal status.
 *
 * Repeatedly requests `GET {ITTYBIT_API}/tasks/{taskId}` and inspects the
 * `status` field of the JSON response.
 *
 * @param taskId - ID of the Ittybit task to watch.
 * @param options - Optional tuning. The defaults (120 attempts, 3000 ms apart)
 *   match the original fixed polling budget of roughly six minutes.
 * @returns The task payload once its status is `'succeeded'`.
 * @throws Error when the status request fails with a non-retryable HTTP
 *   status, when the task reports `'failed'`, or when all attempts are used up.
 */
async function pollForCompletion(
  taskId: string,
  options: { maxAttempts?: number; intervalMs?: number } = {},
) {
  const { maxAttempts = 120, intervalMs = 3000 } = options;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const res = await fetch(`${ITTYBIT_API}/tasks/${taskId}`, {
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      },
    });

    // 429 and 5xx are treated as transient: keep polling instead of aborting
    // a task that may still be running on the Ittybit side.
    const transient = res.status === 429 || res.status >= 500;

    if (res.ok) {
      const current = (await res.json()) as {
        id: string;
        status: string;
        output_url?: string;
      };
      if (current.status === 'succeeded') return current;
      if (current.status === 'failed') throw new Error(`Task ${taskId} failed`);
    } else if (!transient) {
      // Without this check an auth or not-found error would be polled until
      // the attempt budget ran out and then be misreported as a timeout.
      throw new Error(`Ittybit API error: ${res.status}`);
    }

    // No point sleeping after the last attempt; fall through to the timeout.
    if (attempt < maxAttempts) {
      await new Promise((resolve) => setTimeout(resolve, intervalMs));
    }
  }

  throw new Error(`Task ${taskId} timed out`);
}
Define the subtitle pipeline
Each ctx.run call is a durable checkpoint. If step 2 fails, Trigger.dev retries it without re-running step 1. The pipeline flows: extract audio, transcribe, build WebVTT, store the result.
import { task } from '@trigger.dev/sdk/v3';
import OpenAI from 'openai';
const openai = new OpenAI();
/**
 * Trigger.dev task that produces WebVTT subtitles for a video.
 *
 * Pipeline: extract the audio track via Ittybit, transcribe it with Whisper,
 * render the segments as WebVTT, and store the result on the video record.
 *
 * Payload:
 * - `videoUrl`: URL of the source video handed to Ittybit.
 * - `videoId`: ID of the video record updated in the final step.
 *
 * Returns `{ videoId, segmentCount }`, where `segmentCount` is the number of
 * non-empty transcript segments written as cues.
 *
 * NOTE(review): the steps rely on `ctx.run(name, fn)` behaving as a step
 * runner. In the Trigger.dev v3 SDK `ctx.run` appears to be run metadata, not
 * a function — confirm against the SDK version in use. If it is metadata, the
 * steps need to become subtasks invoked with `triggerAndWait`.
 */
export const generateSubtitles = task({
  id: 'generate-subtitles',
  retry: { maxAttempts: 3 },
  run: async (
    payload: {
      videoUrl: string;
      videoId: string;
    },
    { ctx },
  ) => {
    const { videoUrl, videoId } = payload;

    // Step 1: Extract audio from video
    const audio = await ctx.run('extract-audio', async () => {
      // Named `extraction` so it does not shadow the imported `task` factory.
      const extraction = await createIttybitTask({
        input: videoUrl,
        kind: 'audio',
        options: { format: 'mp3' },
      });
      return pollForCompletion(extraction.id);
    });

    // Step 2: Transcribe the audio
    const segments = await ctx.run('transcribe', async () => {
      const audioUrl = audio.output_url;
      if (!audioUrl) {
        throw new Error(
          `Audio extraction for video ${videoId} returned no output_url`,
        );
      }

      const audioResponse = await fetch(audioUrl);
      if (!audioResponse.ok) {
        // Otherwise an HTML/JSON error body would be uploaded to Whisper as "audio".
        throw new Error(
          `Failed to download extracted audio: ${audioResponse.status}`,
        );
      }

      const file = new File([await audioResponse.arrayBuffer()], 'audio.mp3', {
        type: 'audio/mp3',
      });
      const transcription = await openai.audio.transcriptions.create({
        model: 'whisper-1',
        file,
        response_format: 'verbose_json',
        timestamp_granularities: ['segment'],
      });

      // A missing `segments` field is treated as an empty transcript, and
      // segments whose text is blank after trimming are dropped.
      return (transcription.segments ?? [])
        .map((seg) => ({
          start: seg.start,
          end: seg.end,
          text: seg.text.trim(),
        }))
        .filter((seg) => seg.text.length > 0);
    });

    // Step 3: Generate WebVTT
    const webvtt = await ctx.run('generate-webvtt', async () => {
      const cues = segments.map((seg, index) => {
        // Cue text must not contain raw `&`, `<`, the `-->` arrow, or line
        // breaks; escaping `>` also neutralises any literal `-->`.
        const text = seg.text
          .replace(/&/g, '&amp;')
          .replace(/</g, '&lt;')
          .replace(/>/g, '&gt;')
          .replace(/\s*[\r\n]+\s*/g, ' ');
        return (
          `${index + 1}\n` +
          `${formatTimestamp(seg.start)} --> ${formatTimestamp(seg.end)}\n` +
          `${text}\n\n`
        );
      });
      return 'WEBVTT\n\n' + cues.join('');
    });

    // Step 4: Store subtitles
    // NOTE(review): `db` is not defined in this snippet; presumably the
    // application's database client — confirm it is imported in the real file.
    const result = await ctx.run('store-subtitles', async () => {
      await db.video.update({
        where: { id: videoId },
        data: {
          subtitles_vtt: webvtt,
          subtitle_status: 'ready',
        },
      });
      return { videoId, segmentCount: segments.length };
    });

    return result;
  },
});
/**
 * Formats a time offset in seconds as a WebVTT timestamp (`HH:MM:SS.mmm`).
 *
 * The value is rounded to whole milliseconds first, and hours, minutes, and
 * seconds are then derived from that integer. This lets rounding carry into
 * the next unit: 59.9996 becomes `00:01:00.000` instead of the invalid
 * `00:00:60.000`.
 *
 * @param seconds - Offset from the start of the media, in seconds. Negative
 *   values are clamped to zero.
 * @returns The timestamp with hours padded to at least two digits.
 */
function formatTimestamp(seconds: number): string {
  const totalMs = Math.max(0, Math.round(seconds * 1000));
  const totalSeconds = Math.floor(totalMs / 1000);

  const ms = totalMs % 1000;
  const s = totalSeconds % 60;
  const m = Math.floor(totalSeconds / 60) % 60;
  const h = Math.floor(totalSeconds / 3600);

  const pad = (value: number, width: number): string =>
    String(value).padStart(width, '0');

  return `${pad(h, 2)}:${pad(m, 2)}:${pad(s, 2)}.${pad(ms, 3)}`;
}
Trigger from an Ittybit webhook
When Ittybit finishes ingesting a video, it can POST a webhook to your server. Use that event to kick off the subtitle pipeline automatically.
import { tasks } from "@trigger.dev/sdk/v3";
import type { generateSubtitles } from "./trigger/generate-subtitles";
// app/api/webhooks/ittybit/route.ts
/**
 * Webhook endpoint that starts the subtitle pipeline for completed videos.
 *
 * Reads the JSON event from the request, ignores anything that is not a
 * succeeded `video` event, and triggers the `generate-subtitles` task with the
 * event's `output_url` and `id`.
 *
 * Responses:
 * - 400 when the body is not a JSON object, or a matching event lacks a
 *   string `id` / `output_url`.
 * - 200 `"ignored"` for events that are not succeeded video events.
 * - JSON `{ runId }` once the task has been triggered.
 *
 * NOTE(review): the request is not authenticated here; verify the webhook's
 * origin (signature or shared secret) per Ittybit's webhook documentation.
 */
export async function POST(req: Request) {
  let event: unknown;
  try {
    event = await req.json();
  } catch {
    return new Response("invalid JSON", { status: 400 });
  }
  if (typeof event !== "object" || event === null) {
    return new Response("invalid payload", { status: 400 });
  }

  const { kind, status, id, output_url: outputUrl } = event as Record<
    string,
    unknown
  >;

  // Only process newly completed video uploads
  if (kind !== "video" || status !== "succeeded") {
    return new Response("ignored", { status: 200 });
  }

  // Never start a run with an undefined URL or id; it would only fail later,
  // inside the pipeline, with a less useful error.
  if (typeof id !== "string" || typeof outputUrl !== "string") {
    return new Response("missing id or output_url", { status: 400 });
  }

  const handle = await tasks.trigger<typeof generateSubtitles>(
    "generate-subtitles",
    {
      videoUrl: outputUrl,
      videoId: id,
    },
  );
  return Response.json({ runId: handle.id });
}
import { tasks } from "@trigger.dev/sdk/v3";
import type { generateSubtitles } from "./trigger/generate-subtitles";
/**
 * Express variant of the Ittybit webhook: triggers the `generate-subtitles`
 * task for succeeded `video` events and answers with the run ID.
 *
 * Reads the event from `req.body`, so a JSON body parser must be installed.
 * Other events are acknowledged with a bare 200. Malformed payloads get a 400.
 */
app.post("/webhooks/ittybit", async (req, res) => {
  try {
    const event = req.body;
    if (typeof event !== "object" || event === null) {
      return res.status(400).send("invalid payload");
    }
    if (event.kind !== "video" || event.status !== "succeeded") {
      return res.sendStatus(200);
    }
    if (typeof event.id !== "string" || typeof event.output_url !== "string") {
      return res.status(400).send("missing id or output_url");
    }

    const handle = await tasks.trigger<typeof generateSubtitles>(
      "generate-subtitles",
      {
        videoUrl: event.output_url,
        videoId: event.id,
      },
    );
    res.json({ runId: handle.id });
  } catch (err) {
    // Express 4 does not forward rejections from async handlers, so without
    // this catch a failed trigger would leave the request hanging.
    console.error("Failed to trigger generate-subtitles", err);
    res.status(500).json({ error: "failed to trigger subtitle pipeline" });
  }
});
Use wait tokens instead of polling
Polling works but burns compute while the audio extraction runs. For production, use Trigger.dev’s wait.for to pause the run until Ittybit calls back.
import { task, wait } from '@trigger.dev/sdk/v3';
import OpenAI from 'openai';
const openai = new OpenAI();
/**
 * Variant of the subtitle pipeline that waits for an Ittybit callback instead
 * of polling for the audio extraction result.
 *
 * Creates the Ittybit audio task, pauses on a wait keyed `audio-<taskId>`
 * (15 minute timeout), then transcribes the audio with Whisper, renders
 * WebVTT, and stores it on the video record.
 *
 * Returns `{ videoId, segmentCount }`, where `segmentCount` is the number of
 * non-empty transcript segments written as cues.
 *
 * NOTE(review): `wait.for({ id, timeout })` resolving to the webhook payload
 * is assumed here. Recent Trigger.dev SDKs appear to expose wait tokens through
 * `wait.createToken` / `wait.forToken` instead — confirm against the SDK
 * version in use.
 */
export const generateSubtitlesWithWait = task({
  id: 'generate-subtitles-wait',
  retry: { maxAttempts: 3 },
  run: async (payload: { videoUrl: string; videoId: string }) => {
    const { videoUrl, videoId } = payload;

    // Step 1: Create audio extraction task
    const audioTask = await createIttybitTask({
      input: videoUrl,
      kind: 'audio',
      options: { format: 'mp3' },
    });

    // Pause until Ittybit webhook resolves the wait token
    const audioResult = await wait.for<{ output_url: string }>({
      id: `audio-${audioTask.id}`,
      timeout: '15m',
    });
    if (!audioResult?.output_url) {
      throw new Error(
        `Audio extraction ${audioTask.id} completed without an output_url`,
      );
    }

    // Step 2: Transcribe
    const audioResponse = await fetch(audioResult.output_url);
    if (!audioResponse.ok) {
      // Otherwise an HTML/JSON error body would be uploaded to Whisper as "audio".
      throw new Error(
        `Failed to download extracted audio: ${audioResponse.status}`,
      );
    }
    const file = new File([await audioResponse.arrayBuffer()], 'audio.mp3', {
      type: 'audio/mp3',
    });
    const transcription = await openai.audio.transcriptions.create({
      model: 'whisper-1',
      file,
      response_format: 'verbose_json',
      timestamp_granularities: ['segment'],
    });

    // Read `segments` once. A missing field is treated as an empty transcript,
    // and segments whose text is blank after trimming are dropped.
    const segments = (transcription.segments ?? [])
      .map((seg) => ({ start: seg.start, end: seg.end, text: seg.text.trim() }))
      .filter((seg) => seg.text.length > 0);

    // Step 3: Build WebVTT
    const cues = segments.map((seg, index) => {
      // Cue text must not contain raw `&`, `<`, the `-->` arrow, or line
      // breaks; escaping `>` also neutralises any literal `-->`.
      const text = seg.text
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/\s*[\r\n]+\s*/g, ' ');
      return (
        `${index + 1}\n` +
        `${formatTimestamp(seg.start)} --> ${formatTimestamp(seg.end)}\n` +
        `${text}\n\n`
      );
    });
    const vtt = 'WEBVTT\n\n' + cues.join('');

    // Step 4: Store
    // NOTE(review): `db` is not defined in this snippet; presumably the
    // application's database client — confirm it is imported in the real file.
    await db.video.update({
      where: { id: videoId },
      data: { subtitles_vtt: vtt, subtitle_status: 'ready' },
    });

    return { videoId, segmentCount: segments.length };
  },
});
Then resolve the wait token when the Ittybit webhook fires:
import { runs } from '@trigger.dev/sdk/v3';
// POST /api/webhooks/ittybit
/**
 * Webhook endpoint that resumes a paused subtitle run.
 *
 * For an event whose `status` is `'succeeded'`, completes the wait token
 * `audio-<event.id>` with the event's `output_url`. Other events are
 * acknowledged without action.
 *
 * Responses: 400 when the body is not a JSON object or a succeeded event lacks
 * a string `id` / `output_url`; otherwise `'ok'`.
 *
 * NOTE(review): `runs.completeWaitToken` is kept as written; recent
 * Trigger.dev SDKs appear to expose this as `wait.completeToken` — confirm
 * against the SDK version in use.
 * NOTE(review): every succeeded event is forwarded, whatever its kind; confirm
 * whether events for non-audio tasks also reach this endpoint.
 */
export async function POST(req: Request) {
  let event: unknown;
  try {
    event = await req.json();
  } catch {
    return new Response('invalid JSON', { status: 400 });
  }
  if (typeof event !== 'object' || event === null) {
    return new Response('invalid payload', { status: 400 });
  }

  const { status, id, output_url: outputUrl } = event as Record<
    string,
    unknown
  >;

  if (status === 'succeeded') {
    // Completing the token with an undefined URL would resume the run only
    // for it to fail on the audio download.
    if (typeof id !== 'string' || typeof outputUrl !== 'string') {
      return new Response('missing id or output_url', { status: 400 });
    }
    await runs.completeWaitToken(`audio-${id}`, {
      output_url: outputUrl,
    });
  }
  return new Response('ok');
}
See also
- Trigger.dev docs
- Video pipeline with Trigger.dev — transcode and thumbnail pipeline
- Extract audio from video — audio extraction options and formats
- Auto-generate captions with Gemini — alternative captioning approach