Video chapters with Llama and Ittybit
Timestamped chapters make long videos navigable. Ittybit extracts the audio track, you transcribe it locally with whisper.cpp, and Llama 3.3 running on Ollama turns the transcript into structured chapter data. Transcription and chapter generation both run on your own machine, so the transcript never leaves your infrastructure.
Extract audio with Ittybit
Create an audio task to pull the audio track as MP3. This gives whisper.cpp a clean input without downloading the full video file.
// Create an Ittybit job that extracts the audio track of the video as an MP3.
const res = await fetch("https://api.ittybit.com/jobs", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    input: "https://example.com/uploads/talk.mp4",
    kind: "audio",
    options: { format: "mp3" },
  }),
});
// Fail loudly on a non-2xx response instead of treating an error payload as a task.
if (!res.ok) {
  throw new Error(`Ittybit job request failed: ${res.status} ${res.statusText}`);
}
const task = await res.json();
import requests
import os
# Create an Ittybit job that extracts the audio track of the video as an MP3.
res = requests.post(
    "https://api.ittybit.com/jobs",
    headers={"Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}"},
    json={
        "input": "https://example.com/uploads/talk.mp4",
        "kind": "audio",
        "options": {"format": "mp3"},
    },
)
# Fail loudly on a non-2xx response instead of treating an error payload as a task.
res.raise_for_status()
task = res.json()
Poll the task or use a webhook to know when the audio file is ready. The completed job includes a URL to the extracted MP3.
Transcribe locally with whisper.cpp
Download the extracted audio and run it through whisper.cpp. The --output-json flag writes a JSON file with a start offset for every transcribed segment, which Llama needs to place chapter boundaries accurately.
import { execFileSync } from "child_process";
import { readFileSync, writeFileSync } from "fs";

// Download the extracted audio
const audioUrl = task.output.url;
const audioRes = await fetch(audioUrl);
// Stop on a failed download rather than handing an error body to whisper.cpp.
if (!audioRes.ok) {
  throw new Error(`Audio download failed: ${audioRes.status} ${audioRes.statusText}`);
}
const audioBuffer = Buffer.from(await audioRes.arrayBuffer());
writeFileSync("/tmp/audio.mp3", audioBuffer);

// Transcribe with whisper.cpp. The arguments are passed as an array, so no shell
// is involved and nothing needs quoting. Throws if the process exits non-zero.
execFileSync("./whisper-cpp", [
  "-m", "models/ggml-base.en.bin",
  "-f", "/tmp/audio.mp3",
  "--output-json",
  "-of", "/tmp/transcript",
]);
const transcript = JSON.parse(
  readFileSync("/tmp/transcript.json", "utf-8")
);

// Build a timestamped transcript string: one "[MM:SS] text" line per segment.
const lines = transcript.transcription.map(
  (seg: { offsets: { from: number }; text: string }) => {
    // The start offset is divided by 1000 to get whole seconds.
    const seconds = Math.floor(seg.offsets.from / 1000);
    const mm = String(Math.floor(seconds / 60)).padStart(2, "0");
    const ss = String(seconds % 60).padStart(2, "0");
    return `[${mm}:${ss}] ${seg.text.trim()}`;
  }
);
const timestampedTranscript = lines.join("\n");
import subprocess
import json
# Download the extracted audio
audio_url = task["output"]["url"]
audio_res = requests.get(audio_url)
# Stop on a failed download rather than handing an error body to whisper.cpp.
audio_res.raise_for_status()
audio_bytes = audio_res.content
with open("/tmp/audio.mp3", "wb") as f:
    f.write(audio_bytes)

# Transcribe with whisper.cpp; check=True raises if the process exits non-zero.
subprocess.run([
    "./whisper-cpp",
    "-m", "models/ggml-base.en.bin",
    "-f", "/tmp/audio.mp3",
    "--output-json",
    "-of", "/tmp/transcript",
], check=True)
with open("/tmp/transcript.json") as f:
    transcript = json.load(f)

# Build a timestamped transcript string: one "[MM:SS] text" line per segment.
lines = []
for seg in transcript["transcription"]:
    # The start offset is divided by 1000 to get whole seconds.
    seconds = seg["offsets"]["from"] // 1000
    mm = str(seconds // 60).zfill(2)
    ss = str(seconds % 60).zfill(2)
    lines.append(f"[{mm}:{ss}] {seg['text'].strip()}")
timestamped_transcript = "\n".join(lines)
Generate chapters with Llama
Send the timestamped transcript to Llama 3.3 running on Ollama. The system prompt asks for structured JSON with a timestamp, title, and summary for each chapter.
const OLLAMA_URL = "http://localhost:11434/api/chat";

// Named llamaRes (not res) so this snippet can follow the job-creation snippet
// in the same module without redeclaring a const.
const llamaRes = await fetch(OLLAMA_URL, {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    model: "llama3.3",
    stream: false,
    messages: [
      {
        role: "system",
        // The prompt lines stay at column 0 so no indentation leaks into the prompt text.
        content: `You are a video editor. Given a timestamped transcript, produce chapter markers.
Return JSON only — an array of objects with these fields:
- "timestamp": start time as "MM:SS"
- "title": short chapter title (max 60 chars)
- "summary": one-sentence description of the chapter
Identify natural topic shifts. Aim for 4-8 chapters depending on length. Do not invent content not present in the transcript.`,
      },
      {
        role: "user",
        content: timestampedTranscript,
      },
    ],
  }),
});
// Fail loudly on a non-2xx response; otherwise data.message is undefined and
// the failure shows up as an unrelated TypeError.
if (!llamaRes.ok) {
  throw new Error(`Ollama chat request failed: ${llamaRes.status} ${llamaRes.statusText}`);
}
const data = await llamaRes.json();
// The reply text is parsed as JSON; JSON.parse throws if the model added anything else.
const chapters = JSON.parse(data.message.content);
console.log(chapters);
OLLAMA_URL = "http://localhost:11434/api/chat"

res = requests.post(OLLAMA_URL, json={
    "model": "llama3.3",
    "stream": False,
    "messages": [
        {
            "role": "system",
            "content": (
                "You are a video editor. Given a timestamped transcript, produce chapter markers.\n\n"
                "Return JSON only — an array of objects with these fields:\n"
                '- "timestamp": start time as "MM:SS"\n'
                '- "title": short chapter title (max 60 chars)\n'
                '- "summary": one-sentence description of the chapter\n\n'
                "Identify natural topic shifts. Aim for 4-8 chapters depending on length. "
                "Do not invent content not present in the transcript."
            ),
        },
        {
            "role": "user",
            "content": timestamped_transcript,
        },
    ],
})
# Fail loudly on a non-2xx response; otherwise the "message" lookup below
# fails with an unrelated KeyError.
res.raise_for_status()
# The reply text is parsed as JSON; json.loads raises if the model added anything else.
chapters = json.loads(res.json()["message"]["content"])
print(chapters)
The output looks like this:
[
{
"timestamp": "00:00",
"title": "Introduction and agenda",
"summary": "Speaker introduces themselves and outlines the three topics for the talk."
},
{
"timestamp": "03:42",
"title": "Architecture overview",
"summary": "Walkthrough of the system design and how the main components connect."
},
{
"timestamp": "12:15",
"title": "Live demo",
"summary": "Hands-on demonstration of the upload and processing pipeline."
},
{
"timestamp": "22:08",
"title": "Q&A",
"summary": "Audience questions about scaling, error handling, and deployment."
}
]
Put it together
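The full pipeline in one function: extract audio, transcribe, generate chapters. The pollForResult / poll_for_result helper is not shown here: implement it with the polling or webhook approach described above so that it returns the URL of the extracted audio.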
/**
 * Runs the whole pipeline for one video: extract the audio with Ittybit,
 * transcribe it with whisper.cpp, and ask Llama 3.3 (via Ollama) for chapters.
 *
 * Relies on a `pollForResult(jobId)` helper defined elsewhere that resolves to
 * the URL of the extracted audio.
 *
 * @param videoUrl - URL of the source video, sent to Ittybit as the job input.
 * @returns The parsed JSON from the model's reply. The prompt asks for an array
 *   of `{ timestamp, title, summary }` objects; the shape is not validated here.
 * @throws If an HTTP request returns a non-2xx status, if whisper.cpp exits
 *   with a non-zero code, or if the model's reply is not valid JSON.
 */
async function generateChapters(videoUrl: string) {
  // 1. Extract audio via Ittybit
  const taskRes = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      input: videoUrl,
      kind: "audio",
      options: { format: "mp3" },
    }),
  });
  if (!taskRes.ok) {
    throw new Error(`Ittybit job request failed: ${taskRes.status} ${taskRes.statusText}`);
  }
  const task = await taskRes.json();

  // 2. Poll until audio is ready
  const audioUrl = await pollForResult(task.id);

  // 3. Download and transcribe locally
  const audioRes = await fetch(audioUrl);
  // Stop on a failed download rather than handing an error body to whisper.cpp.
  if (!audioRes.ok) {
    throw new Error(`Audio download failed: ${audioRes.status} ${audioRes.statusText}`);
  }
  const buffer = Buffer.from(await audioRes.arrayBuffer());
  const { writeFileSync, readFileSync } = await import("fs");
  const { execFileSync } = await import("child_process");
  writeFileSync("/tmp/audio.mp3", buffer);
  // The arguments are passed as an array, so no shell is involved.
  execFileSync("./whisper-cpp", [
    "-m", "models/ggml-base.en.bin",
    "-f", "/tmp/audio.mp3",
    "--output-json",
    "-of", "/tmp/transcript",
  ]);
  const transcript = JSON.parse(readFileSync("/tmp/transcript.json", "utf-8"));
  // One "[MM:SS] text" line per transcribed segment.
  const lines = transcript.transcription.map(
    (seg: { offsets: { from: number }; text: string }) => {
      // The start offset is divided by 1000 to get whole seconds.
      const seconds = Math.floor(seg.offsets.from / 1000);
      const mm = String(Math.floor(seconds / 60)).padStart(2, "0");
      const ss = String(seconds % 60).padStart(2, "0");
      return `[${mm}:${ss}] ${seg.text.trim()}`;
    }
  );

  // 4. Generate chapters with Llama
  const llamaRes = await fetch("http://localhost:11434/api/chat", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "llama3.3",
      stream: false,
      messages: [
        {
          role: "system",
          // The prompt lines stay at column 0 so no indentation leaks into the prompt text.
          content: `You are a video editor. Given a timestamped transcript, produce chapter markers.
Return JSON only — an array of objects with these fields:
- "timestamp": start time as "MM:SS"
- "title": short chapter title (max 60 chars)
- "summary": one-sentence description of the chapter
Identify natural topic shifts. Aim for 4-8 chapters depending on length. Do not invent content not present in the transcript.`,
        },
        { role: "user", content: lines.join("\n") },
      ],
    }),
  });
  if (!llamaRes.ok) {
    throw new Error(`Ollama chat request failed: ${llamaRes.status} ${llamaRes.statusText}`);
  }
  const data = await llamaRes.json();
  return JSON.parse(data.message.content);
}
def generate_chapters(video_url: str) -> list[dict]:
# 1. Extract audio via Ittybit
task = requests.post(
"https://api.ittybit.com/jobs",
headers={"Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}"},
json={
"input": video_url,
"kind": "audio",
"options": {"format": "mp3"},
},
).json()
# 2. Poll until audio is ready
audio_url = poll_for_result(task["id"])
# 3. Download and transcribe locally
audio_bytes = requests.get(audio_url).content
with open("/tmp/audio.mp3", "wb") as f:
f.write(audio_bytes)
subprocess.run([
"./whisper-cpp",
"-m", "models/ggml-base.en.bin",
"-f", "/tmp/audio.mp3",
"--output-json",
"-of", "/tmp/transcript",
], check=True)
with open("/tmp/transcript.json") as f:
transcript = json.load(f)
lines = []
for seg in transcript["transcription"]:
seconds = seg["offsets"]["from"] // 1000
mm = str(seconds // 60).zfill(2)
ss = str(seconds % 60).zfill(2)
lines.append(f"[{mm}:{ss}] {seg['text'].strip()}")
# 4. Generate chapters with Llama
res = requests.post("http://localhost:11434/api/chat", json={
"model": "llama3.3",
"stream": False,
"messages": [
{
"role": "system",
"content": (
"You are a video editor. Given a timestamped transcript, produce chapter markers.\n\n"
"Return JSON only — an array of objects with these fields:\n"
'- "timestamp": start time as "MM:SS"\n'
'- "title": short chapter title (max 60 chars)\n'
'- "summary": one-sentence description of the chapter\n\n'
"Identify natural topic shifts. Aim for 4-8 chapters depending on length. "
"Do not invent content not present in the transcript."
),
},
{"role": "user", "content": "\n".join(lines)},
],
})
return json.loads(res.json()["message"]["content"]) See also
- API: POST /jobs with kind: "audio" — extract audio via HTTP
- Extract audio from video — audio extraction options and formats
- Video moderation with Llama — another Llama + Ittybit pipeline
- Ollama documentation — running Llama models locally
- whisper.cpp — local speech-to-text