Video chapters with Llama and Ittybit

View Markdown

Timestamped chapters make long videos navigable. Ittybit extracts the audio track, you transcribe it locally with whisper.cpp, and Llama 3.3 running on Ollama turns the transcript into structured chapter data. Transcription and chapter generation both run on your own hardware, so the transcript is never sent to a third-party AI service.

Extract audio with Ittybit

Create an audio task to pull the audio track as MP3. This gives whisper.cpp a clean input without downloading the full video file.

// Ask Ittybit to extract the audio track of the source video as an MP3.
const res = await fetch("https://api.ittybit.com/jobs", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    input: "https://example.com/uploads/talk.mp4",
    kind: "audio",
    options: { format: "mp3" },
  }),
});
// fetch() only rejects on network failures, so surface HTTP errors
// (bad API key, invalid input, ...) instead of parsing an error body as a job.
if (!res.ok) {
  throw new Error(`Ittybit job request failed: ${res.status} ${res.statusText}`);
}
const task = await res.json();

import requests
import os

# Ask Ittybit to extract the audio track of the source video as an MP3.
res = requests.post(
    "https://api.ittybit.com/jobs",
    headers={"Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}"},
    json={
        "input": "https://example.com/uploads/talk.mp4",
        "kind": "audio",
        "options": {"format": "mp3"},
    },
    # requests has no default timeout; without one a stalled connection blocks forever.
    timeout=30,
)
# Raise on 4xx/5xx instead of treating an error body as a job.
res.raise_for_status()
task = res.json()

Poll the task or use a webhook to know when the audio file is ready. The completed task includes the URL of the extracted MP3 at `output.url`, which the next step downloads.

Transcribe locally with whisper.cpp

Download the extracted audio and run it through whisper.cpp. The --output-json flag writes a JSON file containing each transcribed segment with its start and end offsets, which Llama needs to place chapter boundaries accurately.

import { execSync } from "child_process";
import { readFileSync, writeFileSync } from "fs";

// Download the extracted audio.
// `task` must be the completed job here: the output URL is read from it directly.
const audioUrl = task.output.url;
const audioRes = await fetch(audioUrl);
// Fail early rather than handing an HTTP error page to whisper.cpp as "audio".
if (!audioRes.ok) {
  throw new Error(`Audio download failed: ${audioRes.status} ${audioRes.statusText}`);
}
const audioBuffer = Buffer.from(await audioRes.arrayBuffer());
writeFileSync("/tmp/audio.mp3", audioBuffer);

// Transcribe with whisper.cpp.
// -of is given without an extension; the JSON is read back from /tmp/transcript.json below.
execSync(
  `./whisper-cpp -m models/ggml-base.en.bin -f /tmp/audio.mp3 --output-json -of /tmp/transcript`
);

const transcript = JSON.parse(readFileSync("/tmp/transcript.json", "utf-8"));

// Build a timestamped transcript string: one "[MM:SS] text" line per segment.
const lines = transcript.transcription.map(
  (seg: { offsets: { from: number }; text: string }) => {
    // offsets.from is divided by 1000 to get whole seconds from the segment start.
    const seconds = Math.floor(seg.offsets.from / 1000);
    const mm = String(Math.floor(seconds / 60)).padStart(2, "0");
    const ss = String(seconds % 60).padStart(2, "0");
    return `[${mm}:${ss}] ${seg.text.trim()}`;
  }
);
const timestampedTranscript = lines.join("\n");

import subprocess
import json

# Download the extracted audio.
# `task` must be the completed job here: the output URL is read from it directly.
audio_url = task["output"]["url"]
audio_res = requests.get(audio_url, timeout=60)
# Fail early rather than handing an HTTP error page to whisper.cpp as "audio".
audio_res.raise_for_status()
audio_bytes = audio_res.content
with open("/tmp/audio.mp3", "wb") as f:
    f.write(audio_bytes)

# Transcribe with whisper.cpp.
# -of is given without an extension; the JSON is read back from /tmp/transcript.json below.
# check=True raises CalledProcessError if whisper.cpp exits with a non-zero status.
subprocess.run([
    "./whisper-cpp",
    "-m", "models/ggml-base.en.bin",
    "-f", "/tmp/audio.mp3",
    "--output-json",
    "-of", "/tmp/transcript",
], check=True)

# Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
with open("/tmp/transcript.json", encoding="utf-8") as f:
    transcript = json.load(f)

# Build a timestamped transcript string: one "[MM:SS] text" line per segment.
lines = []
for seg in transcript["transcription"]:
    # offsets.from is divided by 1000 to get whole seconds from the segment start.
    minutes, secs = divmod(seg["offsets"]["from"] // 1000, 60)
    lines.append(f"[{str(minutes).zfill(2)}:{str(secs).zfill(2)}] {seg['text'].strip()}")

timestamped_transcript = "\n".join(lines)

Generate chapters with Llama

Send the timestamped transcript to Llama 3.3 running on Ollama. The system prompt asks for structured JSON with a timestamp, title, and summary for each chapter.

const OLLAMA_URL = "http://localhost:11434/api/chat";

// Send the transcript to the local Ollama server as a single, non-streamed chat request.
const res = await fetch(OLLAMA_URL, {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    model: "llama3.3",
    stream: false,
    messages: [
      {
        role: "system",
        content: `You are a video editor. Given a timestamped transcript, produce chapter markers.

Return JSON only — an array of objects with these fields:

- "timestamp": start time as "MM:SS"
- "title": short chapter title (max 60 chars)
- "summary": one-sentence description of the chapter

Identify natural topic shifts. Aim for 4-8 chapters depending on length. Do not invent content not present in the transcript.`,
      },
      {
        role: "user",
        content: timestampedTranscript,
      },
    ],
  }),
});
// fetch() does not reject on HTTP errors (e.g. the model has not been pulled yet).
if (!res.ok) {
  throw new Error(`Ollama request failed: ${res.status} ${res.statusText}`);
}

const data = await res.json();

// Despite the "JSON only" instruction, models sometimes wrap the array in
// Markdown fences or add a sentence around it, so parse only the outermost [...] span.
const reply = data.message.content;
const arrayStart = reply.indexOf("[");
const arrayEnd = reply.lastIndexOf("]");
if (arrayStart === -1 || arrayEnd < arrayStart) {
  throw new Error("Llama response did not contain a JSON array");
}
const chapters = JSON.parse(reply.slice(arrayStart, arrayEnd + 1));

console.log(chapters);

OLLAMA_URL = "http://localhost:11434/api/chat"

# Send the transcript to the local Ollama server as a single, non-streamed chat request.
res = requests.post(OLLAMA_URL, json={
    "model": "llama3.3",
    "stream": False,
    "messages": [
        {
            "role": "system",
            "content": (
                "You are a video editor. Given a timestamped transcript, produce chapter markers.\n\n"
                "Return JSON only — an array of objects with these fields:\n"
                '- "timestamp": start time as "MM:SS"\n'
                '- "title": short chapter title (max 60 chars)\n'
                '- "summary": one-sentence description of the chapter\n\n'
                "Identify natural topic shifts. Aim for 4-8 chapters depending on length. "
                "Do not invent content not present in the transcript."
            ),
        },
        {
            "role": "user",
            "content": timestamped_transcript,
        },
    ],
})
# Raise on 4xx/5xx (e.g. the model has not been pulled yet) instead of parsing an error body.
res.raise_for_status()

# Despite the "JSON only" instruction, models sometimes wrap the array in
# Markdown fences or add a sentence around it, so parse only the outermost [...] span.
reply = res.json()["message"]["content"]
array_start = reply.find("[")
array_end = reply.rfind("]")
if array_start == -1 or array_end < array_start:
    raise ValueError("Llama response did not contain a JSON array")
chapters = json.loads(reply[array_start:array_end + 1])

print(chapters)

The output looks like this:

[
  {
    "timestamp": "00:00",
    "title": "Introduction and agenda",
    "summary": "Speaker introduces themselves and outlines the three topics for the talk."
  },
  {
    "timestamp": "03:42",
    "title": "Architecture overview",
    "summary": "Walkthrough of the system design and how the main components connect."
  },
  {
    "timestamp": "12:15",
    "title": "Live demo",
    "summary": "Hands-on demonstration of the upload and processing pipeline."
  },
  {
    "timestamp": "22:08",
    "title": "Q&A",
    "summary": "Audience questions about scaling, error handling, and deployment."
  }
]

Put it together

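The full pipeline in one function: extract audio, transcribe, generate chapters. `pollForResult` (`poll_for_result` in Python) is not defined here; it stands in for your own polling or webhook logic, which takes the task ID and returns the URL of the extracted MP3 once the task has completed.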

/**
 * Generate chapter markers for a video: extract its audio with Ittybit,
 * transcribe the audio locally with whisper.cpp, then ask Llama 3.3 (via
 * Ollama) to turn the timestamped transcript into chapters.
 *
 * Writes /tmp/audio.mp3 and /tmp/transcript.json as intermediate files.
 *
 * @param videoUrl - URL of the source video, passed to Ittybit as the job input.
 * @returns The array parsed from Llama's reply; the system prompt asks for
 *   objects with `timestamp`, `title`, and `summary` fields.
 * @throws Error if any HTTP request fails, whisper.cpp exits non-zero, or the
 *   model reply does not contain a JSON array.
 */
async function generateChapters(videoUrl: string) {
  // 1. Extract audio via Ittybit
  const taskRes = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      input: videoUrl,
      kind: "audio",
      options: { format: "mp3" },
    }),
  });
  // fetch() only rejects on network failures, so HTTP errors must be checked explicitly.
  if (!taskRes.ok) {
    throw new Error(`Ittybit job request failed: ${taskRes.status} ${taskRes.statusText}`);
  }
  const task = await taskRes.json();

  // 2. Poll until audio is ready
  const audioUrl = await pollForResult(task.id);

  // 3. Download and transcribe locally
  const audioRes = await fetch(audioUrl);
  // Fail early rather than handing an HTTP error page to whisper.cpp as "audio".
  if (!audioRes.ok) {
    throw new Error(`Audio download failed: ${audioRes.status} ${audioRes.statusText}`);
  }
  const buffer = Buffer.from(await audioRes.arrayBuffer());
  const { writeFileSync, readFileSync } = await import("fs");
  const { execSync } = await import("child_process");

  writeFileSync("/tmp/audio.mp3", buffer);
  // -of is given without an extension; the JSON is read back from /tmp/transcript.json below.
  execSync(
    `./whisper-cpp -m models/ggml-base.en.bin -f /tmp/audio.mp3 --output-json -of /tmp/transcript`
  );

  const transcript = JSON.parse(readFileSync("/tmp/transcript.json", "utf-8"));
  // One "[MM:SS] text" line per transcribed segment.
  const lines = transcript.transcription.map(
    (seg: { offsets: { from: number }; text: string }) => {
      const seconds = Math.floor(seg.offsets.from / 1000);
      const mm = String(Math.floor(seconds / 60)).padStart(2, "0");
      const ss = String(seconds % 60).padStart(2, "0");
      return `[${mm}:${ss}] ${seg.text.trim()}`;
    }
  );

  // 4. Generate chapters with Llama
  const llamaRes = await fetch("http://localhost:11434/api/chat", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "llama3.3",
      stream: false,
      messages: [
        {
          role: "system",
          content: `You are a video editor. Given a timestamped transcript, produce chapter markers.

Return JSON only — an array of objects with these fields:

- "timestamp": start time as "MM:SS"
- "title": short chapter title (max 60 chars)
- "summary": one-sentence description of the chapter

Identify natural topic shifts. Aim for 4-8 chapters depending on length. Do not invent content not present in the transcript.`,
        },
        { role: "user", content: lines.join("\n") },
      ],
    }),
  });
  if (!llamaRes.ok) {
    throw new Error(`Ollama request failed: ${llamaRes.status} ${llamaRes.statusText}`);
  }

  const data = await llamaRes.json();

  // Despite the "JSON only" instruction, models sometimes wrap the array in
  // Markdown fences or add a sentence around it, so parse only the outermost [...] span.
  const reply = data.message.content;
  const arrayStart = reply.indexOf("[");
  const arrayEnd = reply.lastIndexOf("]");
  if (arrayStart === -1 || arrayEnd < arrayStart) {
    throw new Error("Llama response did not contain a JSON array");
  }
  return JSON.parse(reply.slice(arrayStart, arrayEnd + 1));
}

def generate_chapters(video_url: str) -> list[dict]:
    """Generate chapter markers for a video.

    Extracts the audio with Ittybit, transcribes it locally with whisper.cpp,
    then asks Llama 3.3 (via Ollama) to turn the timestamped transcript into
    chapters. Writes /tmp/audio.mp3 and /tmp/transcript.json as intermediate
    files.

    Args:
        video_url: URL of the source video, passed to Ittybit as the job input.

    Returns:
        The list parsed from Llama's reply; the system prompt asks for dicts
        with "timestamp", "title", and "summary" keys.

    Raises:
        requests.HTTPError: If Ittybit, the audio download, or Ollama returns
            an error status.
        subprocess.CalledProcessError: If whisper.cpp exits with a non-zero status.
        ValueError: If the model reply does not contain a parseable JSON array.
    """
    # 1. Extract audio via Ittybit
    task_res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}"},
        json={
            "input": video_url,
            "kind": "audio",
            "options": {"format": "mp3"},
        },
        # requests has no default timeout; without one a stalled connection blocks forever.
        timeout=30,
    )
    # Raise on 4xx/5xx instead of treating an error body as a job.
    task_res.raise_for_status()
    task = task_res.json()

    # 2. Poll until audio is ready
    audio_url = poll_for_result(task["id"])

    # 3. Download and transcribe locally
    audio_res = requests.get(audio_url, timeout=60)
    # Fail early rather than handing an HTTP error page to whisper.cpp as "audio".
    audio_res.raise_for_status()
    with open("/tmp/audio.mp3", "wb") as f:
        f.write(audio_res.content)

    # -of is given without an extension; the JSON is read back from /tmp/transcript.json below.
    subprocess.run([
        "./whisper-cpp",
        "-m", "models/ggml-base.en.bin",
        "-f", "/tmp/audio.mp3",
        "--output-json",
        "-of", "/tmp/transcript",
    ], check=True)

    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
    with open("/tmp/transcript.json", encoding="utf-8") as f:
        transcript = json.load(f)

    # One "[MM:SS] text" line per transcribed segment.
    lines = []
    for seg in transcript["transcription"]:
        minutes, secs = divmod(seg["offsets"]["from"] // 1000, 60)
        lines.append(f"[{str(minutes).zfill(2)}:{str(secs).zfill(2)}] {seg['text'].strip()}")

    # 4. Generate chapters with Llama
    res = requests.post("http://localhost:11434/api/chat", json={
        "model": "llama3.3",
        "stream": False,
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a video editor. Given a timestamped transcript, produce chapter markers.\n\n"
                    "Return JSON only — an array of objects with these fields:\n"
                    '- "timestamp": start time as "MM:SS"\n'
                    '- "title": short chapter title (max 60 chars)\n'
                    '- "summary": one-sentence description of the chapter\n\n'
                    "Identify natural topic shifts. Aim for 4-8 chapters depending on length. "
                    "Do not invent content not present in the transcript."
                ),
            },
            {"role": "user", "content": "\n".join(lines)},
        ],
    })
    res.raise_for_status()

    # Despite the "JSON only" instruction, models sometimes wrap the array in
    # Markdown fences or add a sentence around it, so parse only the outermost [...] span.
    reply = res.json()["message"]["content"]
    array_start = reply.find("[")
    array_end = reply.rfind("]")
    if array_start == -1 or array_end < array_start:
        raise ValueError("Llama response did not contain a JSON array")
    return json.loads(reply[array_start:array_end + 1])

Video chapters with Llama and Ittybit

Extract audio with Ittybit

Transcribe locally with whisper.cpp

Generate chapters with Llama

Put it together

See also