AI-guided encoding presets with Llama and Ittybit

View Markdown

Every video is different. A talking-head webinar can survive heavy compression, but a fast-paced sports clip falls apart at the same bitrate. Instead of picking one encoding preset and hoping for the best, you can extract a few sample frames with Ittybit, let Llama 4 Scout classify the content, and route the video to the right encoding settings automatically. Storage costs drop without visible quality loss.

Extract sample frames

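Pull 3-5 frames spread across the video using Ittybit image tasks. These give Llama enough context to classify the scene type without processing the entire file. The example below samples five fixed offsets up to 120, so it assumes the source is at least that long; for shorter videos, scale the offsets to the actual duration.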

// Source video used as the `input` of the jobs submitted below.
const VIDEO_URL = "https://example.com/uploads/source.mp4";

// Values sent as the `start` option of each image job, one frame per entry.
// NOTE(review): presumably seconds from the start of the video — confirm against the Ittybit API docs.
const timestamps = [5, 30, 60, 90, 120];

/**
 * Submits an Ittybit image job that captures a single frame of `VIDEO_URL`.
 *
 * @param start - Value sent as the job's `start` option.
 * @returns The parsed JSON body of the job-creation response.
 * @throws Error if the API responds with a non-2xx status.
 */
async function extractFrame(start: number) {
  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      kind: "image",
      input: VIDEO_URL,
      options: { start, width: 512, format: "jpeg" },
    }),
  });

  // Surface rejected requests here; otherwise the error body is returned as if
  // it were a job and later polling runs against an undefined id.
  if (!res.ok) {
    throw new Error(`Ittybit image job (start=${start}) failed: ${res.status} ${res.statusText}`);
  }

  return res.json();
}

// Submit one image job per timestamp in parallel and collect the job-creation responses.
const tasks = await Promise.all(
  timestamps.map((start) => extractFrame(start)),
);
import os
import requests

# Source video used as the "input" of the jobs submitted below.
VIDEO_URL = "https://example.com/uploads/source.mp4"
# Read eagerly: a missing variable raises KeyError here rather than on the first request.
ITTYBIT_API_KEY = os.environ["ITTYBIT_API_KEY"]

# Values sent as the "start" option of each image job, one frame per entry.
# NOTE(review): presumably seconds from the start of the video — confirm against the Ittybit API docs.
timestamps = [5, 30, 60, 90, 120]


def extract_frame(start: int) -> dict:
    """Submit an Ittybit image job that captures a single frame of ``VIDEO_URL``.

    Args:
        start: Value sent as the job's ``start`` option.

    Returns:
        The parsed JSON body of the job-creation response.

    Raises:
        requests.HTTPError: If the API responds with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within 30 seconds.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json={
            "kind": "image",
            "input": VIDEO_URL,
            "options": {"start": start, "width": 512, "format": "jpeg"},
        },
        # requests has no default timeout; without one a stalled connection hangs forever.
        timeout=30,
    )
    # Fail here instead of handing an error body to the polling step as if it were a job.
    res.raise_for_status()
    return res.json()


# Submit one image job per timestamp, in order, and keep each job-creation response.
tasks = list(map(extract_frame, timestamps))

Poll each task until it completes, then collect the output URLs.

/**
 * Polls an Ittybit job until it reaches a terminal status.
 *
 * @param taskId - Identifier of the job to poll.
 * @param options - Optional tuning: `intervalMs` is the pause between polls
 *   (default 2000) and `timeoutMs` is the total time to keep polling
 *   (default 10 minutes).
 * @returns The job payload once its status is "succeeded" or "failed".
 * @throws Error if the API responds with a non-2xx status, or if the job is
 *   still not terminal when the deadline is reached.
 */
async function waitForTask(
  taskId: string,
  options: { intervalMs?: number; timeoutMs?: number } = {},
): Promise<any> {
  const { intervalMs = 2000, timeoutMs = 10 * 60 * 1000 } = options;
  const deadline = Date.now() + timeoutMs;

  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      },
    });

    // An error body carries no terminal `status`, so without this check a
    // 401/404 would keep the loop polling forever.
    if (!res.ok) {
      throw new Error(`Polling job ${taskId} failed: ${res.status} ${res.statusText}`);
    }

    const task = await res.json();
    if (task.status === "succeeded" || task.status === "failed") {
      return task;
    }

    // Give up before sleeping if the next poll would land past the deadline.
    if (Date.now() + intervalMs > deadline) {
      throw new Error(
        `Job ${taskId} did not finish within ${timeoutMs} ms (last status: ${task.status})`,
      );
    }

    await new Promise<void>((resolve) => setTimeout(resolve, intervalMs));
  }
}

// Wait for every frame job to reach a terminal status.
const completed = await Promise.all(tasks.map((task) => waitForTask(task.id)));

// Keep only the output URL of each job that succeeded; failed jobs contribute nothing.
const frameUrls = completed.flatMap((task) =>
  task.status === "succeeded" ? [task.output.url] : [],
);
import time


def wait_for_task(task_id: str, poll_interval: float = 2.0, timeout: float = 600.0) -> dict:
    """Poll an Ittybit job until it reaches a terminal status.

    Args:
        task_id: Identifier of the job to poll.
        poll_interval: Seconds to sleep between polls.
        timeout: Total seconds to keep polling before giving up.

    Returns:
        The job payload once its status is "succeeded" or "failed".

    Raises:
        requests.HTTPError: If the API responds with a 4xx/5xx status.
        TimeoutError: If the job is still not terminal when the deadline is reached.
    """
    # monotonic() is immune to wall-clock adjustments while we wait.
    deadline = time.monotonic() + timeout

    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            # requests has no default timeout; bound every individual poll.
            timeout=30,
        )
        # Without this an error body would fail below with a bare KeyError on "status".
        res.raise_for_status()
        task = res.json()

        if task["status"] in ("succeeded", "failed"):
            return task

        # Give up before sleeping if the next poll would land past the deadline.
        if time.monotonic() + poll_interval > deadline:
            raise TimeoutError(
                f"Job {task_id} did not finish within {timeout} s "
                f"(last status: {task['status']})"
            )

        time.sleep(poll_interval)


# Wait for each frame job in submission order.
completed = list(map(lambda job: wait_for_task(job["id"]), tasks))

# Keep only the output URL of each job that succeeded; failed jobs contribute nothing.
frame_urls = [
    job["output"]["url"]
    for job in filter(lambda job: job["status"] == "succeeded", completed)
]

Classify scene type with Llama

Send each frame to Llama 4 Scout running on Ollama. The prompt asks for a single scene classification and a confidence score.

/** Ollama chat endpoint that the classification request is posted to. */
const OLLAMA_URL = "http://localhost:11434/api/chat";

/** Scene labels the system prompt allows; keep in sync with the prompt text below. */
const SCENE_LABELS = ["talking_head", "action", "screencast", "animation", "static"] as const;
type SceneLabel = (typeof SCENE_LABELS)[number];

/** System prompt sent with every frame. */
const SCENE_CLASSIFIER_PROMPT = `You are a video scene classifier. Classify the image into exactly one category:

- talking_head: presenter, interview, webcam, lecture
- action: sports, fast motion, dynamic camera movement
- screencast: screen recording, slides, code, UI demo
- animation: motion graphics, animated content
- static: mostly still frame, landscape, b-roll

Respond with JSON only: {"scene_type": string, "confidence": number}`;

/**
 * Downloads one frame, sends it to the model behind `OLLAMA_URL`, and returns
 * the validated classification.
 *
 * @param imageUrl - URL of the extracted frame.
 * @returns The scene label (one of `SCENE_LABELS`) and the model's confidence.
 * @throws Error if either HTTP request fails, the reply has no text content,
 *   or the reply does not contain a known label and a numeric confidence.
 * @throws SyntaxError if the JSON object found in the reply cannot be parsed.
 */
async function classifyFrame(
  imageUrl: string,
): Promise<{ scene_type: SceneLabel; confidence: number }> {
  const imageRes = await fetch(imageUrl);
  if (!imageRes.ok) {
    throw new Error(`Fetching frame ${imageUrl} failed: ${imageRes.status} ${imageRes.statusText}`);
  }
  // Ollama takes images as base64 strings in the message's `images` array.
  const base64 = Buffer.from(await imageRes.arrayBuffer()).toString("base64");

  const res = await fetch(OLLAMA_URL, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      // NOTE(review): Ollama's public library appears to publish this model as
      // `llama4:scout`; confirm the name matches what `ollama list` reports.
      model: "llama4-scout",
      stream: false,
      // Ask Ollama to constrain the reply to JSON instead of relying on the prompt alone.
      format: "json",
      messages: [
        { role: "system", content: SCENE_CLASSIFIER_PROMPT },
        {
          role: "user",
          content: "Classify this video frame.",
          images: [base64],
        },
      ],
    }),
  });
  if (!res.ok) {
    throw new Error(`Ollama request failed: ${res.status} ${res.statusText}`);
  }

  const data = await res.json();
  const content: unknown = data?.message?.content;
  if (typeof content !== "string") {
    throw new Error("Ollama response did not include message.content");
  }

  // Models sometimes wrap the object in markdown fences or add prose; parse
  // only the outermost {...} span.
  const start = content.indexOf("{");
  const end = content.lastIndexOf("}");
  if (start === -1 || end <= start) {
    throw new Error(`No JSON object in model reply for ${imageUrl}: ${content}`);
  }
  const parsed = JSON.parse(content.slice(start, end + 1)) as {
    scene_type?: unknown;
    confidence?: unknown;
  };

  // Model output is untrusted: accept only known labels and finite scores.
  const label = SCENE_LABELS.find((candidate) => candidate === parsed.scene_type);
  const confidence =
    typeof parsed.confidence === "number"
      ? parsed.confidence
      : Number.parseFloat(String(parsed.confidence));
  if (label === undefined || !Number.isFinite(confidence)) {
    throw new Error(`Unusable classification for ${imageUrl}: ${content}`);
  }

  return { scene_type: label, confidence };
}

// Classify every extracted frame in parallel; one result per frame URL.
const classifications = await Promise.all(
  frameUrls.map((url) => classifyFrame(url)),
);
import base64
import json

# Ollama chat endpoint that the classification request is posted to.
OLLAMA_URL = "http://localhost:11434/api/chat"

# Scene labels the system prompt allows; keep in sync with the prompt text below.
SCENE_LABELS = ("talking_head", "action", "screencast", "animation", "static")


def classify_frame(image_url: str) -> dict:
    """Download one frame, send it to the model behind ``OLLAMA_URL``, and
    return the validated classification.

    Args:
        image_url: URL of the extracted frame.

    Returns:
        A dict with "scene_type" (one of ``SCENE_LABELS``) and "confidence" (float).

    Raises:
        requests.HTTPError: If either HTTP request returns a 4xx/5xx status.
        ValueError: If the reply contains no parseable JSON object, an unknown
            label, or a non-numeric confidence (json.JSONDecodeError is a
            ValueError subclass).
    """
    image_res = requests.get(image_url, timeout=30)
    image_res.raise_for_status()
    # Ollama takes images as base64 strings in the message's "images" list.
    b64 = base64.b64encode(image_res.content).decode()

    res = requests.post(
        OLLAMA_URL,
        json={
            # NOTE(review): Ollama's public library appears to publish this model as
            # "llama4:scout"; confirm the name matches what `ollama list` reports.
            "model": "llama4-scout",
            "stream": False,
            # Ask Ollama to constrain the reply to JSON instead of relying on the prompt alone.
            "format": "json",
            "messages": [
                {
                    "role": "system",
                    "content": (
                        "You are a video scene classifier. Classify the image into exactly one category:\n"
                        "- talking_head: presenter, interview, webcam, lecture\n"
                        "- action: sports, fast motion, dynamic camera movement\n"
                        "- screencast: screen recording, slides, code, UI demo\n"
                        "- animation: motion graphics, animated content\n"
                        "- static: mostly still frame, landscape, b-roll\n\n"
                        'Respond with JSON only: {"scene_type": string, "confidence": number}'
                    ),
                },
                {
                    "role": "user",
                    "content": "Classify this video frame.",
                    "images": [b64],
                },
            ],
        },
        # Generous bound: local vision inference can be slow, but should not hang forever.
        timeout=300,
    )
    res.raise_for_status()

    content = res.json()["message"]["content"]

    # Models sometimes wrap the object in markdown fences or add prose; parse
    # only the outermost {...} span.
    start, end = content.find("{"), content.rfind("}")
    if start == -1 or end <= start:
        raise ValueError(f"No JSON object in model reply for {image_url}: {content!r}")
    parsed = json.loads(content[start : end + 1])

    # Model output is untrusted: accept only known labels and numeric scores.
    scene = parsed.get("scene_type")
    if scene not in SCENE_LABELS:
        raise ValueError(f"Unknown scene_type {scene!r} for {image_url}")
    try:
        confidence = float(parsed.get("confidence"))
    except (TypeError, ValueError):
        raise ValueError(f"Non-numeric confidence for {image_url}: {content!r}") from None

    return {"scene_type": scene, "confidence": confidence}


# Classify every extracted frame, one request at a time, in frame order.
classifications = list(map(classify_frame, frame_urls))

Map classifications to encoding presets

Take a confidence-weighted vote across the sampled frames (each frame adds its confidence score to its scene type's total), then look up the encoding settings for the winning scene type.

/** Scene categories that have an encoding preset. */
type SceneType = "talking_head" | "action" | "screencast" | "animation" | "static";

/** Encoding options submitted with the video job for each scene category. */
const PRESETS: Record<SceneType, { codec: string; quality: string; bitrate: string; format: string }> = {
  talking_head: { codec: "h264", quality: "medium", bitrate: "1500k", format: "mp4" },
  action: { codec: "h264", quality: "high", bitrate: "6000k", format: "mp4" },
  screencast: { codec: "h264", quality: "high", bitrate: "2000k", format: "mp4" },
  animation: { codec: "h264", quality: "high", bitrate: "3000k", format: "mp4" },
  static: { codec: "h264", quality: "low", bitrate: "800k", format: "mp4" },
};

/**
 * Picks the scene type with the highest summed confidence and returns its preset.
 *
 * Entries whose label has no preset, or whose confidence is not a finite
 * number, are ignored so a single malformed model reply cannot win the vote or
 * produce `options: undefined`. Ties go to the scene type seen first.
 *
 * @param classifications - Per-frame results from the classifier.
 * @returns The winning scene type and its encoding options.
 * @throws Error if no entry is usable (including an empty list).
 */
function pickPreset(classifications: { scene_type: SceneType; confidence: number }[]) {
  const votes = new Map<SceneType, number>();

  for (const { scene_type, confidence } of classifications) {
    // Number() also accepts a numeric string, which models occasionally emit.
    const score = Number(confidence);
    const known = Object.prototype.hasOwnProperty.call(PRESETS, scene_type);
    if (!known || !Number.isFinite(score)) {
      continue;
    }
    votes.set(scene_type, (votes.get(scene_type) ?? 0) + score);
  }

  let winner: SceneType | undefined;
  let best = -Infinity;
  for (const [scene, total] of votes) {
    // Strict ">" keeps the first-seen scene type on a tie.
    if (total > best) {
      winner = scene;
      best = total;
    }
  }

  if (winner === undefined) {
    throw new Error("pickPreset: no usable classifications to vote on");
  }
  return { scene_type: winner, options: PRESETS[winner] };
}

// Tally the per-frame results and report which scene type won.
const preset = pickPreset(classifications);
console.log(`Detected: ${preset.scene_type}`);
# Encoding options submitted with the video job for each scene category.
PRESETS = {
    "talking_head": {"codec": "h264", "quality": "medium", "bitrate": "1500k", "format": "mp4"},
    "action":       {"codec": "h264", "quality": "high",   "bitrate": "6000k", "format": "mp4"},
    "screencast":   {"codec": "h264", "quality": "high",   "bitrate": "2000k", "format": "mp4"},
    "animation":    {"codec": "h264", "quality": "high",   "bitrate": "3000k", "format": "mp4"},
    "static":       {"codec": "h264", "quality": "low",    "bitrate": "800k",  "format": "mp4"},
}


def pick_preset(classifications: list[dict]) -> dict:
    """Pick the scene type with the highest summed confidence and return its preset.

    Entries whose label has no preset, or whose confidence is not a finite
    number, are ignored so a single malformed model reply cannot win the vote
    or raise a KeyError. Ties go to the scene type seen first.

    Args:
        classifications: Per-frame dicts with "scene_type" and "confidence".

    Returns:
        A dict with the winning "scene_type" and its encoding "options".

    Raises:
        ValueError: If no entry is usable (including an empty list).
    """
    import math

    votes: dict[str, float] = {}

    for c in classifications:
        scene = c.get("scene_type")
        if not isinstance(scene, str) or scene not in PRESETS:
            continue
        try:
            # float() also accepts a numeric string, which models occasionally emit.
            score = float(c.get("confidence"))
        except (TypeError, ValueError):
            continue
        if not math.isfinite(score):
            continue
        votes[scene] = votes.get(scene, 0) + score

    if not votes:
        raise ValueError("pick_preset: no usable classifications to vote on")

    # max() returns the first maximal key, i.e. the first-seen scene type on a tie.
    winner = max(votes, key=votes.get)
    return {"scene_type": winner, "options": PRESETS[winner]}


# Tally the per-frame results and report which scene type won.
preset = pick_preset(classifications)
print(f"Detected: {preset['scene_type']}")

Here is what each preset optimizes for:

| Scene type | Bitrate | Quality | Rationale |
| --- | --- | --- | --- |
| talking_head | 1500k | medium | Low motion, speech-dominant — compresses well |
| action | 6000k | high | Fast motion needs more bits to avoid blocking artifacts |
| screencast | 2000k | high | Sharp text and UI edges need quality, but motion is minimal |
| animation | 3000k | high | Flat colors compress efficiently, but transitions need headroom |
| static | 800k | low | Nearly still frames — aggressive compression with no visible loss |

Encode with the chosen preset

Dispatch an Ittybit video task using the preset selected by Llama.

// Submit the video job with the options chosen for the detected scene type.
const encodeRes = await fetch("https://api.ittybit.com/jobs", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    kind: "video",
    input: VIDEO_URL,
    options: preset.options,
  }),
});

// Fail loudly on a rejected request instead of logging "undefined: undefined".
if (!encodeRes.ok) {
  throw new Error(`Ittybit encode request failed: ${encodeRes.status} ${encodeRes.statusText}`);
}

const encodeTask = await encodeRes.json();
console.log(`Encoding task ${encodeTask.id}: ${encodeTask.status}`);
# Submit the video job with the options chosen for the detected scene type.
encode_res = requests.post(
    "https://api.ittybit.com/jobs",
    headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
    json={
        "kind": "video",
        "input": VIDEO_URL,
        "options": preset["options"],
    },
    # requests has no default timeout; without one a stalled connection hangs forever.
    timeout=30,
)
# Fail loudly on a rejected request instead of hitting a KeyError on "id" below.
encode_res.raise_for_status()

encode_task = encode_res.json()
print(f"Encoding task {encode_task['id']}: {encode_task['status']}")

Full pipeline

Wire it all together: extract frames, classify, pick a preset, and encode.

/**
 * Runs the whole pipeline for one video: sample frames, classify them, pick a
 * preset, and submit the encode job.
 *
 * @param videoUrl - URL of the source video.
 * @returns The parsed JSON body of the encode job-creation response.
 * @throws Error if an Ittybit request is rejected or no sample frame could be
 *   extracted; errors from `waitForTask`, `classifyFrame`, and `pickPreset`
 *   propagate unchanged.
 */
async function adaptiveEncode(videoUrl: string) {
  // One helper for both job kinds so every request shares auth and error handling.
  const submitJob = async (kind: "image" | "video", options: unknown) => {
    const res = await fetch("https://api.ittybit.com/jobs", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ kind, input: videoUrl, options }),
    });
    if (!res.ok) {
      throw new Error(`Ittybit ${kind} job failed: ${res.status} ${res.statusText}`);
    }
    return res.json();
  };

  // 1. Extract sample frames
  const timestamps = [5, 30, 60, 90, 120];
  const tasks = await Promise.all(
    timestamps.map((start) => submitJob("image", { start, width: 512, format: "jpeg" })),
  );

  // 2. Wait for frames
  const completed = await Promise.all(tasks.map((t) => waitForTask(t.id)));
  const frameUrls = completed
    .filter((t) => t.status === "succeeded")
    .map((t) => t.output.url);

  // With no frames there is nothing to vote on; stop with a clear message
  // rather than letting the preset lookup fail obscurely.
  if (frameUrls.length === 0) {
    throw new Error(`No sample frames could be extracted from ${videoUrl}`);
  }

  // 3. Classify each frame with Llama
  const classifications = await Promise.all(frameUrls.map((url) => classifyFrame(url)));

  // 4. Pick the best preset
  const preset = pickPreset(classifications);
  console.log(`Scene: ${preset.scene_type} -> ${JSON.stringify(preset.options)}`);

  // 5. Encode with optimal settings
  return submitJob("video", preset.options);
}

// Run the pipeline end to end and report the encode job that was submitted.
const result = await adaptiveEncode("https://example.com/uploads/source.mp4");
console.log(`Task ${result.id}: ${result.status}`);
def adaptive_encode(video_url: str) -> dict:
    """Run the whole pipeline for one video: sample frames, classify them, pick
    a preset, and submit the encode job.

    Args:
        video_url: URL of the source video.

    Returns:
        The parsed JSON body of the encode job-creation response.

    Raises:
        requests.HTTPError: If an Ittybit request returns a 4xx/5xx status.
        RuntimeError: If no sample frame could be extracted.
    """

    def submit_job(kind: str, options: dict) -> dict:
        # One helper for both job kinds so every request shares auth, timeout,
        # and error handling. It takes video_url from the enclosing call, which
        # the module-level extract_frame (bound to VIDEO_URL) cannot do.
        res = requests.post(
            "https://api.ittybit.com/jobs",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            json={"kind": kind, "input": video_url, "options": options},
            timeout=30,
        )
        res.raise_for_status()
        return res.json()

    # 1. Extract sample frames
    timestamps = [5, 30, 60, 90, 120]
    tasks = [
        submit_job("image", {"start": t, "width": 512, "format": "jpeg"})
        for t in timestamps
    ]

    # 2. Wait for frames
    completed = [wait_for_task(t["id"]) for t in tasks]
    frame_urls = [
        t["output"]["url"]
        for t in completed
        if t["status"] == "succeeded"
    ]

    # With no frames there is nothing to vote on; stop with a clear message
    # rather than letting the preset lookup fail obscurely.
    if not frame_urls:
        raise RuntimeError(f"No sample frames could be extracted from {video_url}")

    # 3. Classify each frame with Llama
    classifications = [classify_frame(url) for url in frame_urls]

    # 4. Pick the best preset
    preset = pick_preset(classifications)
    print(f"Scene: {preset['scene_type']} -> {preset['options']}")

    # 5. Encode with optimal settings
    return submit_job("video", preset["options"])


# Run the pipeline end to end and report the encode job that was submitted.
result = adaptive_encode("https://example.com/uploads/source.mp4")
print(f"Task {result['id']}: {result['status']}")

Tuning presets

The defaults above are a starting point. Adjust them based on your content and delivery targets.

  • Codec: Swap h264 for h265 or av1 for better compression at the same quality — useful if your audience supports newer codecs.
  • Resolution: Add width and height to the preset to downscale talking heads or static content that does not need full resolution.
  • Format: Use webm with VP9/AV1 for web-only delivery, or stick with mp4 for broad compatibility.
  • Thresholds: If no scene type crosses a confidence threshold, fall back to a safe middle-ground preset rather than guessing.

See also