AI-guided encoding presets with Llama and Ittybit
Every video is different. A talking head webinar can survive heavy compression, but a fast-paced sports clip falls apart at the same bitrate. Instead of picking one encoding preset and hoping for the best, you can extract a few sample frames with Ittybit, let Llama 4 Scout classify the content, and route the video to the right encoding settings automatically. Storage costs drop without visible quality loss.
Extract sample frames
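Pull 3-5 frames spread across the video using Ittybit image tasks. These give Llama enough context to classify the scene type without processing the entire file. The fixed timestamps below assume a video of at least two minutes; adjust them for shorter clips so every sample falls inside the video.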
const VIDEO_URL = "https://example.com/uploads/source.mp4";
// Values sent as `options.start` for each sampled frame (presumably seconds into the video).
const timestamps = [5, 30, 60, 90, 120];

/**
 * Create an Ittybit image job that extracts a single JPEG frame from VIDEO_URL.
 *
 * @param start - Position of the frame, forwarded as `options.start`.
 * @returns The parsed JSON body of the job-creation response.
 * @throws Error when the API answers with a non-2xx status, so an error
 *   payload is never treated as a successfully created job.
 */
async function extractFrame(start: number) {
  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      kind: "image",
      input: VIDEO_URL,
      options: { start, width: 512, format: "jpeg" },
    }),
  });
  if (!res.ok) {
    throw new Error(
      `Ittybit image job (start=${start}) was rejected: ${res.status} ${res.statusText}`
    );
  }
  return res.json();
}

// Wrap in an arrow so Array.prototype.map's index argument is not forwarded.
const tasks = await Promise.all(timestamps.map((start) => extractFrame(start)));
import os
import requests
VIDEO_URL = "https://example.com/uploads/source.mp4"
ITTYBIT_API_KEY = os.environ["ITTYBIT_API_KEY"]
# Values sent as options.start for each sampled frame (presumably seconds into the video).
timestamps = [5, 30, 60, 90, 120]


def extract_frame(start: int) -> dict:
    """Create an Ittybit image job that extracts one JPEG frame from VIDEO_URL.

    Args:
        start: Position of the frame, forwarded as ``options.start``.

    Returns:
        The parsed JSON body of the job-creation response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within 30 seconds.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json={
            "kind": "image",
            "input": VIDEO_URL,
            "options": {"start": start, "width": 512, "format": "jpeg"},
        },
        # requests waits indefinitely unless a timeout is given.
        timeout=30,
    )
    # Fail here instead of returning an error payload that has no "id".
    res.raise_for_status()
    return res.json()
tasks = [extract_frame(t) for t in timestamps]
Poll each task until it completes, then collect the output URLs.
/**
 * Poll an Ittybit job until it reports a terminal status.
 *
 * @param taskId - Job id to poll at `/jobs/{taskId}`.
 * @param options - Optional polling limits:
 *   `timeoutMs` is the total time to keep polling (default 10 minutes) and
 *   `intervalMs` is the delay between polls (default 2000 ms).
 * @returns The job JSON once `status` is "succeeded" or "failed".
 * @throws Error when a poll returns a non-2xx response, or when the job has
 *   not reached a terminal status before the timeout elapses.
 */
async function waitForTask(
  taskId: string,
  options: { timeoutMs?: number; intervalMs?: number } = {}
): Promise<any> {
  const timeoutMs = options.timeoutMs ?? 10 * 60 * 1000;
  const intervalMs = options.intervalMs ?? 2000;
  const deadline = Date.now() + timeoutMs;

  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      },
    });
    // An error reply carries no terminal status; without this check the loop never ends.
    if (!res.ok) {
      throw new Error(
        `Polling Ittybit job ${taskId} failed: ${res.status} ${res.statusText}`
      );
    }
    const task = await res.json();
    if (task.status === "succeeded" || task.status === "failed") {
      return task;
    }
    // Give up before sleeping past the deadline rather than polling forever.
    if (Date.now() + intervalMs > deadline) {
      throw new Error(
        `Timed out after ${timeoutMs} ms waiting for Ittybit job ${taskId} (last status: ${task.status})`
      );
    }
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
}

const completed = await Promise.all(tasks.map((t) => waitForTask(t.id)));
// Keep only the frames that were actually produced.
const frameUrls = completed
  .filter((t) => t.status === "succeeded")
  .map((t) => t.output.url);
import time
def wait_for_task(task_id: str, timeout: float = 600.0, poll_interval: float = 2.0) -> dict:
    """Poll an Ittybit job until it reports a terminal status.

    Args:
        task_id: Job id to poll at ``/jobs/{task_id}``.
        timeout: Total number of seconds to keep polling before giving up.
        poll_interval: Seconds to sleep between polls.

    Returns:
        The job JSON once its ``status`` is ``"succeeded"`` or ``"failed"``.

    Raises:
        requests.HTTPError: If a poll returns a 4xx/5xx status.
        TimeoutError: If the job has not finished before ``timeout`` elapses.
    """
    deadline = time.monotonic() + timeout
    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            timeout=30,
        )
        # An error reply carries no terminal status; without this the loop never ends.
        res.raise_for_status()
        task = res.json()
        status = task.get("status")
        if status in ("succeeded", "failed"):
            return task
        # Give up before sleeping past the deadline rather than polling forever.
        if time.monotonic() + poll_interval > deadline:
            raise TimeoutError(
                f"Timed out after {timeout}s waiting for Ittybit job {task_id} "
                f"(last status: {status})"
            )
        time.sleep(poll_interval)
completed = [wait_for_task(t["id"]) for t in tasks]
frame_urls = [
t["output"]["url"]
for t in completed
if t["status"] == "succeeded"
]
Classify scene type with Llama
Send each frame to Llama 4 Scout running on Ollama. The prompt asks for a single scene classification and a confidence score.
const OLLAMA_URL = "http://localhost:11434/api/chat";

// System prompt: the five scene categories and the JSON shape the model must reply with.
const SCENE_CLASSIFIER_PROMPT = `You are a video scene classifier. Classify the image into exactly one category:
- talking_head: presenter, interview, webcam, lecture
- action: sports, fast motion, dynamic camera movement
- screencast: screen recording, slides, code, UI demo
- animation: motion graphics, animated content
- static: mostly still frame, landscape, b-roll
Respond with JSON only: {"scene_type": string, "confidence": number}`;

/**
 * Download one extracted frame and ask the Ollama chat endpoint to classify it.
 *
 * @param imageUrl - URL of the frame image to classify.
 * @returns The object parsed from the model's JSON reply; the prompt asks for
 *   `{ scene_type, confidence }`, but the shape is not validated here.
 * @throws Error when the frame download or the Ollama request returns a
 *   non-2xx status, or when the reply is not parseable JSON.
 */
async function classifyFrame(imageUrl: string) {
  const imageRes = await fetch(imageUrl);
  if (!imageRes.ok) {
    throw new Error(
      `Failed to download frame ${imageUrl}: ${imageRes.status} ${imageRes.statusText}`
    );
  }
  // Ollama expects images as base64 strings in the message's `images` array.
  const base64 = Buffer.from(await imageRes.arrayBuffer()).toString("base64");

  const res = await fetch(OLLAMA_URL, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      // NOTE(review): Ollama's library appears to publish this model as
      // `llama4:scout` — confirm the name matches a locally pulled model.
      model: "llama4-scout",
      stream: false,
      // Constrain the reply to valid JSON so JSON.parse below does not trip
      // over markdown fences or explanatory text around the object.
      format: "json",
      messages: [
        { role: "system", content: SCENE_CLASSIFIER_PROMPT },
        {
          role: "user",
          content: "Classify this video frame.",
          images: [base64],
        },
      ],
    }),
  });
  if (!res.ok) {
    throw new Error(`Ollama request failed: ${res.status} ${res.statusText}`);
  }

  const data = await res.json();
  const content = data?.message?.content;
  if (typeof content !== "string") {
    throw new Error(`Ollama reply for ${imageUrl} has no message content`);
  }
  try {
    return JSON.parse(content);
  } catch {
    throw new Error(`Ollama returned non-JSON content for ${imageUrl}: ${content}`);
  }
}

const classifications = await Promise.all(frameUrls.map(classifyFrame));
import base64
import json
OLLAMA_URL = "http://localhost:11434/api/chat"


def classify_frame(image_url: str) -> dict:
    """Download one extracted frame and ask the Ollama chat endpoint to classify it.

    Args:
        image_url: URL of the frame image to classify.

    Returns:
        The object parsed from the model's JSON reply. The prompt asks for
        ``{"scene_type": ..., "confidence": ...}``, but the shape is not
        validated here.

    Raises:
        requests.HTTPError: If the frame download or the Ollama request
            returns a 4xx/5xx status.
        ValueError: If the model's reply is not parseable JSON.
    """
    image_res = requests.get(image_url, timeout=30)
    image_res.raise_for_status()
    # Ollama expects images as base64 strings in the message's "images" list.
    b64 = base64.b64encode(image_res.content).decode()

    res = requests.post(OLLAMA_URL, json={
        # NOTE(review): Ollama's library appears to publish this model as
        # "llama4:scout" — confirm the name matches a locally pulled model.
        "model": "llama4-scout",
        "stream": False,
        # Constrain the reply to valid JSON so json.loads below does not trip
        # over markdown fences or explanatory text around the object.
        "format": "json",
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a video scene classifier. Classify the image into exactly one category:\n"
                    "- talking_head: presenter, interview, webcam, lecture\n"
                    "- action: sports, fast motion, dynamic camera movement\n"
                    "- screencast: screen recording, slides, code, UI demo\n"
                    "- animation: motion graphics, animated content\n"
                    "- static: mostly still frame, landscape, b-roll\n\n"
                    'Respond with JSON only: {"scene_type": string, "confidence": number}'
                ),
            },
            {
                "role": "user",
                "content": "Classify this video frame.",
                "images": [b64],
            },
        ],
    })
    res.raise_for_status()
    content = res.json()["message"]["content"]
    try:
        return json.loads(content)
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"Ollama returned non-JSON content for {image_url}: {content!r}"
        ) from exc
classifications = [classify_frame(url) for url in frame_urls]
Map classifications to encoding presets
Take a confidence-weighted vote across the sampled frames (each frame adds its confidence score to its scene type's total), then look up the encoding settings for the scene type with the highest total.
type SceneType = "talking_head" | "action" | "screencast" | "animation" | "static";

/** Encoding options sent as the `options` of a video job. */
type EncodingPreset = { codec: string; quality: string; bitrate: string; format: string };

/** Encoding settings for each scene type the classifier can report. */
const PRESETS: Record<SceneType, EncodingPreset> = {
  talking_head: { codec: "h264", quality: "medium", bitrate: "1500k", format: "mp4" },
  action: { codec: "h264", quality: "high", bitrate: "6000k", format: "mp4" },
  screencast: { codec: "h264", quality: "high", bitrate: "2000k", format: "mp4" },
  animation: { codec: "h264", quality: "high", bitrate: "3000k", format: "mp4" },
  static: { codec: "h264", quality: "low", bitrate: "800k", format: "mp4" },
};

/** Runtime check that a model-supplied label is one of the PRESETS keys. */
function isSceneType(value: unknown): value is SceneType {
  return typeof value === "string" && Object.prototype.hasOwnProperty.call(PRESETS, value);
}

/**
 * Choose an encoding preset by confidence-weighted vote.
 *
 * Each classification adds its `confidence` to the running total of its
 * `scene_type`; the scene type with the highest total wins, and ties go to
 * the scene type that appeared first. Because the classifications come from
 * model output, entries with an unknown `scene_type` or a non-finite
 * `confidence` are ignored instead of corrupting the totals.
 *
 * @param classifications - Per-frame results from the classifier.
 * @param fallback - Optional scene type to use when no entry is usable.
 * @returns The winning scene type and its encoding options.
 * @throws Error when no entry is usable and no valid `fallback` is given.
 */
function pickPreset(
  classifications: { scene_type: SceneType; confidence: number }[],
  fallback?: SceneType
): { scene_type: SceneType; options: EncodingPreset } {
  // A Map keeps insertion order, which preserves first-seen tie-breaking.
  const votes = new Map<SceneType, number>();
  for (const c of classifications) {
    if (c == null || !isSceneType(c.scene_type) || !Number.isFinite(c.confidence)) {
      continue;
    }
    votes.set(c.scene_type, (votes.get(c.scene_type) ?? 0) + c.confidence);
  }

  let winner: SceneType | undefined;
  let best = -Infinity;
  for (const [scene, total] of votes) {
    // Strict '>' means the earliest scene type wins a tie.
    if (total > best) {
      winner = scene;
      best = total;
    }
  }

  if (winner === undefined && fallback !== undefined && isSceneType(fallback)) {
    winner = fallback;
  }
  if (winner === undefined) {
    throw new Error(
      "pickPreset: no usable classifications (empty input, unknown scene types, or invalid confidences)"
    );
  }
  return { scene_type: winner, options: PRESETS[winner] };
}
// Resolve the preset for the sampled frames and report which scene type won.
const preset = pickPreset(classifications);
console.log("Detected: " + preset.scene_type);
# Encoding settings for each scene type the classifier can report.
PRESETS = {
    "talking_head": {"codec": "h264", "quality": "medium", "bitrate": "1500k", "format": "mp4"},
    "action": {"codec": "h264", "quality": "high", "bitrate": "6000k", "format": "mp4"},
    "screencast": {"codec": "h264", "quality": "high", "bitrate": "2000k", "format": "mp4"},
    "animation": {"codec": "h264", "quality": "high", "bitrate": "3000k", "format": "mp4"},
    "static": {"codec": "h264", "quality": "low", "bitrate": "800k", "format": "mp4"},
}


def pick_preset(classifications: list[dict], fallback: "str | None" = None) -> dict:
    """Choose an encoding preset by confidence-weighted vote.

    Each classification adds its ``confidence`` to the running total of its
    ``scene_type``; the scene type with the highest total wins, and ties go to
    the scene type that appeared first. Because the classifications come from
    model output, entries that are not dicts, name a scene type missing from
    ``PRESETS``, or carry a non-numeric / non-finite confidence are ignored.

    Args:
        classifications: Per-frame results from the classifier.
        fallback: Optional ``PRESETS`` key to use when no entry is usable.

    Returns:
        ``{"scene_type": <winner>, "options": PRESETS[<winner>]}``.

    Raises:
        ValueError: If no entry is usable and no valid ``fallback`` is given.
    """
    import math  # local import keeps this snippet self-contained

    votes: dict[str, float] = {}
    for c in classifications:
        if not isinstance(c, dict):
            continue
        scene = c.get("scene_type")
        confidence = c.get("confidence")
        if not isinstance(scene, str) or scene not in PRESETS:
            continue
        # bool is a subclass of int; True/False is not a confidence score.
        if isinstance(confidence, bool) or not isinstance(confidence, (int, float)):
            continue
        if not math.isfinite(confidence):
            continue
        votes[scene] = votes.get(scene, 0) + confidence

    if votes:
        # max() returns the first key with the top score, i.e. the first-seen scene.
        winner = max(votes, key=votes.get)
    elif isinstance(fallback, str) and fallback in PRESETS:
        winner = fallback
    else:
        raise ValueError(
            "pick_preset: no usable classifications "
            "(empty input, unknown scene types, or invalid confidences)"
        )
    return {"scene_type": winner, "options": PRESETS[winner]}
preset = pick_preset(classifications)
print(f"Detected: {preset['scene_type']}")
Here is what each preset optimizes for:
| Scene type | Bitrate | Quality | Rationale |
|---|---|---|---|
| talking_head | 1500k | medium | Low motion, speech-dominant — compresses well |
| action | 6000k | high | Fast motion needs more bits to avoid blocking artifacts |
| screencast | 2000k | high | Sharp text and UI edges need quality, but motion is minimal |
| animation | 3000k | high | Flat colors compress efficiently, but transitions need headroom |
| static | 800k | low | Nearly still frames — aggressive compression with no visible loss |
Encode with the chosen preset
Dispatch an Ittybit video task using the preset selected by Llama.
// Submit the video job with the encoding options chosen for the detected scene type.
const encodeRes = await fetch("https://api.ittybit.com/jobs", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    kind: "video",
    input: VIDEO_URL,
    options: preset.options,
  }),
});
// Without this check a rejected request is logged as "Encoding task undefined: undefined".
if (!encodeRes.ok) {
  throw new Error(
    `Ittybit video job was rejected: ${encodeRes.status} ${encodeRes.statusText}`
  );
}
const encodeTask = await encodeRes.json();
console.log(`Encoding task ${encodeTask.id}: ${encodeTask.status}`);
# Submit the video job with the encoding options chosen for the detected scene type.
encode_res = requests.post(
    "https://api.ittybit.com/jobs",
    headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
    json={
        "kind": "video",
        "input": VIDEO_URL,
        "options": preset["options"],
    },
    # requests waits indefinitely unless a timeout is given.
    timeout=30,
)
# Fail here instead of hitting a KeyError on "id" when the request was rejected.
encode_res.raise_for_status()
encode_task = encode_res.json()
print(f"Encoding task {encode_task['id']}: {encode_task['status']}")
Full pipeline
Wire it all together: extract frames, classify, pick a preset, and encode.
/**
 * Run the whole pipeline for one video: sample frames, classify them, pick a
 * preset, and submit the encoding job.
 *
 * @param videoUrl - URL of the source video, used as the `input` of every job.
 * @returns The parsed JSON body of the video job-creation response.
 * @throws Error when any Ittybit request is rejected, or when none of the
 *   frame jobs succeeds (there is then nothing to classify).
 */
async function adaptiveEncode(videoUrl: string) {
  // Shared job-creation request; surfaces HTTP errors instead of returning an error payload.
  const createJob = async (kind: "image" | "video", options: object) => {
    const res = await fetch("https://api.ittybit.com/jobs", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ kind, input: videoUrl, options }),
    });
    if (!res.ok) {
      throw new Error(
        `Ittybit ${kind} job was rejected: ${res.status} ${res.statusText}`
      );
    }
    return res.json();
  };

  // 1. Extract sample frames
  const timestamps = [5, 30, 60, 90, 120];
  const tasks = await Promise.all(
    timestamps.map((start) =>
      createJob("image", { start, width: 512, format: "jpeg" })
    )
  );

  // 2. Wait for frames
  const completed = await Promise.all(tasks.map((t) => waitForTask(t.id)));
  const frameUrls = completed
    .filter((t) => t.status === "succeeded")
    .map((t) => t.output.url);
  if (frameUrls.length === 0) {
    throw new Error(`No frames could be extracted from ${videoUrl}`);
  }

  // 3. Classify each frame with Llama
  const classifications = await Promise.all(frameUrls.map(classifyFrame));

  // 4. Pick the best preset
  const preset = pickPreset(classifications);
  console.log(`Scene: ${preset.scene_type} -> ${JSON.stringify(preset.options)}`);

  // 5. Encode with optimal settings
  return createJob("video", preset.options);
}

const result = await adaptiveEncode("https://example.com/uploads/source.mp4");
console.log(`Task ${result.id}: ${result.status}`);
def adaptive_encode(video_url: str) -> dict:
    """Run the whole pipeline for one video.

    Samples frames, classifies them, picks a preset, and submits the encoding
    job.

    Args:
        video_url: URL of the source video, used as the ``input`` of every job.

    Returns:
        The parsed JSON body of the video job-creation response.

    Raises:
        requests.HTTPError: If an Ittybit job-creation request is rejected.
        RuntimeError: If none of the frame jobs succeeds, so there is nothing
            to classify.
    """
    def create_job(kind: str, options: dict) -> dict:
        # Shared job-creation request for this video; raises on HTTP errors.
        res = requests.post(
            "https://api.ittybit.com/jobs",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            json={"kind": kind, "input": video_url, "options": options},
            timeout=30,
        )
        res.raise_for_status()
        return res.json()

    # 1. Extract sample frames
    # The frame request is built here because no helper that accepts a video
    # URL exists; extract_frame() always reads the module-level VIDEO_URL.
    timestamps = [5, 30, 60, 90, 120]
    tasks = [
        create_job("image", {"start": t, "width": 512, "format": "jpeg"})
        for t in timestamps
    ]
    # 2. Wait for frames
    completed = [wait_for_task(t["id"]) for t in tasks]
    frame_urls = [
        t["output"]["url"]
        for t in completed
        if t["status"] == "succeeded"
    ]
    if not frame_urls:
        raise RuntimeError(f"No frames could be extracted from {video_url}")
    # 3. Classify each frame with Llama
    classifications = [classify_frame(url) for url in frame_urls]
    # 4. Pick the best preset
    preset = pick_preset(classifications)
    print(f"Scene: {preset['scene_type']} -> {preset['options']}")
    # 5. Encode with optimal settings
    return create_job("video", preset["options"])
result = adaptive_encode("https://example.com/uploads/source.mp4")
print(f"Task {result['id']}: {result['status']}")
Tuning presets
The defaults above are a starting point. Adjust them based on your content and delivery targets.
- Codec: Swap `h264` for `h265` or `av1` for better compression at the same quality — useful if your audience supports newer codecs.
- Resolution: Add `width` and `height` to the preset to downscale talking heads or static content that does not need full resolution.
- Format: Use `webm` with VP9/AV1 for web-only delivery, or stick with `mp4` for broad compatibility.
- Thresholds: If no scene type crosses a confidence threshold, fall back to a safe middle-ground preset rather than guessing.
See also
- API `POST /jobs` with `kind: "video"` — transcode video via HTTP
- Reduce video file size — manual compression strategies
- AV1 encoding — next-generation codec settings
- Video moderation with Llama — frame-based content moderation using the same Ollama setup
- Ollama documentation — running Llama models locally