Video moderation with Llama and Ittybit
User-generated video needs moderation before it goes live. Ittybit extracts frames from the uploaded video, then Llama 4 Scout, running locally via Ollama, classifies each frame against your content policy. Classification runs entirely on your own infrastructure, so frames are never sent to a third-party AI service.
Extract frames with Ittybit
Use an image task to pull a frame at a specific timestamp. Extract several frames across the video to get reasonable coverage.
// Timestamps at which to extract a frame from the video.
const timestamps = [0, 5, 10, 15, 20];

// Create one Ittybit image job per timestamp; the requests run in parallel.
const frames = await Promise.all(
  timestamps.map(async (start) => {
    const res = await fetch("https://api.ittybit.com/jobs", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        input: "https://example.com/uploads/video.mp4",
        kind: "image",
        options: { start, width: 512, format: "jpeg" },
      }),
    });
    // fetch() only rejects on network failure, so HTTP errors must be
    // surfaced explicitly or an error body would be treated as a task.
    if (!res.ok) {
      throw new Error(`Ittybit job request failed with status ${res.status}`);
    }
    return res.json();
  })
);
import os
from concurrent.futures import ThreadPoolExecutor

import requests
api_key = os.environ["ITTYBIT_API_KEY"]
timestamps = [0, 5, 10, 15, 20]


def extract_frame(start):
    """Create an Ittybit image job that extracts one frame of the example video.

    Args:
        start: Timestamp sent as the job's ``start`` option.

    Returns:
        The parsed JSON body of the job-creation response.

    Raises:
        requests.HTTPError: If Ittybit answers with a 4xx/5xx status.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {api_key}"},
        json={
            "input": "https://example.com/uploads/video.mp4",
            "kind": "image",
            "options": {"start": start, "width": 512, "format": "jpeg"},
        },
        # requests has no default timeout; without one a stalled
        # connection would block the worker thread indefinitely.
        timeout=30,
    )
    # Fail loudly instead of returning an error body as if it were a task.
    res.raise_for_status()
    return res.json()
with ThreadPoolExecutor() as pool:
    frames = list(pool.map(extract_frame, timestamps))

for START in 0 5 10 15 20; do
# Build the JSON payload for this iteration; %d is replaced by $START.
PAYLOAD=$(printf '{
"input": "https://example.com/uploads/video.mp4",
"kind": "image",
"options": {"start": %d, "width": 512, "format": "jpeg"}
}' "$START")
# Submit the image job to Ittybit.
curl --request POST https://api.ittybit.com/jobs \
  --header "Authorization: Bearer $ITTYBIT_API_KEY" \
  --header "Content-Type: application/json" \
  --data "$PAYLOAD"
done

Each task returns a URL to the extracted frame once complete. Poll the task or use a webhook to know when the frames are ready.
Classify frames with Llama
Once you have the frame URLs, download each image, encode it as base64, and send it to Llama 4 Scout running on Ollama. The system prompt defines your moderation policy and the expected response format.
import { readFileSync } from "fs";
const OLLAMA_URL = "http://localhost:11434/api/chat";

/** Moderation verdict for a single frame, as requested in the system prompt. */
interface FrameVerdict {
  flagged: boolean;
  categories: string[];
  reasoning: string;
}

/**
 * Downloads one frame, base64-encodes it, and asks the model behind the
 * Ollama chat endpoint to classify it against the moderation policy.
 *
 * @param imageUrl - URL of the extracted frame.
 * @returns The verdict parsed from the model's JSON reply.
 * @throws Error if the frame download or the Ollama request fails, or if the
 *   reply does not contain a boolean `flagged` field.
 */
async function moderateFrame(imageUrl: string): Promise<FrameVerdict> {
  const imageRes = await fetch(imageUrl);
  if (!imageRes.ok) {
    throw new Error(`Frame download failed with status ${imageRes.status}`);
  }
  const base64 = Buffer.from(await imageRes.arrayBuffer()).toString("base64");

  const res = await fetch(OLLAMA_URL, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "llama4-scout",
      stream: false,
      // Ask Ollama to constrain the reply to valid JSON so the JSON.parse
      // below does not depend on the model obeying the prompt.
      format: "json",
      messages: [
        {
          role: "system",
          content: `You are a content moderator. Classify the image against this policy:
- violence: graphic violence or gore
- nudity: explicit nudity or sexual content
- hate: hate symbols, slurs, or extremist imagery
- drugs: illegal drug use or paraphernalia
Respond with JSON only: {"flagged": boolean, "categories": string[], "reasoning": string}`,
        },
        {
          role: "user",
          content: "Classify this image.",
          images: [base64],
        },
      ],
    }),
  });
  if (!res.ok) {
    throw new Error(`Ollama request failed with status ${res.status}`);
  }

  const data = await res.json();
  const content: unknown = data?.message?.content;
  if (typeof content !== "string") {
    throw new Error("Ollama response is missing message.content");
  }

  const parsed: unknown = JSON.parse(content);
  if (typeof parsed !== "object" || parsed === null) {
    throw new Error("Moderation verdict is not a JSON object");
  }
  const { flagged, categories, reasoning } = parsed as Record<string, unknown>;
  // Fail closed: a verdict without a boolean `flagged` would be falsy in a
  // `results.filter((r) => r.flagged)` check and so count as approved.
  if (typeof flagged !== "boolean") {
    throw new Error('Moderation verdict is missing a boolean "flagged" field');
  }
  return {
    flagged,
    categories: Array.isArray(categories)
      ? categories.filter((c): c is string => typeof c === "string")
      : [],
    reasoning: typeof reasoning === "string" ? reasoning : "",
  };
}
import base64
import json
import requests
OLLAMA_URL = "http://localhost:11434/api/chat"


def moderate_frame(image_url: str) -> dict:
    """Classify one frame against the moderation policy via the Ollama chat endpoint.

    Downloads the image, base64-encodes it, and sends it together with the
    policy prompt to the model.

    Args:
        image_url: URL of the extracted frame.

    Returns:
        The verdict parsed from the model's JSON reply, with ``flagged``,
        ``categories`` and ``reasoning`` keys.

    Raises:
        requests.HTTPError: If the download or the Ollama request returns 4xx/5xx.
        ValueError: If the reply is not an object with a boolean ``flagged``.
    """
    # requests has no default timeout; bound the remote download.
    image_res = requests.get(image_url, timeout=30)
    image_res.raise_for_status()
    b64 = base64.b64encode(image_res.content).decode()
    res = requests.post(OLLAMA_URL, json={
        "model": "llama4-scout",
        "stream": False,
        # Ask Ollama to constrain the reply to valid JSON so json.loads below
        # does not depend on the model obeying the prompt.
        "format": "json",
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a content moderator. Classify the image against this policy:\n"
                    "- violence: graphic violence or gore\n"
                    "- nudity: explicit nudity or sexual content\n"
                    "- hate: hate symbols, slurs, or extremist imagery\n"
                    "- drugs: illegal drug use or paraphernalia\n\n"
                    'Respond with JSON only: {"flagged": boolean, "categories": [], "reasoning": ""}'
                ),
            },
            {
                "role": "user",
                "content": "Classify this image.",
                "images": [b64],
            },
        ],
    })
    res.raise_for_status()
    verdict = json.loads(res.json()["message"]["content"])
    # Fail closed: a verdict without a boolean "flagged" must not be read as
    # "not flagged" by callers.
    if not isinstance(verdict, dict) or not isinstance(verdict.get("flagged"), bool):
        raise ValueError('moderation verdict is missing a boolean "flagged" field')
    verdict.setdefault("categories", [])
    verdict.setdefault("reasoning", "")
    return verdict
# Download the frame and base64-encode it
# GNU base64 wraps its output every 76 characters. Strip the newlines so the
# encoded image is a single line and stays valid inside the JSON string below.
IMAGE_B64=$(curl -s "$FRAME_URL" | base64 | tr -d '\n')
# "format": "json" asks Ollama to constrain the reply to valid JSON.
curl -s http://localhost:11434/api/chat \
  -H "Content-Type: application/json" \
  -d "$(cat <<EOF
{
  "model": "llama4-scout",
  "stream": false,
  "format": "json",
  "messages": [
    {
      "role": "system",
      "content": "You are a content moderator. Classify the image against this policy:\n- violence: graphic violence or gore\n- nudity: explicit nudity or sexual content\n- hate: hate symbols, slurs, or extremist imagery\n- drugs: illegal drug use or paraphernalia\n\nRespond with JSON only: {\"flagged\": boolean, \"categories\": [], \"reasoning\": \"\"}"
    },
    {
      "role": "user",
      "content": "Classify this image.",
      "images": ["$IMAGE_B64"]
    }
  ]
}
EOF
)"

Put it together
Run the full pipeline: extract frames, classify each one, flag the video if any frame violates policy.
/**
 * Runs the full moderation pipeline for one video: creates an Ittybit image
 * job per timestamp, waits for the frames, classifies each frame with
 * `moderateFrame`, and aggregates the verdicts.
 *
 * `pollForResult` is not defined in this guide; it is expected to resolve to
 * the frame URL for a given task id.
 *
 * @param videoUrl - URL of the video, sent to Ittybit as the job `input`.
 * @param timestamps - Timestamps at which to extract a frame. Defaults to the
 *   fixed sample this guide uses.
 * @returns `approved` is true only when no sampled frame was flagged;
 *   `violations` holds the categories and reasoning of every flagged frame.
 * @throws Error if `timestamps` is empty or Ittybit rejects a job request.
 */
async function moderateVideo(
  videoUrl: string,
  timestamps: readonly number[] = [0, 5, 10, 15, 20, 30, 45, 60]
): Promise<{
  approved: boolean;
  violations: { categories: string[]; reasoning: string }[];
}> {
  // With no frames to inspect, the aggregate below would approve the video
  // without having looked at it.
  if (timestamps.length === 0) {
    throw new Error("moderateVideo requires at least one timestamp");
  }

  // 1. Extract frames via Ittybit
  const tasks = await Promise.all(
    timestamps.map(async (start) => {
      const res = await fetch("https://api.ittybit.com/jobs", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          input: videoUrl,
          kind: "image",
          options: { start, width: 512, format: "jpeg" },
        }),
      });
      // fetch() only rejects on network failure; surface HTTP errors so an
      // error body is never polled as if it were a task.
      if (!res.ok) {
        throw new Error(`Ittybit job request failed with status ${res.status}`);
      }
      return res.json();
    })
  );

  // 2. Poll until frames are ready (simplified)
  const frameUrls = await Promise.all(
    tasks.map((task) => pollForResult(task.id))
  );

  // 3. Classify each frame with Llama
  const results = await Promise.all(
    frameUrls.map((url) => moderateFrame(url))
  );

  // 4. Aggregate verdict
  const flagged = results.filter((r) => r.flagged);
  return {
    approved: flagged.length === 0,
    violations: flagged.map((r) => ({
      categories: r.categories,
      reasoning: r.reasoning,
    })),
  };
}
def moderate_video(video_url: str) -> dict:
timestamps = [0, 5, 10, 15, 20, 30, 45, 60]
# 1. Extract frames via Ittybit
with ThreadPoolExecutor() as pool:
tasks = list(pool.map(
lambda t: extract_frame(video_url, t),
timestamps,
))
# 2. Poll until frames are ready (simplified)
frame_urls = [poll_for_result(task["id"]) for task in tasks]
# 3. Classify each frame with Llama
results = [moderate_frame(url) for url in frame_urls]
# 4. Aggregate verdict
flagged = [r for r in results if r["flagged"]]
return {
"approved": len(flagged) == 0,
"violations": [
{"categories": r["categories"], "reasoning": r["reasoning"]}
for r in flagged
],
}

Tuning coverage
The number and spacing of frames depends on your content and risk tolerance.
| Strategy | Timestamps | Use case |
|---|---|---|
| Quick scan | Every 10s | Low-risk, high-volume |
| Thorough | Every 2s | User-facing platforms |
| Scene-based | After each cut | Highest accuracy, requires scene detection |
For longer videos, sample more frames. A 5-minute video at 2-second intervals is 150 frames — Llama 4 Scout processes each in under a second on a modern GPU.
See also
- API: POST /jobs with kind: "image" — extract frames via HTTP
- Extract thumbnails from video — simpler single-frame extraction
- Build a user upload pipeline — full upload processing workflow
- Ollama documentation — running Llama models locally