Video moderation with Llama and Ittybit

View Markdown

User-generated video needs moderation before it goes live. Ittybit extracts frames from uploaded video, then Llama 4 Scout running locally via Ollama classifies each frame against your content policy. Classification runs entirely on your own infrastructure, so frames are never sent to a third-party AI service.

Extract frames with Ittybit

Use an image task to pull a frame at a specific timestamp. Extract several frames across the video to get reasonable coverage.

// Offsets into the video at which a still frame is requested.
const timestamps = [0, 5, 10, 15, 20];

// Submits one Ittybit image job for the given offset and resolves with the
// parsed JSON body of the response.
const requestFrame = (start: number) =>
  fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      input: "https://example.com/uploads/video.mp4",
      kind: "image",
      options: { start, width: 512, format: "jpeg" },
    }),
  }).then((response) => response.json());

// All requests are issued concurrently; Promise.all keeps the results in the
// same order as `timestamps`.
const frames = await Promise.all(timestamps.map((start) => requestFrame(start)));

import os
from concurrent.futures import ThreadPoolExecutor

import requests

# Raises KeyError immediately if the API key is not configured.
api_key = os.environ["ITTYBIT_API_KEY"]
# Offsets into the video at which a still frame is requested.
timestamps = [0, 5, 10, 15, 20]


def extract_frame(start):
    """Submit an Ittybit image job for one offset and return the parsed response.

    Args:
        start: Offset into the video, sent to Ittybit as ``options.start``.

    Returns:
        The decoded JSON body of the Ittybit response.

    Raises:
        requests.HTTPError: If Ittybit answers with a 4xx/5xx status.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {api_key}"},
        json={
            "input": "https://example.com/uploads/video.mp4",
            "kind": "image",
            "options": {"start": start, "width": 512, "format": "jpeg"},
        },
    )
    # Surface auth/validation failures instead of returning the error body as
    # if it were a task.
    res.raise_for_status()
    return res.json()


# pool.map preserves input order, so frames[i] corresponds to timestamps[i].
with ThreadPoolExecutor() as pool:
    frames = list(pool.map(extract_frame, timestamps))

# JSON body template for one image job; %d is filled with the frame offset.
PAYLOAD_TEMPLATE='{
    "input": "https://example.com/uploads/video.mp4",
    "kind": "image",
    "options": {"start": %d, "width": 512, "format": "jpeg"}
  }'

# Submit one frame-extraction job per offset.
for OFFSET in 0 5 10 15 20; do
  BODY=$(printf "$PAYLOAD_TEMPLATE" "$OFFSET")

  curl -X POST https://api.ittybit.com/jobs \
    -H "Authorization: Bearer $ITTYBIT_API_KEY" \
    -H "Content-Type: application/json" \
    -d "$BODY"
done

Each task returns a URL to the extracted frame once complete. Poll the task or use a webhook to know when the frames are ready.

Classify frames with Llama

Once you have the frame URLs, download each image, encode it as base64, and send it to Llama 4 Scout running on Ollama. The system prompt defines your moderation policy and the expected response format.

import { readFileSync } from "fs";

const OLLAMA_URL = "http://localhost:11434/api/chat";

/** Verdict produced by the model for a single frame. */
interface ModerationResult {
  flagged: boolean;
  categories: string[];
  reasoning: string;
}

/**
 * Downloads one frame, sends it base64-encoded to the Ollama chat endpoint
 * with the moderation policy as system prompt, and returns the parsed verdict.
 *
 * @param imageUrl - URL of the frame image to classify.
 * @returns The model's verdict. `categories` and `reasoning` fall back to
 *   `[]` / `""` when the model omits them; any extra keys are passed through.
 * @throws Error if the frame download or the Ollama request returns a non-2xx
 *   status, if the reply carries no message content, or if the verdict has no
 *   boolean `flagged` field.
 * @throws SyntaxError if the model's reply is not valid JSON.
 */
async function moderateFrame(imageUrl: string): Promise<ModerationResult> {
  const imageRes = await fetch(imageUrl);
  if (!imageRes.ok) {
    // Otherwise an error page would be encoded and classified as the frame.
    throw new Error(`Failed to download frame ${imageUrl}: HTTP ${imageRes.status}`);
  }
  const base64 = Buffer.from(await imageRes.arrayBuffer()).toString("base64");

  const res = await fetch(OLLAMA_URL, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "llama4-scout",
      stream: false,
      // Ask Ollama to constrain the reply to valid JSON so JSON.parse below
      // does not trip over prose or code fences around the verdict.
      format: "json",
      messages: [
        {
          role: "system",
          content: `You are a content moderator. Classify the image against this policy:

- violence: graphic violence or gore
- nudity: explicit nudity or sexual content
- hate: hate symbols, slurs, or extremist imagery
- drugs: illegal drug use or paraphernalia

Respond with JSON only: {"flagged": boolean, "categories": string[], "reasoning": string}`,
        },
        {
          role: "user",
          content: "Classify this image.",
          images: [base64],
        },
      ],
    }),
  });
  if (!res.ok) {
    throw new Error(`Ollama request failed: HTTP ${res.status}`);
  }

  const data = (await res.json()) as { message?: { content?: unknown } };
  const content = data.message?.content;
  if (typeof content !== "string") {
    throw new Error("Ollama response did not contain message content");
  }

  const parsed: unknown = JSON.parse(content);
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    throw new Error(`Unexpected moderation response: ${content}`);
  }
  const verdict = parsed as Record<string, unknown>;

  // Fail closed: a verdict without a boolean `flagged` must not be read as
  // "not flagged" by callers that filter on it.
  const flagged = verdict["flagged"];
  if (typeof flagged !== "boolean") {
    throw new Error(`Moderation response is missing a boolean "flagged": ${content}`);
  }

  const rawCategories = verdict["categories"];
  const categories = Array.isArray(rawCategories)
    ? rawCategories.filter((c): c is string => typeof c === "string")
    : [];
  const rawReasoning = verdict["reasoning"];
  const reasoning = typeof rawReasoning === "string" ? rawReasoning : "";

  return { ...verdict, flagged, categories, reasoning };
}

import base64
import json
import requests

OLLAMA_URL = "http://localhost:11434/api/chat"

def moderate_frame(image_url: str) -> dict:
    """Download one frame, classify it with the model on Ollama, return the verdict.

    Args:
        image_url: URL of the frame image to classify.

    Returns:
        The parsed verdict with a boolean ``flagged``, a list ``categories``
        and a string ``reasoning``. The last two fall back to ``[]`` / ``""``
        when the model omits them; any extra keys are passed through.

    Raises:
        requests.HTTPError: If the frame download or the Ollama request
            returns a 4xx/5xx status.
        ValueError: If the reply is not a JSON object with a boolean
            ``flagged`` (``json.JSONDecodeError`` is a ``ValueError``).
    """
    image_res = requests.get(image_url)
    # Otherwise an error page would be encoded and classified as the frame.
    image_res.raise_for_status()
    b64 = base64.b64encode(image_res.content).decode()

    res = requests.post(OLLAMA_URL, json={
        "model": "llama4-scout",
        "stream": False,
        # Ask Ollama to constrain the reply to valid JSON so json.loads below
        # does not trip over prose or code fences around the verdict.
        "format": "json",
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a content moderator. Classify the image against this policy:\n"
                    "- violence: graphic violence or gore\n"
                    "- nudity: explicit nudity or sexual content\n"
                    "- hate: hate symbols, slurs, or extremist imagery\n"
                    "- drugs: illegal drug use or paraphernalia\n\n"
                    'Respond with JSON only: {"flagged": boolean, "categories": [], "reasoning": ""}'
                ),
            },
            {
                "role": "user",
                "content": "Classify this image.",
                "images": [b64],
            },
        ],
    })
    res.raise_for_status()

    verdict = json.loads(res.json()["message"]["content"])
    # Fail closed: a verdict without a boolean "flagged" must not be read as
    # "not flagged" by callers that filter on it.
    if not isinstance(verdict, dict) or not isinstance(verdict.get("flagged"), bool):
        raise ValueError(f"Unexpected moderation response: {verdict!r}")

    categories = verdict.get("categories")
    reasoning = verdict.get("reasoning")
    return {
        **verdict,
        "categories": (
            [c for c in categories if isinstance(c, str)]
            if isinstance(categories, list)
            else []
        ),
        "reasoning": reasoning if isinstance(reasoning, str) else "",
    }

# Download the frame and base64-encode it.
# GNU base64 wraps its output at 76 columns; strip the line breaks so the
# value is a single line that is valid inside a JSON string.
IMAGE_B64=$(curl -s "$FRAME_URL" | base64 | tr -d '\n')

# "format": "json" asks Ollama to constrain the reply to valid JSON.
curl -s http://localhost:11434/api/chat \
  -H "Content-Type: application/json" \
  -d "$(cat <<EOF
{
  "model": "llama4-scout",
  "stream": false,
  "format": "json",
  "messages": [
    {
      "role": "system",
      "content": "You are a content moderator. Classify the image against this policy:\n- violence: graphic violence or gore\n- nudity: explicit nudity or sexual content\n- hate: hate symbols, slurs, or extremist imagery\n- drugs: illegal drug use or paraphernalia\n\nRespond with JSON only: {\"flagged\": boolean, \"categories\": [], \"reasoning\": \"\"}"
    },
    {
      "role": "user",
      "content": "Classify this image.",
      "images": ["$IMAGE_B64"]
    }
  ]
}
EOF
)"

Put it together

Run the full pipeline: extract frames, classify each one, flag the video if any frame violates policy.

/**
 * Runs the full pipeline for one video: submits an Ittybit image job per
 * timestamp, waits for the frames, classifies each frame, and aggregates the
 * per-frame verdicts.
 *
 * `pollForResult` and `moderateFrame` are defined outside this function.
 * NOTE(review): `pollForResult` is presumably expected to resolve with the
 * frame URL for a task id — confirm against its implementation.
 *
 * @param videoUrl - URL of the video, sent to Ittybit as the job `input`.
 * @param timestamps - Offsets to sample, sent as `options.start`. Defaults to
 *   the fixed list the function used previously.
 * @returns `approved` is true when no frame was flagged; `violations` holds
 *   the categories and reasoning of every flagged frame.
 * @throws Error if `timestamps` is empty or Ittybit answers a job request
 *   with a non-2xx status.
 */
async function moderateVideo(
  videoUrl: string,
  timestamps: readonly number[] = [0, 5, 10, 15, 20, 30, 45, 60],
) {
  // With nothing sampled there is nothing to flag, which would otherwise
  // approve the video without looking at it.
  if (timestamps.length === 0) {
    throw new Error("moderateVideo requires at least one timestamp");
  }

  // 1. Extract frames via Ittybit
  const tasks = await Promise.all(
    timestamps.map(async (start) => {
      const res = await fetch("https://api.ittybit.com/jobs", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          input: videoUrl,
          kind: "image",
          options: { start, width: 512, format: "jpeg" },
        }),
      });
      if (!res.ok) {
        // An error body has no task id; fail here rather than poll `undefined`.
        throw new Error(`Ittybit job request for start=${start} failed: HTTP ${res.status}`);
      }
      return res.json();
    }),
  );

  // 2. Poll until frames are ready (simplified)
  const frameUrls = await Promise.all(
    tasks.map((task) => pollForResult(task.id)),
  );

  // 3. Classify each frame with Llama
  const results = await Promise.all(frameUrls.map((url) => moderateFrame(url)));

  // 4. Aggregate verdict: a single flagged frame rejects the whole video.
  const flagged = results.filter((r) => r.flagged);
  return {
    approved: flagged.length === 0,
    violations: flagged.map((r) => ({
      categories: r.categories,
      reasoning: r.reasoning,
    })),
  };
}

def moderate_video(video_url: str, timestamps=None) -> dict:
    """Run the full pipeline for one video and aggregate the per-frame verdicts.

    Submits one Ittybit image job per timestamp for ``video_url``, waits for
    the frames, classifies each one with ``moderate_frame`` and flags the
    video if any frame is flagged.

    ``poll_for_result`` and ``moderate_frame`` are defined outside this
    function. NOTE(review): ``poll_for_result`` is presumably expected to
    return the frame URL for a task id — confirm against its implementation.

    Args:
        video_url: URL of the video, sent to Ittybit as the job ``input``.
        timestamps: Offsets to sample, sent as ``options.start``. Defaults to
            the fixed list the function used previously.

    Returns:
        ``{"approved": bool, "violations": [...]}`` where each violation holds
        the categories and reasoning of one flagged frame.

    Raises:
        ValueError: If ``timestamps`` is empty.
        requests.HTTPError: If Ittybit answers a job request with 4xx/5xx.
    """
    import os  # local import so this block does not rely on a file-level one

    if timestamps is None:
        timestamps = [0, 5, 10, 15, 20, 30, 45, 60]
    # With nothing sampled there is nothing to flag, which would otherwise
    # approve the video without looking at it.
    if not timestamps:
        raise ValueError("moderate_video requires at least one timestamp")

    headers = {"Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}"}

    def submit_frame_job(start):
        # Submit the job here, with this video's URL, instead of calling the
        # one-argument extract_frame helper, which targets a fixed example URL.
        res = requests.post(
            "https://api.ittybit.com/jobs",
            headers=headers,
            json={
                "input": video_url,
                "kind": "image",
                "options": {"start": start, "width": 512, "format": "jpeg"},
            },
        )
        res.raise_for_status()
        return res.json()

    # 1. Extract frames via Ittybit
    with ThreadPoolExecutor() as pool:
        tasks = list(pool.map(submit_frame_job, timestamps))

    # 2. Poll until frames are ready (simplified)
    frame_urls = [poll_for_result(task["id"]) for task in tasks]

    # 3. Classify each frame with Llama
    results = [moderate_frame(url) for url in frame_urls]

    # 4. Aggregate verdict: a single flagged frame rejects the whole video.
    flagged = [r for r in results if r["flagged"]]
    return {
        "approved": len(flagged) == 0,
        "violations": [
            {"categories": r["categories"], "reasoning": r["reasoning"]}
            for r in flagged
        ],
    }

Tuning coverage

The number and spacing of frames depend on your content and risk tolerance.

Strategy	Timestamps	Use case
Quick scan	Every 10s	Low-risk, high-volume
Thorough	Every 2s	User-facing platforms
Scene-based	After each cut	Highest accuracy, requires scene detection

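For longer videos, sample more frames. A 5-minute video at 2-second intervals is 150 frames — Llama 4 Scout processes each in under a second on a modern GPU. Generate the timestamp list from the video's duration rather than hard-coding it, so the samples cover the whole video and not just the first minute.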

Video moderation with Llama and Ittybit

Extract frames with Ittybit

Classify frames with Llama

Put it together

Tuning coverage

See also