AI-picked thumbnails with Gemini and Ittybit

View Markdown

A good thumbnail is the difference between a click and a scroll. Instead of grabbing the first frame or a random midpoint, use Gemini to watch the video and identify the most visually compelling moments — then extract and optimize frames at those exact timestamps with Ittybit.

Define the scene analysis tool

Give Gemini a function declaration that returns an array of timestamps. Each entry represents a visually strong moment suitable for a thumbnail.

import { GoogleGenerativeAI, FunctionDeclarationSchemaType } from "@google/generative-ai";

// Gemini client; reads the API key from the GEMINI_API_KEY environment variable.
const genai = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);

// Schema for a single thumbnail candidate: a position in the video plus the
// model's justification for picking it. Both fields are required.
const thumbnailMomentSchema = {
  type: FunctionDeclarationSchemaType.OBJECT,
  properties: {
    timestamp: {
      type: FunctionDeclarationSchemaType.NUMBER,
      description: "Time in seconds",
    },
    reason: {
      type: FunctionDeclarationSchemaType.STRING,
      description: "Why this moment works as a thumbnail",
    },
  },
  required: ["timestamp", "reason"],
};

/**
 * Function declaration handed to Gemini. The model answers by "calling"
 * `pick_thumbnails` with a `moments` array of thumbnail candidates.
 */
const pickThumbnailsTool = {
  name: "pick_thumbnails",
  description:
    "Select the best moments from a video for use as thumbnails. Return timestamps where the visual content is most compelling -- clear subjects, good lighting, expressive faces, or dramatic action.",
  parameters: {
    type: FunctionDeclarationSchemaType.OBJECT,
    properties: {
      moments: {
        type: FunctionDeclarationSchemaType.ARRAY,
        description: "Array of thumbnail candidates, ordered by visual quality",
        items: thumbnailMomentSchema,
      },
    },
    required: ["moments"],
  },
};
import google.generativeai as genai
import os

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Short aliases keep the nested schema definition readable.
_Schema = genai.protos.Schema
_Type = genai.protos.Type

# A single thumbnail candidate: a position in the video plus the model's
# justification for picking it. Both fields are required.
_moment_schema = _Schema(
    type=_Type.OBJECT,
    properties={
        "timestamp": _Schema(
            type=_Type.NUMBER,
            description="Time in seconds",
        ),
        "reason": _Schema(
            type=_Type.STRING,
            description="Why this moment works as a thumbnail",
        ),
    },
    required=["timestamp", "reason"],
)

# The function Gemini "calls" to report its picks; its only argument is the
# `moments` array.
_pick_thumbnails_declaration = genai.protos.FunctionDeclaration(
    name="pick_thumbnails",
    description=(
        "Select the best moments from a video for use as thumbnails. "
        "Return timestamps where the visual content is most compelling "
        "-- clear subjects, good lighting, expressive faces, or dramatic action."
    ),
    parameters=_Schema(
        type=_Type.OBJECT,
        properties={
            "moments": _Schema(
                type=_Type.ARRAY,
                description="Array of thumbnail candidates, ordered by visual quality",
                items=_moment_schema,
            ),
        },
        required=["moments"],
    ),
)

pick_thumbnails_tool = genai.protos.Tool(
    function_declarations=[_pick_thumbnails_declaration]
)

Send the video to Gemini

Pass the video and ask Gemini to pick the best thumbnail moments. It watches the entire video and returns structured timestamps via the function call.

/** A thumbnail candidate as returned by the pick_thumbnails function call. */
interface ThumbnailMoment {
  timestamp: number;
  reason: string;
}

/**
 * Runtime guard for one entry of the `moments` array. Function-call arguments
 * are model output, so they are validated instead of trusted via a type assertion.
 */
function isThumbnailMoment(value: unknown): value is ThumbnailMoment {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.timestamp === "number" &&
    Number.isFinite(candidate.timestamp) &&
    candidate.timestamp >= 0 &&
    typeof candidate.reason === "string"
  );
}

// Model configured with the pick_thumbnails tool so it can answer with a function call.
const model = genai.getGenerativeModel({
  model: "gemini-2.0-flash",
  tools: [{ functionDeclarations: [pickThumbnailsTool] }],
});

// NOTE(review): confirm the Gemini API accepts an arbitrary HTTPS URL as
// `fileUri`; the video may need to be uploaded through the Files API first.
const videoUrl = "https://example.com/videos/product-launch.mp4";

const result = await model.generateContent([
  {
    fileData: { mimeType: "video/mp4", fileUri: videoUrl },
  },
  {
    text: `Watch this video and pick the 3-5 best moments for thumbnails.
Look for: clear subjects in focus, good lighting, expressive faces,
dramatic action, or visually striking compositions.
Avoid: blurry frames, transitions, dark scenes, or text-heavy slides.
Call the pick_thumbnails function with your selections.`,
  },
]);

// Look through every function call in the response rather than only the first one.
const call = result.response
  .functionCalls()
  ?.find((fc) => fc.name === "pick_thumbnails");
if (!call) {
  throw new Error("Gemini did not return a pick_thumbnails function call");
}

// Keep only well-formed entries; fail loudly if nothing usable remains so the
// later extraction step never runs on an empty or malformed list.
const rawMoments = (call.args as { moments?: unknown }).moments;
const moments: ThumbnailMoment[] = Array.isArray(rawMoments)
  ? rawMoments.filter(isThumbnailMoment)
  : [];
if (moments.length === 0) {
  throw new Error("Gemini returned no usable thumbnail moments");
}

console.log(`Gemini picked ${moments.length} moments:`);
for (const m of moments) {
  console.log(`  ${m.timestamp}s -- ${m.reason}`);
}
// e.g.
// 14.2s -- Speaker mid-gesture with product visible, well-lit
// 38.7s -- Close-up of product with clear detail
// 52.1s -- Audience reaction shot, high energy
# Model configured with the pick_thumbnails tool so it can answer with a function call.
model = genai.GenerativeModel(
    model_name="gemini-2.0-flash",
    tools=[pick_thumbnails_tool],
)

# NOTE(review): confirm the Gemini API accepts an arbitrary HTTPS URL as
# file_uri; the video may need to be uploaded through the Files API first.
video_url = "https://example.com/videos/product-launch.mp4"

response = model.generate_content([
    genai.protos.Part(
        file_data=genai.protos.FileData(
            mime_type="video/mp4", file_uri=video_url
        )
    ),
    (
        "Watch this video and pick the 3-5 best moments for thumbnails. "
        "Look for: clear subjects in focus, good lighting, expressive faces, "
        "dramatic action, or visually striking compositions. "
        "Avoid: blurry frames, transitions, dark scenes, or text-heavy slides. "
        "Call the pick_thumbnails function with your selections."
    ),
])

# The function call is not guaranteed to be the first part (the model may emit
# text before it), so scan every part of the first candidate.
parts = response.candidates[0].content.parts if response.candidates else []
call = next(
    (
        part.function_call
        for part in parts
        if part.function_call.name == "pick_thumbnails"
    ),
    None,
)
if call is None:
    raise ValueError("Gemini did not return a pick_thumbnails function call")

# Convert each entry to a plain dict; treat a missing or empty `moments`
# argument as an explicit error instead of a KeyError or a silent no-op.
raw_moments = call.args["moments"] if "moments" in call.args else []
moments = [dict(m) for m in raw_moments]
if not moments:
    raise ValueError("Gemini returned no thumbnail moments")

print(f"Gemini picked {len(moments)} moments:")
for m in moments:
    print(f"  {m['timestamp']}s -- {m['reason']}")
# e.g.
#   14.2s -- Speaker mid-gesture with product visible, well-lit
#   38.7s -- Close-up of product with clear detail
#   52.1s -- Audience reaction shot, high energy

Extract thumbnails with Ittybit

For each timestamp Gemini picked, create an Ittybit image task. The kind: "image" task with a video input extracts a frame at the given start time and outputs an optimized image.

/**
 * Creates an Ittybit image task that grabs a frame from a video.
 *
 * @param videoUrl - URL of the source video, sent as the task `input`.
 * @param timestamp - Position of the frame, sent as the task's `start` option.
 * @param options - Output overrides; defaults are 1280x720 WebP.
 * @returns The parsed JSON body of the create-job response.
 * @throws Error when the API answers with a non-2xx status, so an error
 *   payload is never mistaken for a created job.
 */
async function extractThumbnail(
  videoUrl: string,
  timestamp: number,
  options: { width?: number; height?: number; format?: string } = {}
) {
  const task = {
    kind: "image",
    input: videoUrl,
    options: {
      start: timestamp,
      width: options.width ?? 1280,
      height: options.height ?? 720,
      format: options.format ?? "webp",
    },
  };

  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(task),
  });

  if (!res.ok) {
    // Include the response body when readable; it usually explains the rejection.
    const detail = await res.text().catch(() => "");
    throw new Error(
      `Ittybit job creation failed (${res.status} ${res.statusText})` +
        (detail ? `: ${detail}` : "")
    );
  }

  return res.json();
}

// Start one extraction job per Gemini-selected moment; the requests are
// issued together and awaited as a group.
const pendingTasks = moments.map(({ timestamp }) =>
  extractThumbnail(videoUrl, timestamp)
);
const tasks = await Promise.all(pendingTasks);

console.log(`Created ${tasks.length} thumbnail tasks`);
tasks.forEach((created) => {
  console.log(`  ${created.id}: ${created.status}`);
});
import requests

ITTYBIT_API_KEY = os.environ["ITTYBIT_API_KEY"]


def extract_thumbnail(
    video_url: str,
    timestamp: float,
    width: int = 1280,
    height: int = 720,
    format: str = "webp",
    timeout: float = 30.0,
) -> dict:
    """Create an Ittybit image task that grabs a frame from a video.

    Args:
        video_url: URL of the source video, sent as the task ``input``.
        timestamp: Position of the frame, sent as the task's ``start`` option.
        width: Output width sent to the API.
        height: Output height sent to the API.
        format: Output image format sent to the API.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The parsed JSON body of the create-job response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    task = {
        "kind": "image",
        "input": video_url,
        "options": {
            "start": timestamp,
            "width": width,
            "height": height,
            "format": format,
        },
    }

    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json=task,
        # Without a timeout, requests can block forever on a stalled connection.
        timeout=timeout,
    )
    # Surface auth/validation errors here instead of returning an error payload
    # that later fails with a KeyError on "id".
    res.raise_for_status()
    return res.json()


# Submit one extraction job per moment Gemini identified, in order.
tasks = []
for moment in moments:
    tasks.append(extract_thumbnail(video_url, moment["timestamp"]))

print(f"Created {len(tasks)} thumbnail tasks")
for created in tasks:
    print(f"  {created['id']}: {created['status']}")

Poll for completion

Each task moves through queued -> processing -> succeeded, or ends in failed if something goes wrong. Poll until all thumbnails are ready, and stop as soon as any task reports failed.

/**
 * Polls an Ittybit job until it reaches a terminal state.
 *
 * @param taskId - ID of the job to poll.
 * @param options - `intervalMs` is the delay between polls (default 2000);
 *   `timeoutMs` is the total time budget before giving up (default 10 minutes).
 * @returns The job payload once its status is "succeeded".
 * @throws Error when the job reports "failed", when the API answers with a
 *   non-2xx status, or when the time budget is exhausted.
 */
async function waitForTask(
  taskId: string,
  options: { intervalMs?: number; timeoutMs?: number } = {}
): Promise<any> {
  const intervalMs = options.intervalMs ?? 2000;
  const timeoutMs = options.timeoutMs ?? 10 * 60 * 1000;
  const deadline = Date.now() + timeoutMs;

  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      },
    });
    if (!res.ok) {
      throw new Error(
        `Ittybit returned ${res.status} while polling task ${taskId}`
      );
    }
    const task = await res.json();

    if (task.status === "succeeded") return task;
    if (task.status === "failed") {
      // Serialize non-string errors so the message is not "[object Object]".
      const reason =
        typeof task.error === "string" ? task.error : JSON.stringify(task.error);
      throw new Error(`Task failed: ${reason}`);
    }

    // A job stuck in a non-terminal state must not keep the caller polling forever.
    if (Date.now() >= deadline) {
      throw new Error(
        `Timed out after ${timeoutMs}ms waiting for task ${taskId} (last status: ${task.status})`
      );
    }

    await new Promise((r) => setTimeout(r, intervalMs));
  }
}

// Poll every job concurrently; Promise.all keeps results in the same order as
// `tasks`, so index i still lines up with moments[i].
const completed = await Promise.all(tasks.map(({ id }) => waitForTask(id)));

completed.forEach((finished, index) => {
  console.log(`Thumbnail ${index + 1}: ${finished.output_url}`);
  const picked = moments[index];
  console.log(`  ${picked.timestamp}s -- ${picked.reason}`);
});
import time


def wait_for_task(
    task_id: str,
    poll_interval: float = 2.0,
    timeout: float = 600.0,
) -> dict:
    """Poll an Ittybit job until it reaches a terminal state.

    Args:
        task_id: ID of the job to poll.
        poll_interval: Seconds to sleep between polls.
        timeout: Total seconds to wait before giving up.

    Returns:
        The job payload once its status is ``"succeeded"``.

    Raises:
        Exception: If the job reports ``"failed"``.
        TimeoutError: If the job is still not finished after ``timeout`` seconds.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    deadline = time.monotonic() + timeout

    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            # Bound each request so a stalled connection cannot block forever.
            timeout=30,
        )
        res.raise_for_status()
        task = res.json()

        if task["status"] == "succeeded":
            return task
        if task["status"] == "failed":
            # .get() so a missing "error" field cannot mask the failure with a KeyError.
            raise Exception(f"Task failed: {task.get('error')}")

        # A job stuck in a non-terminal state must not keep the caller polling forever.
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out after {timeout}s waiting for task {task_id} "
                f"(last status: {task['status']})"
            )

        time.sleep(poll_interval)


# Wait for each job in submission order so results stay aligned with `moments`.
completed = []
for pending in tasks:
    completed.append(wait_for_task(pending["id"]))

for position, finished in enumerate(completed, start=1):
    print(f"Thumbnail {position}: {finished['output_url']}")
    picked = moments[position - 1]
    print(f"  {picked['timestamp']}s -- {picked['reason']}")

See also