AI-picked thumbnails with Gemini and Ittybit
A good thumbnail is the difference between a click and a scroll. Instead of grabbing the first frame or a random midpoint, use Gemini to watch the video and identify the most visually compelling moments — then extract and optimize frames at those exact timestamps with Ittybit.
Define the scene analysis tool
Give Gemini a function declaration that returns an array of timestamps. Each entry represents a visually strong moment suitable for a thumbnail.
import { GoogleGenerativeAI, FunctionDeclarationSchemaType } from "@google/generative-ai";
// Fail fast with a clear message when the key is missing, rather than handing
// `undefined` to the client behind a non-null assertion.
const geminiApiKey = process.env.GEMINI_API_KEY;
if (!geminiApiKey) {
  throw new Error("GEMINI_API_KEY environment variable is not set");
}
const genai = new GoogleGenerativeAI(geminiApiKey);
// Shape of a single thumbnail candidate: where it is in the video and why
// the model chose it. Both fields are mandatory.
// NOTE(review): newer releases of @google/generative-ai may expose this enum
// as `SchemaType` — confirm against the installed SDK version.
const thumbnailMomentSchema = {
  type: FunctionDeclarationSchemaType.OBJECT,
  properties: {
    timestamp: {
      type: FunctionDeclarationSchemaType.NUMBER,
      description: "Time in seconds",
    },
    reason: {
      type: FunctionDeclarationSchemaType.STRING,
      description: "Why this moment works as a thumbnail",
    },
  },
  required: ["timestamp", "reason"],
};

/**
 * Function declaration for the `pick_thumbnails` tool.
 *
 * The single required argument, `moments`, is an array of
 * `{ timestamp, reason }` objects.
 */
const pickThumbnailsTool = {
  name: "pick_thumbnails",
  description:
    "Select the best moments from a video for use as thumbnails. " +
    "Return timestamps where the visual content is most compelling " +
    "-- clear subjects, good lighting, expressive faces, or dramatic action.",
  parameters: {
    type: FunctionDeclarationSchemaType.OBJECT,
    properties: {
      moments: {
        type: FunctionDeclarationSchemaType.ARRAY,
        description: "Array of thumbnail candidates, ordered by visual quality",
        items: thumbnailMomentSchema,
      },
    },
    required: ["moments"],
  },
};
import google.generativeai as genai
import os
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
pick_thumbnails_tool = genai.protos.Tool(
function_declarations=[
genai.protos.FunctionDeclaration(
name="pick_thumbnails",
description=(
"Select the best moments from a video for use as thumbnails. "
"Return timestamps where the visual content is most compelling "
"-- clear subjects, good lighting, expressive faces, or dramatic action."
),
parameters=genai.protos.Schema(
type=genai.protos.Type.OBJECT,
properties={
"moments": genai.protos.Schema(
type=genai.protos.Type.ARRAY,
description="Array of thumbnail candidates, ordered by visual quality",
items=genai.protos.Schema(
type=genai.protos.Type.OBJECT,
properties={
"timestamp": genai.protos.Schema(
type=genai.protos.Type.NUMBER,
description="Time in seconds",
),
"reason": genai.protos.Schema(
type=genai.protos.Type.STRING,
description="Why this moment works as a thumbnail",
),
},
required=["timestamp", "reason"],
),
),
},
required=["moments"],
),
)
]
) Send the video to Gemini
Pass the video and ask Gemini to pick the best thumbnail moments. It watches the entire video and returns structured timestamps via the function call.
// Register the tool on the model so it can answer with a structured call.
const model = genai.getGenerativeModel({
  model: "gemini-2.0-flash",
  tools: [{ functionDeclarations: [pickThumbnailsTool] }],
});

const videoUrl = "https://example.com/videos/product-launch.mp4";

// NOTE(review): `fileUri` presumably has to be a URI the Gemini API can read
// itself (e.g. a Files API upload) — confirm a plain HTTPS URL is accepted.
const result = await model.generateContent([
  {
    fileData: { mimeType: "video/mp4", fileUri: videoUrl },
  },
  {
    text: `Watch this video and pick the 3-5 best moments for thumbnails.
Look for: clear subjects in focus, good lighting, expressive faces,
dramatic action, or visually striking compositions.
Avoid: blurry frames, transitions, dark scenes, or text-heavy slides.
Call the pick_thumbnails function with your selections.`,
  },
]);

const call = result.response.functionCalls()?.[0];
if (!call || call.name !== "pick_thumbnails") {
  throw new Error("Gemini did not return a pick_thumbnails function call");
}

// The arguments come from the model, so check their shape at runtime instead
// of trusting a type assertion.
const rawMoments: unknown = (call.args as { moments?: unknown }).moments;
if (!Array.isArray(rawMoments)) {
  throw new Error("pick_thumbnails call did not include a moments array");
}

// Keep only entries that have a finite numeric timestamp and a string reason.
const moments: Array<{ timestamp: number; reason: string }> = rawMoments.filter(
  (m): m is { timestamp: number; reason: string } =>
    typeof m === "object" &&
    m !== null &&
    typeof (m as { timestamp?: unknown }).timestamp === "number" &&
    Number.isFinite((m as { timestamp: number }).timestamp) &&
    typeof (m as { reason?: unknown }).reason === "string"
);
if (moments.length !== rawMoments.length) {
  console.warn(
    `Ignored ${rawMoments.length - moments.length} malformed thumbnail candidates`
  );
}

console.log(`Gemini picked ${moments.length} moments:`);
for (const m of moments) {
  console.log(` ${m.timestamp}s -- ${m.reason}`);
}
// e.g.
// 14.2s -- Speaker mid-gesture with product visible, well-lit
// 38.7s -- Close-up of product with clear detail
// 52.1s -- Audience reaction shot, high energy
# Register the tool on the model so it can answer with a structured call.
model = genai.GenerativeModel(
    model_name="gemini-2.0-flash",
    tools=[pick_thumbnails_tool],
)

video_url = "https://example.com/videos/product-launch.mp4"

# NOTE(review): file_uri presumably has to be a URI the Gemini API can read
# itself (e.g. a Files API upload) — confirm a plain HTTPS URL is accepted.
response = model.generate_content([
    genai.protos.Part(
        file_data=genai.protos.FileData(
            mime_type="video/mp4", file_uri=video_url
        )
    ),
    (
        "Watch this video and pick the 3-5 best moments for thumbnails. "
        "Look for: clear subjects in focus, good lighting, expressive faces, "
        "dramatic action, or visually striking compositions. "
        "Avoid: blurry frames, transitions, dark scenes, or text-heavy slides. "
        "Call the pick_thumbnails function with your selections."
    ),
])

# Look at every part rather than only parts[0], so a function call that is
# not the first part is still found. An empty candidate list is reported
# through the same ValueError instead of an IndexError.
parts = response.candidates[0].content.parts if response.candidates else []
function_calls = [
    part.function_call for part in parts if part.function_call.name
]
call = function_calls[0] if function_calls else None
if call is None or call.name != "pick_thumbnails":
    raise ValueError("Gemini did not return a pick_thumbnails function call")

# Copy each entry into a plain dict for the steps that follow.
moments = [dict(m) for m in call.args["moments"]]

print(f"Gemini picked {len(moments)} moments:")
for m in moments:
    print(f" {m['timestamp']}s -- {m['reason']}")
# e.g.
# 14.2s -- Speaker mid-gesture with product visible, well-lit
# 38.7s -- Close-up of product with clear detail
# 52.1s -- Audience reaction shot, high energy

Extract thumbnails with Ittybit
For each timestamp Gemini picked, create an Ittybit image task. The kind: "image" task with a video input extracts a frame at the given start time and outputs an optimized image.
/**
 * Create an Ittybit image task that extracts one frame from a video.
 *
 * @param videoUrl - Source video URL, sent as the task `input`.
 * @param timestamp - Frame position, sent as `options.start`.
 * @param options - Optional output overrides; defaults are 1280x720 "webp".
 * @returns The parsed JSON body of the create-task response.
 * @throws Error when ITTYBIT_API_KEY is unset or the API responds with a
 *   non-2xx status.
 */
async function extractThumbnail(
  videoUrl: string,
  timestamp: number,
  options: { width?: number; height?: number; format?: string } = {}
) {
  // Without this guard a missing key is sent as the literal "Bearer undefined".
  const apiKey = process.env.ITTYBIT_API_KEY;
  if (!apiKey) {
    throw new Error("ITTYBIT_API_KEY environment variable is not set");
  }

  const task = {
    kind: "image",
    input: videoUrl,
    options: {
      start: timestamp,
      width: options.width ?? 1280,
      height: options.height ?? 720,
      format: options.format ?? "webp",
    },
  };

  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(task),
  });

  // Report HTTP failures here; otherwise the caller receives an error payload
  // that it would treat as a created task.
  if (!res.ok) {
    const detail = await res.text().catch(() => "");
    throw new Error(
      `Ittybit task creation failed: ${res.status} ${res.statusText}` +
        (detail ? ` - ${detail}` : "")
    );
  }
  return res.json();
}
// Start one extraction per moment Gemini chose and wait for every
// create-task response.
const tasks = await Promise.all(
  moments.map(({ timestamp }) => extractThumbnail(videoUrl, timestamp))
);

console.log(`Created ${tasks.length} thumbnail tasks`);
tasks.forEach((created) => {
  console.log(` ${created.id}: ${created.status}`);
});
import requests
# Read once at import time; raises KeyError immediately if the key is missing.
ITTYBIT_API_KEY = os.environ["ITTYBIT_API_KEY"]


def extract_thumbnail(
    video_url: str,
    timestamp: float,
    width: int = 1280,
    height: int = 720,
    format: str = "webp",
) -> dict:
    """Create an Ittybit image task that extracts one frame from a video.

    Args:
        video_url: Source video URL, sent as the task ``input``.
        timestamp: Frame position, sent as ``options.start``.
        width: Output width sent in the task options.
        height: Output height sent in the task options.
        format: Output image format sent in the task options.

    Returns:
        The parsed JSON body of the create-task response.

    Raises:
        requests.HTTPError: If the API responds with a 4xx/5xx status.
        requests.Timeout: If the API does not respond within 30 seconds.
    """
    task = {
        "kind": "image",
        "input": video_url,
        "options": {
            "start": timestamp,
            "width": width,
            "height": height,
            "format": format,
        },
    }
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json=task,
        # requests waits indefinitely unless a timeout is given.
        timeout=30,
    )
    # Report HTTP failures here; otherwise the caller receives an error
    # payload that it would treat as a created task.
    res.raise_for_status()
    return res.json()
# Extract a thumbnail for each moment Gemini identified
tasks = [extract_thumbnail(video_url, m["timestamp"]) for m in moments]
print(f"Created {len(tasks)} thumbnail tasks")
for task in tasks:
print(f" {task['id']}: {task['status']}") Poll for completion
Each task moves through queued -> processing -> succeeded. Poll until all thumbnails are ready.
/**
 * Poll an Ittybit task until it reaches a terminal status.
 *
 * @param taskId - Identifier of the task to poll.
 * @param options - Optional tuning: `intervalMs` is the delay between polls
 *   (default 2000) and `timeoutMs` is the total time budget before giving up
 *   (default 10 minutes).
 * @returns The task payload once its status is "succeeded".
 * @throws Error when ITTYBIT_API_KEY is unset, the API responds with a
 *   non-2xx status, the task status is "failed", or the time budget runs out.
 */
async function waitForTask(
  taskId: string,
  options: { intervalMs?: number; timeoutMs?: number } = {}
): Promise<any> {
  const intervalMs = options.intervalMs ?? 2000;
  const timeoutMs = options.timeoutMs ?? 10 * 60 * 1000;
  const deadline = Date.now() + timeoutMs;

  // Without this guard a missing key is sent as the literal "Bearer undefined".
  const apiKey = process.env.ITTYBIT_API_KEY;
  if (!apiKey) {
    throw new Error("ITTYBIT_API_KEY environment variable is not set");
  }

  for (;;) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${apiKey}`,
      },
    });
    // An error response carries no usable status field; stop instead of
    // polling it until the timeout.
    if (!res.ok) {
      throw new Error(
        `Ittybit task lookup failed: ${res.status} ${res.statusText}`
      );
    }

    const task = await res.json();
    if (task.status === "succeeded") return task;
    if (task.status === "failed") {
      // Serialize non-string errors so the message is not "[object Object]".
      const reason =
        typeof task.error === "string" ? task.error : JSON.stringify(task.error);
      throw new Error(`Task failed: ${reason}`);
    }

    // Bound the loop so a task that never reaches a terminal status cannot
    // keep the caller waiting forever.
    if (Date.now() >= deadline) {
      throw new Error(`Timed out after ${timeoutMs}ms waiting for task ${taskId}`);
    }
    await new Promise<void>((resolve) => setTimeout(resolve, intervalMs));
  }
}
// Poll all tasks concurrently; results come back in the same order as `tasks`,
// so index i lines up with moments[i].
const completed = await Promise.all(tasks.map(({ id }) => waitForTask(id)));

completed.forEach((finished, i) => {
  console.log(`Thumbnail ${i + 1}: ${finished.output_url}`);
  console.log(` ${moments[i].timestamp}s -- ${moments[i].reason}`);
});
import time
def wait_for_task(
    task_id: str,
    poll_interval: float = 2.0,
    timeout: float = 600.0,
) -> dict:
    """Poll an Ittybit task until it reaches a terminal status.

    Args:
        task_id: Identifier of the task to poll.
        poll_interval: Seconds to sleep between polls.
        timeout: Total seconds to wait before giving up.

    Returns:
        The task payload once its status is "succeeded".

    Raises:
        RuntimeError: If the task status is "failed".
        TimeoutError: If the task is not finished within ``timeout`` seconds.
        requests.HTTPError: If the API responds with a 4xx/5xx status.
    """
    deadline = time.monotonic() + timeout
    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            # requests waits indefinitely unless a timeout is given.
            timeout=30,
        )
        # An error response carries no usable status field; stop instead of
        # failing later on a missing key.
        res.raise_for_status()
        task = res.json()

        status = task.get("status")
        if status == "succeeded":
            return task
        if status == "failed":
            # .get() avoids masking the failure with a KeyError when the
            # payload has no "error" field.
            raise RuntimeError(f"Task failed: {task.get('error')}")

        # Bound the loop so a task that never reaches a terminal status
        # cannot block the caller forever.
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out after {timeout}s waiting for task {task_id}"
            )
        time.sleep(poll_interval)
completed = [wait_for_task(task["id"]) for task in tasks]
for i, task in enumerate(completed):
print(f"Thumbnail {i + 1}: {task['output_url']}")
print(f" {moments[i]['timestamp']}s -- {moments[i]['reason']}") See also
- Extract thumbnails from video — manual thumbnail extraction without AI
- Responsive images — generate multiple sizes from each thumbnail
- Video editing with Gemini — use Gemini to trim and edit video