Summarize video with GPT-4 Vision and Ittybit
Ittybit’s image task extracts a frame from any video at a given timestamp. Pull a handful of keyframes, pass them to GPT-4 Vision as image_url content parts, and you get back a structured summary with chapter titles, scene descriptions, and a TL;DW — no need to send the full video anywhere.
Extract keyframes
Create one image task per timestamp you want to capture. Each task pulls a single frame from the video.
// URL of the source video; sent as the `input` of every frame-extraction job.
const VIDEO_URL = "https://example.com/uploads/product-demo.mp4";
// Offsets, in seconds, at which to capture a frame; one job is created per entry.
const timestamps = [0, 15, 45, 90, 150, 210];
/**
 * Creates an Ittybit `image` job that captures a single frame of `VIDEO_URL`.
 *
 * @param start - Timestamp of the frame, sent to the API as `options.start`.
 * @returns The parsed JSON body of the job-creation response.
 * @throws Error if the API answers with a non-2xx status, so that an error
 *   payload is never mistaken for a created job.
 */
async function extractFrame(start: number) {
  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      kind: "image",
      input: VIDEO_URL,
      options: {
        start,
        width: 1280,
        height: 720,
        format: "jpg",
      },
    }),
  });

  // fetch() only rejects on network failure; HTTP error statuses must be
  // checked explicitly or the error body would be returned as a "job".
  if (!res.ok) {
    const detail = await res.text().catch(() => "");
    throw new Error(
      `Ittybit job creation failed for start=${start}: ${res.status} ${res.statusText} ${detail}`.trim()
    );
  }

  return res.json();
}
// Start one extractFrame call per timestamp, all in flight at once, then wait
// for every one of them to resolve.
const pendingTasks = timestamps.map((start) => extractFrame(start));
const tasks = await Promise.all(pendingTasks);
console.log(`Created ${tasks.length} frame extraction tasks`);
import os
import requests
# URL of the source video; sent as the "input" of every frame-extraction job.
VIDEO_URL = "https://example.com/uploads/product-demo.mp4"
# API key read from the environment; raises KeyError here if the variable is unset.
ITTYBIT_API_KEY = os.environ["ITTYBIT_API_KEY"]
# Offsets, in seconds, at which to capture a frame; one job is created per entry.
timestamps = [0, 15, 45, 90, 150, 210]
def extract_frame(start: int) -> dict:
    """Create an Ittybit ``image`` job that captures a single frame of ``VIDEO_URL``.

    Args:
        start: Timestamp of the frame, sent to the API as ``options.start``.

    Returns:
        The parsed JSON body of the job-creation response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not respond within 30 seconds.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json={
            "kind": "image",
            "input": VIDEO_URL,
            "options": {
                "start": start,
                "width": 1280,
                "height": 720,
                "format": "jpg",
            },
        },
        # requests applies no timeout by default, so a stalled connection
        # would otherwise block indefinitely.
        timeout=30,
    )
    # Fail here rather than returning an error payload as if it were a job.
    res.raise_for_status()
    return res.json()
# Create the jobs one after another, in timestamp order.
tasks = list(map(extract_frame, timestamps))
print(f"Created {len(tasks)} frame extraction tasks")
# Extract a frame at 45 seconds
# Create one image job for the frame at 45 seconds.
curl --request POST "https://api.ittybit.com/jobs" \
--header "Authorization: Bearer $ITTYBIT_API_KEY" \
--header "Content-Type: application/json" \
--data '{
"kind": "image",
"input": "https://example.com/uploads/product-demo.mp4",
"options": {
"start": 45,
"width": 1280,
"height": 720,
"format": "jpg"
}
}'
# Repeat for each timestamp: 0, 15, 90, 150, 210
Poll until frames are ready
Wait for each task to finish and collect the output URLs.
/**
 * Polls an Ittybit job until it reaches a terminal state.
 *
 * @param taskId - ID of the job to poll.
 * @param options - Optional tuning. `intervalMs` is the delay between polls
 *   (default 2000). `timeoutMs` is the total time to keep polling
 *   (default 5 minutes).
 * @returns The job payload once its `status` is `"succeeded"` or `"failed"`.
 * @throws Error if the API answers with a non-2xx status, or if the job has
 *   not reached a terminal state within `timeoutMs`.
 */
async function waitForTask(
  taskId: string,
  options: { intervalMs?: number; timeoutMs?: number } = {}
): Promise<any> {
  const { intervalMs = 2000, timeoutMs = 5 * 60 * 1000 } = options;
  const deadline = Date.now() + timeoutMs;

  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      },
    });

    // An error payload carries no terminal `status`, so without this check a
    // 401/404/500 response would be polled forever.
    if (!res.ok) {
      throw new Error(
        `Ittybit job ${taskId} lookup failed: ${res.status} ${res.statusText}`
      );
    }

    const task = await res.json();
    if (task.status === "succeeded" || task.status === "failed") {
      return task;
    }

    // Give up before sleeping past the deadline instead of polling without bound.
    if (Date.now() + intervalMs > deadline) {
      throw new Error(
        `Timed out after ${timeoutMs} ms waiting for Ittybit job ${taskId}`
      );
    }
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
}
// Poll every job concurrently, then keep the output URL of each job that
// succeeded; failed jobs contribute nothing.
const completed = await Promise.all(tasks.map((task) => waitForTask(task.id)));
const frameUrls = completed.flatMap((task) =>
  task.status === "succeeded" ? [task.output.url] : []
);
console.log(`Got ${frameUrls.length} frames`);
import time
def wait_for_task(task_id: str, poll_interval: float = 2.0, timeout: float = 300.0) -> dict:
    """Poll an Ittybit job until it reaches a terminal state.

    Args:
        task_id: ID of the job to poll.
        poll_interval: Seconds to sleep between polls.
        timeout: Total seconds to keep polling before giving up.

    Returns:
        The job payload once its ``status`` is ``"succeeded"`` or ``"failed"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If a single request gets no response within 30 seconds.
        TimeoutError: If the job is still not terminal after ``timeout`` seconds.
    """
    deadline = time.monotonic() + timeout
    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            # requests applies no timeout by default.
            timeout=30,
        )
        # Report HTTP errors directly instead of failing later with a KeyError
        # on an error payload that has no "status" key.
        res.raise_for_status()
        task = res.json()
        if task["status"] in ("succeeded", "failed"):
            return task
        # Give up before sleeping past the deadline instead of polling without bound.
        if time.monotonic() + poll_interval > deadline:
            raise TimeoutError(
                f"Ittybit job {task_id} did not finish within {timeout} seconds"
            )
        time.sleep(poll_interval)
# Poll the jobs one after another, then keep the output URL of each job that
# succeeded; failed jobs contribute nothing.
job_ids = (job["id"] for job in tasks)
completed = list(map(wait_for_task, job_ids))
succeeded = filter(lambda job: job["status"] == "succeeded", completed)
frame_urls = [job["output"]["url"] for job in succeeded]
print(f"Got {len(frame_urls)} frames")
# Poll a task until complete
# Fetch the current state of one job.
curl --header "Authorization: Bearer $ITTYBIT_API_KEY" \
"https://api.ittybit.com/jobs/task_abc123"
# Repeat until status is "succeeded", then grab output.url
Send frames to GPT-4 Vision
Build a message with each frame as an image_url content part. Ask for structured JSON back.
import OpenAI from "openai";
const openai = new OpenAI();

// With no frames the model would have nothing to look at and could only
// invent a summary, so stop before spending a request.
if (frameUrls.length === 0) {
  throw new Error("No frames were extracted; nothing to summarize");
}

// One image_url content part per frame, each requested at "low" detail.
const imageMessages = frameUrls.map((url) => ({
  type: "image_url" as const,
  image_url: { url, detail: "low" as const },
}));

// NOTE(review): the user prompt lists every entry of `timestamps`, while
// `frameUrls` may be shorter if some jobs failed — confirm the two stay aligned.
// The template literal's continuation lines are left unindented so the prompt
// text sent to the model is unchanged.
const response = await openai.chat.completions.create({
  model: "gpt-4o",
  response_format: { type: "json_object" },
  messages: [
    {
      role: "system",
      content: `You are a video analyst. You will receive keyframes extracted at
regular intervals from a video. Return a JSON object with this structure:
{
"title": "A short descriptive title",
"tldr": "One-sentence summary",
"chapters": [
{ "timestamp": 0, "title": "Chapter title", "description": "What happens" }
]
}`,
    },
    {
      role: "user",
      content: [
        {
          type: "text",
          text: `These ${frameUrls.length} frames were extracted at timestamps: ${timestamps.join(", ")} seconds. Summarize the video.`,
        },
        ...imageMessages,
      ],
    },
  ],
});

// `content` can be null or missing (for example when there are no choices),
// so check it instead of asserting it away with `!`.
const content = response.choices[0]?.message.content;
if (!content) {
  throw new Error("The model returned no content to parse");
}
const summary = JSON.parse(content);
console.log(summary);
from openai import OpenAI
import json
client = OpenAI()

# With no frames the model would have nothing to look at and could only
# invent a summary, so stop before spending a request.
if not frame_urls:
    raise RuntimeError("No frames were extracted; nothing to summarize")

# One image_url content part per frame, each requested at "low" detail.
image_messages = [
    {"type": "image_url", "image_url": {"url": url, "detail": "low"}}
    for url in frame_urls
]

# NOTE(review): the user prompt lists every entry of `timestamps`, while
# `frame_urls` may be shorter if some jobs failed — confirm the two stay aligned.
response = client.chat.completions.create(
    model="gpt-4o",
    response_format={"type": "json_object"},
    messages=[
        {
            "role": "system",
            "content": (
                "You are a video analyst. You will receive keyframes extracted at "
                "regular intervals from a video. Return a JSON object with this structure:\n"
                '{\n "title": "A short descriptive title",\n'
                ' "tldr": "One-sentence summary",\n'
                ' "chapters": [\n'
                ' { "timestamp": 0, "title": "Chapter title", "description": "What happens" }\n'
                " ]\n}"
            ),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": (
                        f"These {len(frame_urls)} frames were extracted at timestamps: "
                        f"{', '.join(str(t) for t in timestamps)} seconds. Summarize the video."
                    ),
                },
                *image_messages,
            ],
        },
    ],
)

# `content` can be None; json.loads(None) would raise an unhelpful TypeError.
content = response.choices[0].message.content
if not content:
    raise RuntimeError("The model returned no content to parse")
summary = json.loads(content)
print(json.dumps(summary, indent=2))
curl https://api.openai.com/v1/chat/completions \
-H "Authorization: Bearer $OPENAI_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4o",
"response_format": { "type": "json_object" },
"messages": [
{
"role": "system",
"content": "You are a video analyst. Given keyframes from a video, return JSON: { \"title\": \"...\", \"tldr\": \"...\", \"chapters\": [{ \"timestamp\": 0, \"title\": \"...\", \"description\": \"...\" }] }"
},
{
"role": "user",
"content": [
{
"type": "text",
"text": "These 6 frames were extracted at timestamps: 0, 15, 45, 90, 150, 210 seconds. Summarize the video."
},
{ "type": "image_url", "image_url": { "url": "FRAME_URL_1", "detail": "low" } },
{ "type": "image_url", "image_url": { "url": "FRAME_URL_2", "detail": "low" } },
{ "type": "image_url", "image_url": { "url": "FRAME_URL_3", "detail": "low" } },
{ "type": "image_url", "image_url": { "url": "FRAME_URL_4", "detail": "low" } },
{ "type": "image_url", "image_url": { "url": "FRAME_URL_5", "detail": "low" } },
{ "type": "image_url", "image_url": { "url": "FRAME_URL_6", "detail": "low" } }
]
}
]
}'
Example output
GPT-4 Vision returns something like:
{
"title": "Product Demo: Dashboard Walkthrough",
"tldr": "A 4-minute walkthrough of the new analytics dashboard, covering setup, key metrics, and export features.",
"chapters": [
{
"timestamp": 0,
"title": "Introduction",
"description": "Presenter introduces the dashboard and its purpose"
},
{
"timestamp": 15,
"title": "Setup and configuration",
"description": "Connecting data sources and configuring the workspace"
},
{
"timestamp": 45,
"title": "Key metrics overview",
"description": "Walkthrough of the main KPI panels and chart types"
},
{
"timestamp": 90,
"title": "Filtering and drill-down",
"description": "Demonstrating date range filters and segment breakdowns"
},
{
"timestamp": 150,
"title": "Exporting reports",
"description": "Generating PDF and CSV exports from the dashboard"
},
{
"timestamp": 210,
"title": "Wrap-up",
"description": "Summary of features and next steps"
}
]
}
See also
- API
POST /jobs with kind: "image" — extract frames from video
- Generate video thumbnails — single-frame extraction
- AI-powered video clipping with OpenAI — GPT-4 function calling for trimming clips