Summarize video with GPT-4 Vision and Ittybit

View Markdown

Ittybit’s image task extracts a frame from any video at a given timestamp. Pull a handful of keyframes, pass them to GPT-4 Vision (the examples below use the gpt-4o model) as image_url content parts in a single message, and you get back a structured summary with chapter titles, scene descriptions, and a TL;DW — no need to send the full video anywhere.

Extract keyframes

Create one image task per timestamp you want to capture. Each task pulls a single frame from the video.

// Source video that the keyframes are extracted from.
const VIDEO_URL = "https://example.com/uploads/product-demo.mp4";

// Offsets into the video, in seconds; one image task is created per entry.
const timestamps = [0, 15, 45, 90, 150, 210];

/**
 * Creates an Ittybit image job that captures a single 1280x720 JPEG frame
 * from VIDEO_URL at the given offset.
 *
 * @param start - Offset into the video, in seconds, of the frame to capture.
 * @returns The parsed JSON body of the job-creation response.
 * @throws Error if the API answers with a non-2xx status, so that an error
 *   payload (bad API key, invalid input, ...) is never mistaken for a task.
 */
async function extractFrame(start: number) {
  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      kind: "image",
      input: VIDEO_URL,
      options: {
        start,
        width: 1280,
        height: 720,
        format: "jpg",
      },
    }),
  });

  if (!res.ok) {
    // Include the response body when it can be read; it usually explains the rejection.
    const detail = await res.text().catch(() => "");
    throw new Error(
      `Ittybit job creation failed for start=${start}: ${res.status} ${res.statusText} ${detail}`.trim(),
    );
  }

  return res.json();
}

// Fire every frame request concurrently; results keep the order of `timestamps`.
const tasks = await Promise.all(
  timestamps.map((seconds) => extractFrame(seconds)),
);
console.log(`Created ${tasks.length} frame extraction tasks`);

import os
import requests

# Source video that the keyframes are extracted from.
VIDEO_URL = "https://example.com/uploads/product-demo.mp4"
# Read eagerly so a missing variable fails here (KeyError) rather than mid-run.
ITTYBIT_API_KEY = os.environ["ITTYBIT_API_KEY"]

# Offsets into the video, in seconds; one image task is created per entry.
timestamps = [0, 15, 45, 90, 150, 210]


def extract_frame(start: int) -> dict:
    """Create an Ittybit image job that captures one 1280x720 JPEG frame.

    Args:
        start: Offset into VIDEO_URL, in seconds, of the frame to capture.

    Returns:
        The parsed JSON body of the job-creation response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status, so an
            error payload is never mistaken for a created task.
        requests.Timeout: If the API does not respond within 30 seconds.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json={
            "kind": "image",
            "input": VIDEO_URL,
            "options": {
                "start": start,
                "width": 1280,
                "height": 720,
                "format": "jpg",
            },
        },
        # requests waits forever by default; bound the call so the script cannot hang.
        timeout=30,
    )
    res.raise_for_status()
    return res.json()


# Request one frame per timestamp, one after another, keeping the order of `timestamps`.
tasks = list(map(extract_frame, timestamps))
print(f"Created {len(tasks)} frame extraction tasks")

# Create an image job that captures a 1280x720 JPEG frame at the 45-second mark.
# Expects ITTYBIT_API_KEY to be set in the shell environment.
curl -X POST https://api.ittybit.com/jobs \
  -H "Authorization: Bearer $ITTYBIT_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "kind": "image",
    "input": "https://example.com/uploads/product-demo.mp4",
    "options": {
      "start": 45,
      "width": 1280,
      "height": 720,
      "format": "jpg"
    }
  }'

# Repeat for each remaining timestamp (0, 15, 90, 150, 210), changing "start" each time.

Poll until frames are ready

Poll each task until it reaches a final status (succeeded or failed), then collect the output URLs of the tasks that succeeded.

/**
 * Polls an Ittybit job until it reaches a final status.
 *
 * @param taskId - The `id` of the job to poll.
 * @param options - Optional polling controls.
 *   `intervalMs` is the pause between polls (default 2000 ms);
 *   `timeoutMs` is the total time to keep polling before giving up
 *   (default 5 minutes).
 * @returns The last task object fetched, whose `status` is either
 *   "succeeded" or "failed". The shape is whatever the API returns, hence `any`.
 * @throws Error if `taskId` is empty, if the API answers with a non-2xx
 *   status, or if the job is still not final when the timeout elapses.
 */
async function waitForTask(
  taskId: string,
  {
    intervalMs = 2000,
    timeoutMs = 5 * 60 * 1000,
  }: { intervalMs?: number; timeoutMs?: number } = {},
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- raw API payload
): Promise<any> {
  // An undefined id would otherwise be polled as ".../jobs/undefined" forever.
  if (!taskId) {
    throw new Error("waitForTask: taskId is required");
  }

  const deadline = Date.now() + timeoutMs;

  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      },
    });

    // An error body has no final status, so without this check the loop never ends.
    if (!res.ok) {
      throw new Error(
        `Ittybit task ${taskId} lookup failed: ${res.status} ${res.statusText}`,
      );
    }

    const task = await res.json();

    if (task.status === "succeeded" || task.status === "failed") {
      return task;
    }

    // Give up before sleeping if the next poll would land past the deadline.
    if (Date.now() + intervalMs > deadline) {
      throw new Error(
        `Timed out after ${timeoutMs} ms waiting for Ittybit task ${taskId} (last status: ${task.status})`,
      );
    }

    await new Promise((r) => setTimeout(r, intervalMs));
  }
}

// Poll all tasks concurrently; `completed` keeps the same order as `tasks`.
const completed = await Promise.all(tasks.map(({ id }) => waitForTask(id)));

// Failed tasks have no frame to offer, so only succeeded ones contribute a URL.
const frameUrls = completed
  .filter(({ status }) => status === "succeeded")
  .map(({ output }) => output.url);

console.log(`Got ${frameUrls.length} frames`);

import time


def wait_for_task(task_id: str, timeout: float = 300.0, interval: float = 2.0) -> dict:
    """Poll an Ittybit job until it reaches a final status.

    Args:
        task_id: The ``id`` of the job to poll.
        timeout: Total number of seconds to keep polling before giving up.
        interval: Seconds to sleep between polls.

    Returns:
        The last task payload fetched, whose ``status`` is either
        ``"succeeded"`` or ``"failed"``.

    Raises:
        ValueError: If ``task_id`` is empty.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        TimeoutError: If the job is still not final when ``timeout`` elapses.
    """
    # A missing id would otherwise be polled as ".../jobs/None" until the timeout.
    if not task_id:
        raise ValueError("wait_for_task: task_id is required")

    deadline = time.monotonic() + timeout

    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            # requests waits forever by default; bound each individual poll.
            timeout=30,
        )
        # An error body carries no final status, so fail fast instead of looping.
        res.raise_for_status()
        task = res.json()

        if task["status"] in ("succeeded", "failed"):
            return task

        # Give up before sleeping if the next poll would land past the deadline.
        if time.monotonic() + interval > deadline:
            raise TimeoutError(
                f"Timed out after {timeout}s waiting for Ittybit task {task_id} "
                f"(last status: {task['status']})"
            )

        time.sleep(interval)


# Poll the tasks one after another; `completed` keeps the same order as `tasks`.
completed = [wait_for_task(job["id"]) for job in tasks]

# Failed tasks have no frame to offer, so only succeeded ones contribute a URL.
frame_urls = [
    job["output"]["url"] for job in completed if job["status"] == "succeeded"
]

print(f"Got {len(frame_urls)} frames")

# Fetch the current state of one job. "task_abc123" is a placeholder for the
# id of the job you created; expects ITTYBIT_API_KEY in the shell environment.
curl https://api.ittybit.com/jobs/task_abc123 \
  -H "Authorization: Bearer $ITTYBIT_API_KEY"

# Repeat until status is "succeeded", then grab output.url.
# A status of "failed" is also final, so stop polling if you see it.

Send frames to GPT-4 Vision

Build a message with each frame as an image_url content part. Ask for structured JSON back.

import OpenAI from "openai";

// The client reads its API key from the environment (OPENAI_API_KEY).
const openai = new OpenAI();

// Without any frames there is nothing for the model to look at.
if (frameUrls.length === 0) {
  throw new Error("No frames were extracted; cannot summarize the video");
}

// frameUrls only holds frames whose task succeeded, so list only the matching
// timestamps. `completed` is in the same order as `timestamps`, which lets the
// index pair them up; otherwise a failed frame would shift every later
// timestamp onto the wrong image.
const frameTimestamps = timestamps.filter(
  (_, i) => completed[i]?.status === "succeeded",
);

// One image_url content part per frame; "low" detail keeps token usage down.
const imageMessages = frameUrls.map((url) => ({
  type: "image_url" as const,
  image_url: { url, detail: "low" as const },
}));

const response = await openai.chat.completions.create({
  model: "gpt-4o",
  // JSON mode: the reply is a single JSON object that can be parsed directly.
  response_format: { type: "json_object" },
  messages: [
    {
      role: "system",
      content: `You are a video analyst. You will receive keyframes extracted at
regular intervals from a video. Return a JSON object with this structure:
{
  "title": "A short descriptive title",
  "tldr": "One-sentence summary",
  "chapters": [
    { "timestamp": 0, "title": "Chapter title", "description": "What happens" }
  ]
}`,
    },
    {
      role: "user",
      content: [
        {
          type: "text",
          text: `These ${frameUrls.length} frames were extracted at timestamps: ${frameTimestamps.join(", ")} seconds. Summarize the video.`,
        },
        ...imageMessages,
      ],
    },
  ],
});

// The message content can be null (for example on a refusal); fail with a
// clear error instead of letting JSON.parse throw on a non-string.
const content = response.choices[0]?.message.content;
if (!content) {
  throw new Error("OpenAI returned an empty response; no summary to parse");
}

const summary = JSON.parse(content);
console.log(summary);

from openai import OpenAI
import json

# The client reads its API key from the environment (OPENAI_API_KEY).
client = OpenAI()

# Without any frames there is nothing for the model to look at.
if not frame_urls:
    raise RuntimeError("No frames were extracted; cannot summarize the video")

# frame_urls only holds frames whose task succeeded, so list only the matching
# timestamps. `completed` is in the same order as `timestamps`; without this a
# failed frame would shift every later timestamp onto the wrong image.
frame_timestamps = [
    ts for ts, task in zip(timestamps, completed) if task["status"] == "succeeded"
]

# One image_url content part per frame; "low" detail keeps token usage down.
image_messages = [
    {"type": "image_url", "image_url": {"url": url, "detail": "low"}}
    for url in frame_urls
]

response = client.chat.completions.create(
    model="gpt-4o",
    # JSON mode: the reply is a single JSON object that can be parsed directly.
    response_format={"type": "json_object"},
    messages=[
        {
            "role": "system",
            "content": (
                "You are a video analyst. You will receive keyframes extracted at "
                "regular intervals from a video. Return a JSON object with this structure:\n"
                '{\n  "title": "A short descriptive title",\n'
                '  "tldr": "One-sentence summary",\n'
                '  "chapters": [\n'
                '    { "timestamp": 0, "title": "Chapter title", "description": "What happens" }\n'
                "  ]\n}"
            ),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": (
                        f"These {len(frame_urls)} frames were extracted at timestamps: "
                        f"{', '.join(str(t) for t in frame_timestamps)} seconds. Summarize the video."
                    ),
                },
                *image_messages,
            ],
        },
    ],
)

# The message content can be None (for example on a refusal); fail with a clear
# error instead of letting json.loads raise a TypeError.
content = response.choices[0].message.content
if not content:
    raise RuntimeError("OpenAI returned an empty response; no summary to parse")

summary = json.loads(content)
print(json.dumps(summary, indent=2))

# Send the six frames to the Chat Completions API and ask for a JSON summary.
# FRAME_URL_1 ... FRAME_URL_6 are placeholders: replace them with the output.url
# of each finished frame task, in the same order as the timestamps in the text
# part. Expects OPENAI_API_KEY to be set in the shell environment.
curl https://api.openai.com/v1/chat/completions \
  -H "Authorization: Bearer $OPENAI_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4o",
    "response_format": { "type": "json_object" },
    "messages": [
      {
        "role": "system",
        "content": "You are a video analyst. Given keyframes from a video, return JSON: { \"title\": \"...\", \"tldr\": \"...\", \"chapters\": [{ \"timestamp\": 0, \"title\": \"...\", \"description\": \"...\" }] }"
      },
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": "These 6 frames were extracted at timestamps: 0, 15, 45, 90, 150, 210 seconds. Summarize the video."
          },
          { "type": "image_url", "image_url": { "url": "FRAME_URL_1", "detail": "low" } },
          { "type": "image_url", "image_url": { "url": "FRAME_URL_2", "detail": "low" } },
          { "type": "image_url", "image_url": { "url": "FRAME_URL_3", "detail": "low" } },
          { "type": "image_url", "image_url": { "url": "FRAME_URL_4", "detail": "low" } },
          { "type": "image_url", "image_url": { "url": "FRAME_URL_5", "detail": "low" } },
          { "type": "image_url", "image_url": { "url": "FRAME_URL_6", "detail": "low" } }
        ]
      }
    ]
  }'

Example output

GPT-4 Vision returns something like:

{
  "title": "Product Demo: Dashboard Walkthrough",
  "tldr": "A 4-minute walkthrough of the new analytics dashboard, covering setup, key metrics, and export features.",
  "chapters": [
    {
      "timestamp": 0,
      "title": "Introduction",
      "description": "Presenter introduces the dashboard and its purpose"
    },
    {
      "timestamp": 15,
      "title": "Setup and configuration",
      "description": "Connecting data sources and configuring the workspace"
    },
    {
      "timestamp": 45,
      "title": "Key metrics overview",
      "description": "Walkthrough of the main KPI panels and chart types"
    },
    {
      "timestamp": 90,
      "title": "Filtering and drill-down",
      "description": "Demonstrating date range filters and segment breakdowns"
    },
    {
      "timestamp": 150,
      "title": "Exporting reports",
      "description": "Generating PDF and CSV exports from the dashboard"
    },
    { "timestamp": 210, "title": "Wrap-up", "description": "Summary of features and next steps" }
  ]
}

Summarize video with GPT-4 Vision and Ittybit

Extract keyframes

Poll until frames are ready

Send frames to GPT-4 Vision

Example output

See also