AI-powered video clipping with OpenAI

View Markdown

“Cliphound” (a repurposing tool for content creators) takes a long-form video, sends the transcript to GPT-4, and lets the model decide which segments make the best social clips. GPT-4 requests each clip through function calling, and Cliphound’s code forwards every request to Ittybit’s API, which trims and transcodes the clip automatically.

Define the tool

Give GPT-4 a create_video_clip function that maps directly to Ittybit’s POST /jobs endpoint:

/** Builds the JSON Schema entry for one numeric argument of the tool. */
const numericArg = (description: string) => ({ type: "number", description });

/** JSON Schema for the arguments of create_video_clip; only start and end are required. */
const clipParameters = {
  type: "object",
  properties: {
    start: numericArg("Start time in seconds"),
    end: numericArg("End time in seconds"),
    format: {
      type: "string",
      enum: ["mp4", "webm"],
      description: "Output format",
    },
    width: numericArg("Output width in pixels"),
    height: numericArg("Output height in pixels"),
  },
  required: ["start", "end"],
};

/** Tools offered to the model: a single function, create_video_clip. */
const tools = [
  {
    type: "function" as const,
    function: {
      name: "create_video_clip",
      description:
        "Trim a segment from a video and transcode it for social media",
      parameters: clipParameters,
    },
  },
];

def _numeric_arg(description):
    """Return the JSON Schema entry for one numeric argument of the tool."""
    return {"type": "number", "description": description}


# JSON Schema for the arguments of create_video_clip; only start and end are required.
_clip_parameters = {
    "type": "object",
    "properties": {
        "start": _numeric_arg("Start time in seconds"),
        "end": _numeric_arg("End time in seconds"),
        "format": {
            "type": "string",
            "enum": ["mp4", "webm"],
            "description": "Output format",
        },
        "width": _numeric_arg("Output width in pixels"),
        "height": _numeric_arg("Output height in pixels"),
    },
    "required": ["start", "end"],
}

# Tools offered to the model: a single function, create_video_clip.
tools = [
    {
        "type": "function",
        "function": {
            "name": "create_video_clip",
            "description": "Trim a segment from a video and transcode it for social media",
            "parameters": _clip_parameters,
        },
    }
]

# The tool definition is part of the OpenAI chat completion request body.
# See the TypeScript or Python tabs for the full structure.
# The next sections show how to wire it up end-to-end.

Ask GPT-4 to pick the clips

Send the video transcript (or a description) along with the tool definition. GPT-4 analyzes the content and decides where to cut.

import OpenAI from "openai";

// Client used for the chat completion request below.
const openai = new OpenAI();

// Source video that the clips are cut from.
const VIDEO_URL = "https://cliphound-app.com/uploads/podcast-ep42.mp4";

// Timestamped outline of the episode, one entry per line.
const transcript = [
  "[0:00] Intro and sponsor read",
  "[2:15] Guest arrives, small talk",
  '[4:30] "The moment I knew the startup was going to fail was when..."',
  "[7:45] Detailed breakdown of what went wrong with fundraising",
  "[12:00] Lessons learned about co-founder dynamics",
  "[15:30] Lightning round questions",
  "[18:00] Outro",
].join("\n");

// Instructions for the model: how many clips, how long, and the target format.
const systemPrompt = [
  "You are a video editor. Given a transcript, identify the 2-3 most",
  "engaging segments for short-form social clips (15-60 seconds each).",
  "Call create_video_clip for each one. Target 1080x1920 vertical mp4.",
].join("\n");

// The user turn carries the video URL followed by the transcript.
const userPrompt = "Here's the transcript for " + VIDEO_URL + ":\n" + transcript;

// Ask the model to choose clips; it answers with create_video_clip tool calls.
const response = await openai.chat.completions.create({
  model: "gpt-4o",
  tools,
  messages: [
    { role: "system", content: systemPrompt },
    { role: "user", content: userPrompt },
  ],
});

from openai import OpenAI

# Client used for the chat completion request below.
client = OpenAI()

# Source video that the clips are cut from.
VIDEO_URL = "https://cliphound-app.com/uploads/podcast-ep42.mp4"

# Timestamped outline of the episode. The empty first and last entries keep the
# leading and trailing newline around the outline.
transcript = "\n".join(
    [
        "",
        "[0:00] Intro and sponsor read",
        "[2:15] Guest arrives, small talk",
        '[4:30] "The moment I knew the startup was going to fail was when..."',
        "[7:45] Detailed breakdown of what went wrong with fundraising",
        "[12:00] Lessons learned about co-founder dynamics",
        "[15:30] Lightning round questions",
        "[18:00] Outro",
        "",
    ]
)

# Instructions for the model: how many clips, how long, and the target format.
SYSTEM_PROMPT = " ".join(
    [
        "You are a video editor.",
        "Given a transcript, identify the 2-3 most engaging segments for short-form social clips (15-60 seconds each).",
        "Call create_video_clip for each one.",
        "Target 1080x1920 vertical mp4.",
    ]
)

# The user turn carries the video URL followed by the transcript.
user_prompt = "Here's the transcript for {}:\n{}".format(VIDEO_URL, transcript)

# Ask the model to choose clips; it answers with create_video_clip tool calls.
response = client.chat.completions.create(
    model="gpt-4o",
    tools=tools,
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ],
)

# Send the chat completion request with the create_video_clip tool definition
# inlined in the JSON body. $OPENAI_API_KEY is expanded by the shell inside the
# double-quoted header; the single-quoted -d payload is passed through as-is.
# The "messages" array holds the system instructions and the transcript.
curl https://api.openai.com/v1/chat/completions \
  -H "Authorization: Bearer $OPENAI_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4o",
    "tools": [{
      "type": "function",
      "function": {
        "name": "create_video_clip",
        "description": "Trim a segment from a video and transcode it for social media",
        "parameters": {
          "type": "object",
          "properties": {
            "start": { "type": "number", "description": "Start time in seconds" },
            "end": { "type": "number", "description": "End time in seconds" },
            "format": { "type": "string", "enum": ["mp4", "webm"] },
            "width": { "type": "number" },
            "height": { "type": "number" }
          },
          "required": ["start", "end"]
        }
      }
    }],
    "messages": [
      {
        "role": "system",
        "content": "You are a video editor. Identify the 2-3 most engaging segments for short-form social clips (15-60s each). Call create_video_clip for each. Target 1080x1920 vertical mp4."
      },
      {
        "role": "user",
        "content": "Here is the transcript:\n[0:00] Intro and sponsor read\n[2:15] Guest arrives\n[4:30] The moment I knew the startup was going to fail...\n[7:45] Fundraising breakdown\n[12:00] Co-founder dynamics\n[15:30] Lightning round\n[18:00] Outro"
      }
    ]
  }'

GPT-4 will typically return multiple tool calls — one per clip it wants to create.

Handle the function calls

Loop through the tool calls and send each one to Ittybit as a task:

/**
 * Submit one clip request to Ittybit.
 *
 * Builds a "video" job for VIDEO_URL that trims to [start, end] and encodes
 * the result as h264 at "high" quality. Format, width and height fall back to
 * mp4 and 1080x1920 when the model did not supply them.
 *
 * @param args - Arguments from a create_video_clip tool call.
 * @returns The parsed JSON body of Ittybit's response.
 * @throws Error when Ittybit answers with a non-2xx status, so an error body
 *   is never mistaken for a created task.
 */
async function createIttybitTask(args: {
  start: number;
  end: number;
  format?: string;
  width?: number;
  height?: number;
}) {
  const task = {
    kind: "video",
    input: VIDEO_URL,
    options: {
      start: args.start,
      end: args.end,
      format: args.format ?? "mp4",
      codec: "h264",
      width: args.width ?? 1080,
      height: args.height ?? 1920,
      quality: "high",
    },
  };

  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(task),
  });

  // Callers read `.id` from the result to poll the task later; surface a
  // rejected request here instead of handing them an error payload.
  if (!res.ok) {
    throw new Error(
      `Ittybit rejected clip ${args.start}s-${args.end}s: ${res.status} ${await res.text()}`
    );
  }

  return res.json();
}

/** Arguments the model supplies when it calls create_video_clip. */
type ClipArgs = {
  start: number;
  end: number;
  format?: string;
  width?: number;
  height?: number;
};

/**
 * Parse and sanity-check the JSON arguments of one create_video_clip call.
 *
 * @param raw - The tool call's `arguments` string as produced by the model.
 * @returns The parsed arguments, or undefined when the payload is not valid
 *   JSON or does not describe a forward time range (finite numbers with
 *   0 <= start < end).
 */
function parseClipArgs(raw: string): ClipArgs | undefined {
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return undefined;
  }
  if (typeof parsed !== "object" || parsed === null) return undefined;

  const { start, end } = parsed as Record<string, unknown>;
  if (typeof start !== "number" || typeof end !== "number") return undefined;
  if (!Number.isFinite(start) || !Number.isFinite(end)) return undefined;
  if (start < 0 || end <= start) return undefined;

  return { ...(parsed as ClipArgs), start, end };
}

// Process each tool call from GPT-4.
// Model output is untrusted: it can name a tool that was never defined, emit
// malformed JSON, or return an impossible time range. Such calls are skipped
// with a warning rather than being submitted as transcode jobs.
const toolCalls = response.choices[0]?.message.tool_calls ?? [];
const tasks = await Promise.all(
  toolCalls.flatMap((call) => {
    if (call.type !== "function" || call.function.name !== "create_video_clip") {
      return [];
    }

    const args = parseClipArgs(call.function.arguments);
    if (!args) {
      console.warn(
        `Skipping tool call ${call.id}: invalid arguments ${call.function.arguments}`
      );
      return [];
    }

    return [createIttybitTask(args)];
  })
);

console.log(`Created ${tasks.length} clip tasks`);
// [{ id: "task_abc123", status: "queued", ... }, ...]

import json
import os
import requests

ITTYBIT_API_KEY = os.environ["ITTYBIT_API_KEY"]

# Seconds to wait for Ittybit to answer. requests applies no timeout unless one
# is passed, so without this a stalled connection would block indefinitely.
ITTYBIT_TIMEOUT_SECONDS = 30


def create_ittybit_task(args: dict) -> dict:
    """Submit one clip request to Ittybit and return the parsed JSON response.

    Builds a "video" job for VIDEO_URL that trims to [start, end] and encodes
    the result as h264 at "high" quality. Format, width and height fall back to
    mp4 and 1080x1920 when the model did not supply them.

    Args:
        args: Arguments from a create_video_clip tool call. "start" and "end"
            are required; "format", "width" and "height" are optional.

    Returns:
        The parsed JSON body of Ittybit's response.

    Raises:
        KeyError: If "start" or "end" is missing from ``args``.
        requests.HTTPError: If Ittybit answers with a 4xx/5xx status.
        requests.Timeout: If Ittybit does not answer in time.
    """
    task = {
        "kind": "video",
        "input": VIDEO_URL,
        "options": {
            "start": args["start"],
            "end": args["end"],
            "format": args.get("format", "mp4"),
            "codec": "h264",
            "width": args.get("width", 1080),
            "height": args.get("height", 1920),
            "quality": "high",
        },
    }

    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json=task,
        timeout=ITTYBIT_TIMEOUT_SECONDS,
    )
    # Callers read "id" from the result to poll the task later; surface a
    # rejected request here instead of handing them an error payload.
    res.raise_for_status()
    return res.json()


# Process each tool call from GPT-4.
# Model output is untrusted: it can name a tool that was never defined, emit
# malformed JSON, or return an impossible time range. Such calls are skipped
# with a message rather than being submitted as transcode jobs.
tool_calls = response.choices[0].message.tool_calls or []
tasks = []
for call in tool_calls:
    if call.type != "function" or call.function.name != "create_video_clip":
        continue

    try:
        clip_args = json.loads(call.function.arguments)
    except json.JSONDecodeError:
        print(f"Skipping tool call {call.id}: arguments are not valid JSON")
        continue

    start = clip_args.get("start") if isinstance(clip_args, dict) else None
    end = clip_args.get("end") if isinstance(clip_args, dict) else None
    is_forward_range = (
        isinstance(start, (int, float))
        and isinstance(end, (int, float))
        and 0 <= start < end
    )
    if not is_forward_range:
        print(f"Skipping tool call {call.id}: invalid start/end {start!r}-{end!r}")
        continue

    tasks.append(create_ittybit_task(clip_args))

print(f"Created {len(tasks)} clip tasks")

# For each clip GPT-4 suggests, POST to Ittybit.
# Example: GPT-4 picked the segment from 4:30 to 5:30 (270s-330s), a 60-second
# clip that stays within the 15-60 second target given in the system prompt.

curl -X POST https://api.ittybit.com/jobs \
  -H "Authorization: Bearer $ITTYBIT_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "kind": "video",
    "input": "https://cliphound-app.com/uploads/podcast-ep42.mp4",
    "options": {
      "start": 270,
      "end": 330,
      "format": "mp4",
      "codec": "h264",
      "width": 1080,
      "height": 1920,
      "quality": "high"
    }
  }'

# Response:
# {
#   "id": "task_abc123",
#   "object": "task",
#   "kind": "video",
#   "status": "queued",
#   "input": "https://cliphound-app.com/uploads/podcast-ep42.mp4",
#   "options": { "start": 270, "end": 330, ... },
#   "created_at": 1712000000000
# }

Poll for completion

Each task moves through queued -> processing -> succeeded, or ends in failed if the clip cannot be produced. Poll until every task has reached one of those two final states:

/**
 * Poll an Ittybit task until it reaches a final state.
 *
 * @param taskId - ID of the task to poll.
 * @param options - Optional tuning. `intervalMs` is the pause between polls
 *   (default 2000). `timeoutMs` is the total time to keep polling before
 *   giving up (default 10 minutes).
 * @returns The task object once its status is "succeeded" or "failed".
 * @throws Error when Ittybit answers with a non-2xx status, or when the task
 *   has not reached a final state before the timeout elapses.
 */
async function waitForTask(
  taskId: string,
  {
    intervalMs = 2000,
    timeoutMs = 10 * 60 * 1000,
  }: { intervalMs?: number; timeoutMs?: number } = {}
): Promise<any> {
  const deadline = Date.now() + timeoutMs;

  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      },
    });

    // An error response (bad key, unknown task ID) is not expected to carry a
    // final task status, so polling it again would never end. Fail fast.
    if (!res.ok) {
      throw new Error(
        `Ittybit returned ${res.status} for task ${taskId}: ${await res.text()}`
      );
    }

    const task = await res.json();
    if (task.status === "succeeded" || task.status === "failed") {
      return task;
    }

    // Bound the total wait so a task stuck in a non-final state cannot keep
    // the caller polling forever.
    if (Date.now() >= deadline) {
      throw new Error(
        `Timed out after ${timeoutMs}ms waiting for task ${taskId} (last status: ${task.status})`
      );
    }

    await new Promise((r) => setTimeout(r, intervalMs));
  }
}

// Start polling every clip at once, then wait for all of them to finish.
const pending = tasks.map(({ id }) => waitForTask(id));
const completed = await Promise.all(pending);

// Report the final status of each clip.
completed.forEach(({ id, status }) => {
  console.log(`${id}: ${status}`);
});

import time


def wait_for_task(task_id: str, poll_interval: float = 2, max_wait: float = 600) -> dict:
    """Poll an Ittybit task until it reaches a final state.

    Args:
        task_id: ID of the task to poll.
        poll_interval: Seconds to sleep between polls.
        max_wait: Total seconds to keep polling before giving up.

    Returns:
        The task object once its status is "succeeded" or "failed".

    Raises:
        requests.HTTPError: If Ittybit answers with a 4xx/5xx status.
        requests.Timeout: If a single request does not complete in time.
        TimeoutError: If the task has not reached a final state within
            ``max_wait`` seconds.
    """
    deadline = time.monotonic() + max_wait

    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            timeout=30,  # requests applies no timeout unless one is passed
        )
        # An error response (bad key, unknown task ID) is not expected to carry
        # a final task status, so polling it again would never end. Fail fast.
        res.raise_for_status()
        task = res.json()

        if task["status"] in ("succeeded", "failed"):
            return task

        # Bound the total wait so a task stuck in a non-final state cannot keep
        # the caller polling forever.
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out after {max_wait}s waiting for task {task_id} "
                f"(last status: {task['status']})"
            )

        time.sleep(poll_interval)


# Wait for the clips one after another, in the order the tasks were created.
completed = []
for pending in tasks:
    completed.append(wait_for_task(pending["id"]))

# Report the final status of each clip.
for finished in completed:
    print(f"{finished['id']}: {finished['status']}")

# Fetch the current state of task task_abc123. Each run is a single status
# check, not a loop, so polling means running the command again.
curl https://api.ittybit.com/jobs/task_abc123 \
  -H "Authorization: Bearer $ITTYBIT_API_KEY"

# Repeat until the "status" field of the response is "succeeded" or "failed"

AI-powered video clipping with OpenAI

Define the tool

Ask GPT-4 to pick the clips

Handle the function calls

Poll for completion

See also