Searchable video library with Gemini and Ittybit

View Markdown

You have a library of videos. Users want to find the one where “the engineer draws on the whiteboard” or “the CEO talks about Q3 revenue.” Ittybit handles ingestion — HLS streams for playback, thumbnails for previews. Gemini watches each video and writes dense scene descriptions. You store those descriptions as searchable text and query them with plain language.

Ingest the video with Ittybit

Create two tasks: one for adaptive HLS streaming, one for a thumbnail. Both run asynchronously.

/**
 * Reads a required environment variable.
 *
 * Fails fast with a descriptive error instead of letting `undefined` flow
 * into an `Authorization: Bearer undefined` header.
 *
 * @param name - Name of the environment variable to read.
 * @returns The variable's non-empty value.
 * @throws Error if the variable is unset or empty.
 */
function requireEnv(name: string): string {
  const value = process.env[name];
  if (!value) {
    throw new Error(`Missing required environment variable: ${name}`);
  }
  return value;
}

// API key sent as a Bearer token on every Ittybit request.
const ITTYBIT_API_KEY = requireEnv("ITTYBIT_API_KEY");
// Source video to process; replace with the URL of your own upload.
const videoUrl = "https://example.com/uploads/meeting-2026-04-01.mp4";

/**
 * Submits a processing job to Ittybit.
 *
 * Sends `body` as JSON in a POST to `https://api.ittybit.com/jobs`,
 * authenticated with the `ITTYBIT_API_KEY` Bearer token.
 *
 * @param body - Job definition; the calls in this guide pass `input` and `kind`.
 * @returns The parsed JSON response body (callers in this guide read `id` from it).
 * @throws Error if Ittybit responds with a non-2xx status.
 */
async function createTask(body: Record<string, unknown>) {
  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(body),
  });

  // Without this check an error payload would be returned as if it were a
  // job, and the caller would only notice later when `id` is undefined.
  if (!res.ok) {
    const detail = await res.text();
    throw new Error(
      `Ittybit task creation failed (${res.status} ${res.statusText}): ${detail}`,
    );
  }

  return res.json();
}

// Start both jobs at once; the two requests do not depend on each other.
// Order matters: the adaptive-video job comes first, the image job second.
const pendingTasks = ["adaptive_video", "image"].map((kind) =>
  createTask({ input: videoUrl, kind }),
);
const [hlsTask, thumbTask] = await Promise.all(pendingTasks);

// Log the id returned for each job.
console.log("HLS task:", hlsTask.id);
console.log("Thumbnail task:", thumbTask.id);

import os
import requests

# Ittybit API key, read from the environment; a missing variable raises KeyError here.
ITTYBIT_API_KEY = os.environ["ITTYBIT_API_KEY"]
# URL of the source video that the calls below send to Ittybit and Gemini.
video_url = "https://example.com/uploads/meeting-2026-04-01.mp4"

def create_task(body: dict) -> dict:
    """Submit a processing job to Ittybit and return the decoded response.

    Args:
        body: JSON-serializable job definition; the calls in this guide pass
            ``input`` and ``kind``.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: If Ittybit answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json=body,
        timeout=30,  # requests waits indefinitely unless a timeout is given
    )
    # Fail here rather than with a KeyError on a missing "id" further down.
    res.raise_for_status()
    return res.json()

# Submit one job per output kind: the adaptive-video job first, then the image job.
hls_task, thumb_task = [
    create_task({"input": video_url, "kind": kind})
    for kind in ("adaptive_video", "image")
]

# Print the id returned for each job.
for label, task in (("HLS task:", hls_task), ("Thumbnail task:", thumb_task)):
    print(label, task["id"])

Wait for processing

Poll until both tasks complete. In production, use webhooks instead.

/**
 * Polls an Ittybit job until it succeeds, fails, or the timeout elapses.
 *
 * Issues `GET https://api.ittybit.com/jobs/{taskId}` every `intervalMs`
 * milliseconds and inspects the `status` field of the JSON response.
 *
 * @param taskId - Id of the job to poll.
 * @param options - Optional polling settings.
 *   `intervalMs` is the delay between polls (default 2000).
 *   `timeoutMs` is the overall time budget (default 10 minutes).
 * @returns The job object once its status is `"succeeded"`.
 * @throws Error if a status request returns a non-2xx response, if the job
 *   reports `"failed"`, or if the timeout elapses first.
 */
async function waitForTask(
  taskId: string,
  {
    intervalMs = 2000,
    timeoutMs = 10 * 60 * 1000,
  }: { intervalMs?: number; timeoutMs?: number } = {},
): Promise<any> {
  const deadline = Date.now() + timeoutMs;

  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: { Authorization: `Bearer ${ITTYBIT_API_KEY}` },
    });
    if (!res.ok) {
      throw new Error(
        `Status check for task ${taskId} failed: ${res.status} ${res.statusText}`,
      );
    }
    const task = (await res.json()) as { status?: string; error?: unknown };

    if (task.status === "succeeded") return task;
    if (task.status === "failed") {
      // An object-valued error would otherwise render as "[object Object]".
      const reason =
        typeof task.error === "string" ? task.error : JSON.stringify(task.error);
      throw new Error(`Task failed: ${reason}`);
    }

    // Give up before sleeping if the next poll would land past the deadline,
    // so a job stuck in a non-terminal state cannot block the caller forever.
    if (Date.now() + intervalMs > deadline) {
      throw new Error(`Timed out after ${timeoutMs} ms waiting for task ${taskId}`);
    }
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
}

// Poll both jobs concurrently; this rejects as soon as either poll throws.
const [hls, thumb] = await Promise.all(
  [hlsTask, thumbTask].map((task) => waitForTask(task.id)),
);

// Output URLs read from each finished job.
const { output_url: hlsUrl } = hls; // e.g. https://cdn.ittybit.com/.../playlist.m3u8
const { output_url: thumbnailUrl } = thumb; // e.g. https://cdn.ittybit.com/.../thumb.jpg

import time

def wait_for_task(task_id: str, poll_interval: float = 2.0, timeout: float = 600.0) -> dict:
    """Poll an Ittybit job until it succeeds, fails, or the timeout elapses.

    Args:
        task_id: Id of the job to poll.
        poll_interval: Seconds to sleep between polls.
        timeout: Overall time budget in seconds.

    Returns:
        The job object once its ``status`` is ``"succeeded"``.

    Raises:
        requests.HTTPError: If a status request returns a 4xx/5xx response.
        RuntimeError: If the job reports ``"failed"``.
        TimeoutError: If the job is still unfinished when the budget runs out.
    """
    deadline = time.monotonic() + timeout

    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            timeout=30,  # requests waits indefinitely unless a timeout is given
        )
        res.raise_for_status()
        task = res.json()

        if task["status"] == "succeeded":
            return task
        if task["status"] == "failed":
            # .get() so a missing "error" key cannot mask the failure with a KeyError.
            raise RuntimeError(f"Task failed: {task.get('error')}")

        # Stop before sleeping if the next poll would land past the deadline,
        # so a job stuck in a non-terminal state cannot block forever.
        if time.monotonic() + poll_interval > deadline:
            raise TimeoutError(f"Timed out after {timeout}s waiting for task {task_id}")
        time.sleep(poll_interval)

# Block on each job in turn: the HLS job first, then the thumbnail job.
hls = wait_for_task(task_id=hls_task["id"])
thumb = wait_for_task(task_id=thumb_task["id"])

# Output URLs read from each finished job, e.g.
#   https://cdn.ittybit.com/.../playlist.m3u8
#   https://cdn.ittybit.com/.../thumb.jpg
hls_url, thumbnail_url = hls["output_url"], thumb["output_url"]

Generate scene descriptions with Gemini

Send the original video to Gemini and ask for a dense, timestamped description of every scene. This is the text that makes the video searchable.

import { GoogleGenerativeAI } from "@google/generative-ai";

// Gemini client and the model handle used for the description request.
const genai = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
const model = genai.getGenerativeModel({ model: "gemini-2.0-flash" });

// Prompt asking for timestamped, search-oriented scene descriptions as JSON.
const sceneDescriptionPrompt = `Describe every distinct scene in this video. For each scene, include:

- Approximate start and end timestamps (MM:SS)
- What is happening: people, actions, objects, text on screen
- Any spoken topics or key phrases you can identify
- The setting and visual context

Be thorough. These descriptions will be used for text search, so include specific nouns, verbs, and details a user might search for. Return the result as JSON:
[{ "start": "00:00", "end": "01:23", "description": "..." }, ...]`;

// NOTE(review): confirm that the Gemini API accepts an arbitrary HTTPS URL as
// `fileUri`; if it does not, the video has to be uploaded through the Files
// API first and the returned URI used here.
const videoPart = { fileData: { mimeType: "video/mp4", fileUri: videoUrl } };

// One request: the video reference followed by the instructions.
const result = await model.generateContent([
  videoPart,
  { text: sceneDescriptionPrompt },
]);

const text = result.response.text();

// The model may wrap its JSON in a Markdown code fence. Strip only a leading
// ```json (or bare ```) marker and a trailing ``` marker, so backticks that
// appear inside a description are left intact.
const jsonText = text
  .trim()
  .replace(/^```(?:json)?\s*/i, "")
  .replace(/\s*```$/, "");

// Reject anything that is not the requested array before it is iterated on.
const parsed: unknown = JSON.parse(jsonText);
if (!Array.isArray(parsed)) {
  throw new Error("Expected Gemini to return a JSON array of scenes");
}
const scenes: Array<{ start: string; end: string; description: string }> = parsed;

console.log(`${scenes.length} scenes described`);

import json
import re
import google.generativeai as genai

# Authenticate the SDK and select the model used for the description request.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel(model_name="gemini-2.0-flash")

# Prompt asking for timestamped, search-oriented scene descriptions as JSON.
scene_prompt = """Describe every distinct scene in this video. For each scene, include:
- Approximate start and end timestamps (MM:SS)
- What is happening: people, actions, objects, text on screen
- Any spoken topics or key phrases you can identify
- The setting and visual context

Be thorough. These descriptions will be used for text search, so include specific nouns, verbs, and details a user might search for. Return the result as JSON:
[{ "start": "00:00", "end": "01:23", "description": "..." }, ...]"""

# NOTE(review): confirm that the Gemini API accepts an arbitrary HTTPS URL as
# file_uri; if it does not, upload the video through the Files API first.
video_part = genai.protos.Part(
    file_data=genai.protos.FileData(mime_type="video/mp4", file_uri=video_url)
)

# One request: the video reference followed by the instructions.
response = model.generate_content([video_part, scene_prompt])

text = response.text

# Remove Markdown code-fence markers before decoding the JSON payload.
unfenced = re.sub(r"```json\n?|\n?```", "", text)
scenes = json.loads(unfenced)

print(f"{len(scenes)} scenes described")

Store descriptions with full-text search

Insert each scene description into PostgreSQL with a tsvector column for full-text search. The video’s HLS and thumbnail URLs go in too so search results link directly to playable media.

import pg from "pg";

// Shared connection pool, configured from DATABASE_URL.
const pool = new pg.Pool({ connectionString: process.env.DATABASE_URL });

// One row per scene. `search_vector` is a stored generated column derived
// from `description`, and the GIN index is built on that column.
const createSchemaSql = `  CREATE TABLE IF NOT EXISTS video_scenes (
    id SERIAL PRIMARY KEY,
    video_url TEXT NOT NULL,
    hls_url TEXT NOT NULL,
    thumbnail_url TEXT NOT NULL,
    start_time TEXT NOT NULL,
    end_time TEXT NOT NULL,
    description TEXT NOT NULL,
    search_vector TSVECTOR GENERATED ALWAYS AS (
      to_tsvector('english', description)
    ) STORED
  );
  CREATE INDEX IF NOT EXISTS idx_search_vector
    ON video_scenes USING GIN (search_vector);`;

// Create the table (run once)
await pool.query(createSchemaSql);

// Index each scene inside a single transaction so a failure part-way through
// does not leave the video half-indexed (all rows are committed, or none).
const insertSceneSql = `INSERT INTO video_scenes (video_url, hls_url, thumbnail_url, start_time, end_time, description)
     VALUES ($1, $2, $3, $4, $5, $6)`;

// A transaction must run on one dedicated client; separate pool.query()
// calls may each be served by a different connection.
const dbClient = await pool.connect();
try {
  await dbClient.query("BEGIN");
  for (const scene of scenes) {
    await dbClient.query(insertSceneSql, [
      videoUrl,
      hlsUrl,
      thumbnailUrl,
      scene.start,
      scene.end,
      scene.description,
    ]);
  }
  await dbClient.query("COMMIT");
} catch (err) {
  await dbClient.query("ROLLBACK");
  throw err;
} finally {
  // Always hand the connection back to the pool, on success or failure.
  dbClient.release();
}

console.log(`Indexed ${scenes.length} scenes`);

import psycopg2

# Open one connection and cursor, shared by the statements that follow.
conn = psycopg2.connect(os.environ["DATABASE_URL"])
cur = conn.cursor()

# One row per scene. search_vector is a stored generated column derived from
# description, and the GIN index is built on that column.
CREATE_SCHEMA_SQL = """
    CREATE TABLE IF NOT EXISTS video_scenes (
        id SERIAL PRIMARY KEY,
        video_url TEXT NOT NULL,
        hls_url TEXT NOT NULL,
        thumbnail_url TEXT NOT NULL,
        start_time TEXT NOT NULL,
        end_time TEXT NOT NULL,
        description TEXT NOT NULL,
        search_vector TSVECTOR GENERATED ALWAYS AS (
            to_tsvector('english', description)
        ) STORED
    );
    CREATE INDEX IF NOT EXISTS idx_search_vector
        ON video_scenes USING GIN (search_vector);
"""

# Create the table (run once)
cur.execute(CREATE_SCHEMA_SQL)

# Parameterized insert: one row per scene, values bound by the driver.
INSERT_SCENE_SQL = """INSERT INTO video_scenes (video_url, hls_url, thumbnail_url, start_time, end_time, description)
           VALUES (%s, %s, %s, %s, %s, %s)"""

# Index each scene; every row repeats the video's URLs next to the scene's
# own time range and description.
for row in scenes:
    params = (
        video_url,
        hls_url,
        thumbnail_url,
        row["start"],
        row["end"],
        row["description"],
    )
    cur.execute(INSERT_SCENE_SQL, params)

# Single commit after the loop.
conn.commit()
print(f"Indexed {len(scenes)} scenes")

Search the library

Users type a natural language query. PostgreSQL’s websearch_to_tsquery parses and stems the query text, and ts_rank orders the matching scenes by relevance. Results come back with the HLS URL for playback and the timestamp for seeking.

/**
 * Full-text search over the indexed scene descriptions.
 *
 * The query text is turned into a tsquery with `websearch_to_tsquery`,
 * matched against `search_vector`, and the hits are ordered by `ts_rank`.
 *
 * @param query - Free-text search terms entered by the user.
 * @param limit - Maximum number of rows to return (default 10).
 * @returns Matching rows, best rank first.
 */
async function searchVideos(query: string, limit = 10) {
  // $1 is the search text (used for both ranking and filtering), $2 the row cap.
  const sql = `SELECT
       video_url,
       hls_url,
       thumbnail_url,
       start_time,
       end_time,
       description,
       ts_rank(search_vector, websearch_to_tsquery('english', $1)) AS rank
     FROM video_scenes
     WHERE search_vector @@ websearch_to_tsquery('english', $1)
     ORDER BY rank DESC
     LIMIT $2`;

  const result = await pool.query(sql, [query, limit]);
  return result.rows;
}

// Example: find the whiteboard scene
const results = await searchVideos("engineer draws on whiteboard");

// Print each hit's time range and description, then its playback and thumbnail URLs.
results.forEach(({ start_time, end_time, description, hls_url, thumbnail_url }) => {
  console.log(`[${start_time} - ${end_time}] ${description}`);
  console.log(`  Play: ${hls_url}`);
  console.log(`  Thumbnail: ${thumbnail_url}`);
});