Searchable video library with Gemini and Ittybit
You have a library of videos. Users want to find the one where “the engineer draws on the whiteboard” or “the CEO talks about Q3 revenue.” Ittybit handles ingestion — HLS streams for playback, thumbnails for previews. Gemini watches each video and writes dense scene descriptions. You store those descriptions as searchable text and query them with plain language.
Ingest the video with Ittybit
Create two tasks: one for adaptive HLS streaming, one for a thumbnail. Both run asynchronously.
// Ittybit API key, read from the environment. Fail fast when it is missing:
// otherwise every request would be sent with the header "Bearer undefined".
const ITTYBIT_API_KEY = process.env.ITTYBIT_API_KEY ?? "";
if (!ITTYBIT_API_KEY) {
  throw new Error("ITTYBIT_API_KEY environment variable is not set");
}
// Source video URL passed to Ittybit for processing and to Gemini for description.
const videoUrl = "https://example.com/uploads/meeting-2026-04-01.mp4";
/**
 * Submits a processing job to Ittybit.
 *
 * @param body - JSON payload for the job; the examples in this guide send
 *   `input` (source URL) and `kind`.
 * @returns The decoded JSON response; callers read its `id`.
 * @throws Error when Ittybit answers with a non-2xx status.
 */
async function createTask(body: Record<string, unknown>) {
  const res = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${ITTYBIT_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(body),
  });
  // fetch() only rejects on network failures. Without this check, an auth or
  // validation error would be returned as if it were a task and show up
  // later as an undefined `id`.
  if (!res.ok) {
    const detail = await res.text().catch(() => "");
    throw new Error(
      `Ittybit request failed: ${res.status} ${res.statusText} ${detail}`.trim()
    );
  }
  return res.json();
}
// Start the HLS and thumbnail jobs together and wait for both creation calls.
const creationRequests = [
  createTask({ input: videoUrl, kind: "adaptive_video" }),
  createTask({ input: videoUrl, kind: "image" }),
];
const [hlsTask, thumbTask] = await Promise.all(creationRequests);

// Log the ids so the jobs can be looked up later.
console.log("HLS task:", hlsTask.id);
console.log("Thumbnail task:", thumbTask.id);
import os
import requests
# Ittybit API key; os.environ[...] raises KeyError when the variable is unset.
ITTYBIT_API_KEY = os.environ["ITTYBIT_API_KEY"]
# Source video URL passed to Ittybit for processing and to Gemini for description.
video_url = "https://example.com/uploads/meeting-2026-04-01.mp4"
def create_task(body: dict) -> dict:
    """Submit a processing job to Ittybit and return the decoded JSON response.

    Args:
        body: JSON payload for the job; the examples in this guide send
            ``input`` (source URL) and ``kind``.

    Returns:
        The decoded JSON response; callers read its ``id``.

    Raises:
        requests.HTTPError: if Ittybit answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within 30 seconds.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
        json=body,
        # requests has no default timeout; without one a stalled connection hangs forever.
        timeout=30,
    )
    # Surface auth/validation errors here rather than as a KeyError on ["id"] later.
    res.raise_for_status()
    return res.json()
# Create the HLS job first, then the thumbnail job.
hls_task, thumb_task = (
    create_task({"input": video_url, "kind": kind})
    for kind in ("adaptive_video", "image")
)
print("HLS task:", hls_task["id"])
print("Thumbnail task:", thumb_task["id"])

Wait for processing
Poll until both tasks complete. In production, use webhooks instead.
/**
 * Polls an Ittybit job every two seconds until it reaches a terminal state.
 *
 * @param taskId - Id returned when the job was created.
 * @param timeoutMs - Maximum total time to keep polling; defaults to 10 minutes.
 * @returns The job JSON once its status is "succeeded".
 * @throws Error if a status request returns a non-2xx response, the job
 *   reports "failed", or the timeout elapses first.
 */
async function waitForTask(taskId: string, timeoutMs = 600000): Promise<any> {
  const deadline = Date.now() + timeoutMs;
  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: { Authorization: `Bearer ${ITTYBIT_API_KEY}` },
    });
    // An error body has no usable `status`, so without this check the loop
    // would keep polling forever on e.g. a 401 or 404.
    if (!res.ok) {
      throw new Error(`Task status request failed: ${res.status} ${res.statusText}`);
    }
    const task = await res.json();
    if (task.status === "succeeded") return task;
    if (task.status === "failed") {
      // Avoid "[object Object]" when the API returns a structured error.
      const reason =
        typeof task.error === "string" ? task.error : JSON.stringify(task.error);
      throw new Error(`Task failed: ${reason}`);
    }
    // Bound the wait so a job stuck in a non-terminal state cannot hang the caller.
    if (Date.now() >= deadline) {
      throw new Error(`Timed out waiting for task ${taskId}`);
    }
    await new Promise((r) => setTimeout(r, 2000));
  }
}
// Poll both jobs concurrently and continue once each has succeeded.
const pendingIds = [hlsTask.id, thumbTask.id];
const [hls, thumb] = await Promise.all(pendingIds.map((id) => waitForTask(id)));

// Output locations reported by the finished jobs.
const hlsUrl = hls.output_url; // e.g. https://cdn.ittybit.com/.../playlist.m3u8
const thumbnailUrl = thumb.output_url; // e.g. https://cdn.ittybit.com/.../thumb.jpg
import time
def wait_for_task(task_id: str, timeout: float = 600.0) -> dict:
    """Poll an Ittybit job every two seconds until it reaches a terminal state.

    Args:
        task_id: Id returned when the job was created.
        timeout: Maximum total number of seconds to keep polling (default 600).

    Returns:
        The job JSON once its status is ``"succeeded"``.

    Raises:
        requests.HTTPError: if a status request returns a 4xx/5xx response.
        Exception: if the job reports status ``"failed"``.
        TimeoutError: if the job has not finished within ``timeout`` seconds.
    """
    deadline = time.monotonic() + timeout
    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={"Authorization": f"Bearer {ITTYBIT_API_KEY}"},
            # requests has no default timeout; bound each individual poll.
            timeout=30,
        )
        # An error body has no usable "status"; fail here instead of looping
        # forever or raising a confusing KeyError.
        res.raise_for_status()
        task = res.json()
        status = task.get("status")
        if status == "succeeded":
            return task
        if status == "failed":
            # .get() so a missing "error" field cannot mask the real failure.
            raise Exception(f"Task failed: {task.get('error')}")
        # Bound the wait so a job stuck in a non-terminal state cannot hang the caller.
        if time.monotonic() >= deadline:
            raise TimeoutError(f"Timed out waiting for task {task_id}")
        time.sleep(2)
# Wait for the HLS job, then for the thumbnail job.
hls, thumb = (wait_for_task(job["id"]) for job in (hls_task, thumb_task))
hls_url = hls["output_url"]  # e.g. https://cdn.ittybit.com/.../playlist.m3u8
thumbnail_url = thumb["output_url"] # e.g. https://cdn.ittybit.com/.../thumb.jpg

Generate scene descriptions with Gemini
Send the original video to Gemini and ask for a dense, timestamped description of every scene. This is the text that makes the video searchable.
import { GoogleGenerativeAI } from "@google/generative-ai";
// Gemini client and model used to describe the video.
const genai = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
const model = genai.getGenerativeModel({ model: "gemini-2.0-flash" });

// Send the video (by URI) together with a prompt that asks for timestamped,
// search-oriented scene descriptions as a JSON array.
const result = await model.generateContent([
  {
    fileData: { mimeType: "video/mp4", fileUri: videoUrl },
  },
  {
    text: `Describe every distinct scene in this video. For each scene, include:
- Approximate start and end timestamps (MM:SS)
- What is happening: people, actions, objects, text on screen
- Any spoken topics or key phrases you can identify
- The setting and visual context
Be thorough. These descriptions will be used for text search, so include specific nouns, verbs, and details a user might search for. Return the result as JSON:
[{ "start": "00:00", "end": "01:23", "description": "..." }, ...]`,
  },
]);

const text = result.response.text();
// The reply may be wrapped in a Markdown ```json ... ``` fence. Strip only the
// fence markers: matching single backticks would also delete backticks that
// appear inside the descriptions themselves.
const scenes = JSON.parse(text.replace(/```json\n?|\n?```/g, ""));
// The indexing step iterates over `scenes`, so reject anything that is not an array.
if (!Array.isArray(scenes)) {
  throw new Error("Expected Gemini to return a JSON array of scenes");
}
console.log(`${scenes.length} scenes described`);
import json
import re
import google.generativeai as genai
# Configure the Gemini SDK and pick the model that will describe the video.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel(model_name="gemini-2.0-flash")

# The video is passed by URI as one request part; the prompt is the other.
video_part = genai.protos.Part(
    file_data=genai.protos.FileData(mime_type="video/mp4", file_uri=video_url)
)
scene_prompt = """Describe every distinct scene in this video. For each scene, include:
- Approximate start and end timestamps (MM:SS)
- What is happening: people, actions, objects, text on screen
- Any spoken topics or key phrases you can identify
- The setting and visual context
Be thorough. These descriptions will be used for text search, so include specific nouns, verbs, and details a user might search for. Return the result as JSON:
[{ "start": "00:00", "end": "01:23", "description": "..." }, ...]"""
response = model.generate_content([video_part, scene_prompt])

# Remove a surrounding Markdown ```json fence, if any, before decoding the JSON.
text = response.text
fence_pattern = r"```json\n?|\n?```"
scenes = json.loads(re.sub(fence_pattern, "", text))
print(f"{len(scenes)} scenes described")

Store descriptions with full-text search
Insert each scene description into PostgreSQL with a tsvector column for full-text search. The video’s HLS and thumbnail URLs go in too so search results link directly to playable media.
import pg from "pg";
// Connection pool used for both indexing and searching.
const pool = new pg.Pool({ connectionString: process.env.DATABASE_URL });

// One-time schema setup: the scenes table, whose search_vector column is
// generated from `description`, plus a GIN index on that column.
const createSchemaSql = ` CREATE TABLE IF NOT EXISTS video_scenes (
id SERIAL PRIMARY KEY,
video_url TEXT NOT NULL,
hls_url TEXT NOT NULL,
thumbnail_url TEXT NOT NULL,
start_time TEXT NOT NULL,
end_time TEXT NOT NULL,
description TEXT NOT NULL,
search_vector TSVECTOR GENERATED ALWAYS AS (
to_tsvector('english', description)
) STORED
);
CREATE INDEX IF NOT EXISTS idx_search_vector
ON video_scenes USING GIN (search_vector);`;
await pool.query(createSchemaSql);

// Store one row per scene, inserted sequentially in the order Gemini returned them.
const insertSceneSql = `INSERT INTO video_scenes (video_url, hls_url, thumbnail_url, start_time, end_time, description)
VALUES ($1, $2, $3, $4, $5, $6)`;
for (const scene of scenes) {
  const rowValues = [videoUrl, hlsUrl, thumbnailUrl, scene.start, scene.end, scene.description];
  await pool.query(insertSceneSql, rowValues);
}
console.log(`Indexed ${scenes.length} scenes`);
import psycopg2
# Connection and cursor used for both indexing and searching.
conn = psycopg2.connect(os.environ["DATABASE_URL"])
cur = conn.cursor()

# One-time schema setup: the scenes table, whose search_vector column is
# generated from `description`, plus a GIN index on that column.
create_schema_sql = """
CREATE TABLE IF NOT EXISTS video_scenes (
id SERIAL PRIMARY KEY,
video_url TEXT NOT NULL,
hls_url TEXT NOT NULL,
thumbnail_url TEXT NOT NULL,
start_time TEXT NOT NULL,
end_time TEXT NOT NULL,
description TEXT NOT NULL,
search_vector TSVECTOR GENERATED ALWAYS AS (
to_tsvector('english', description)
) STORED
);
CREATE INDEX IF NOT EXISTS idx_search_vector
ON video_scenes USING GIN (search_vector);
"""
cur.execute(create_schema_sql)

# Store one row per scene, then commit the table creation and inserts together.
insert_scene_sql = """INSERT INTO video_scenes (video_url, hls_url, thumbnail_url, start_time, end_time, description)
VALUES (%s, %s, %s, %s, %s, %s)"""
for scene in scenes:
    row_values = (
        video_url,
        hls_url,
        thumbnail_url,
        scene["start"],
        scene["end"],
        scene["description"],
    )
    cur.execute(insert_scene_sql, row_values)
conn.commit()
print(f"Indexed {len(scenes)} scenes")

Search the library
Users type a natural-language query. PostgreSQL’s websearch_to_tsquery parses it and applies English stemming, and ts_rank orders the matching scenes by relevance. Results come back with the HLS URL for playback and each scene’s start and end timestamps for seeking.
/**
 * Runs a full-text search over the indexed scene descriptions.
 *
 * @param query - Free-form search text, parsed by `websearch_to_tsquery('english', ...)`.
 * @param limit - Maximum number of rows to return (default 10).
 * @returns Matching rows with their media URLs, timestamps, description and
 *   `rank`, ordered by `ts_rank` descending.
 */
async function searchVideos(query: string, limit = 10) {
  // $1 is the search text (used for both ranking and filtering); $2 caps the row count.
  const searchSql = `SELECT
video_url,
hls_url,
thumbnail_url,
start_time,
end_time,
description,
ts_rank(search_vector, websearch_to_tsquery('english', $1)) AS rank
FROM video_scenes
WHERE search_vector @@ websearch_to_tsquery('english', $1)
ORDER BY rank DESC
LIMIT $2`;
  const result = await pool.query(searchSql, [query, limit]);
  return result.rows;
}
// Demo: look up the whiteboard scene and print where to play it.
const results = await searchVideos("engineer draws on whiteboard");
results.forEach((r) => {
  console.log(`[${r.start_time} - ${r.end_time}] ${r.description}`);
  console.log(` Play: ${r.hls_url}`);
  console.log(` Thumbnail: ${r.thumbnail_url}`);
});
def search_videos(query: str, limit: int = 10) -> list[dict]:
    """Run a full-text search over the indexed scene descriptions.

    Args:
        query: Free-form search text, parsed by ``websearch_to_tsquery('english', ...)``.
        limit: Maximum number of rows to return (default 10).

    Returns:
        One dict per matching row, keyed by column name (including ``rank``),
        ordered by ``ts_rank`` descending.
    """
    search_sql = """SELECT
video_url,
hls_url,
thumbnail_url,
start_time,
end_time,
description,
ts_rank(search_vector, websearch_to_tsquery('english', %s)) AS rank
FROM video_scenes
WHERE search_vector @@ websearch_to_tsquery('english', %s)
ORDER BY rank DESC
LIMIT %s"""
    # The search text fills two placeholders (ranking and filtering), so it is passed twice.
    cur.execute(search_sql, (query, query, limit))
    column_names = [col[0] for col in cur.description]
    hits = []
    for row in cur.fetchall():
        hits.append(dict(zip(column_names, row)))
    return hits
# Demo: look up the whiteboard scene and print where to play it.
results = search_videos("engineer draws on whiteboard")
for r in results:
    print(f"[{r['start_time']} - {r['end_time']}] {r['description']}")
    print(f" Play: {r['hls_url']}")
    print(f" Thumbnail: {r['thumbnail_url']}")

See also
- HLS streaming — adaptive streaming setup
- Thumbnails — thumbnail generation options
- Video editing with Gemini — AI-driven video trimming