Media processing agent with Mistral

View Markdown

Large language models are good at understanding intent but can’t process media on their own. By wiring Mistral’s function calling to the Ittybit API, you can build an agent that accepts plain English requests like “transcode this to 720p” or “create an HLS stream” and dispatches the right media tasks automatically.

Install dependencies

npm install @mistralai/mistralai
pip install mistralai requests

Define the tool

The agent needs one tool: process_media. It accepts an input URL, a task kind, and processing options. Mistral will extract these from the user’s natural language request.

// JSON Schema for the arguments Mistral must extract from the user's request
// when it decides to call process_media. Only `input` and `kind` are mandatory.
const processMediaParameters = {
  type: "object",
  properties: {
    input: { type: "string", description: "URL of the source media file" },
    kind: {
      type: "string",
      enum: ["video", "audio", "image", "adaptive_video"],
      description: "Type of processing task",
    },
    options: {
      type: "object",
      description:
        "Processing options like format, width, height, quality, start, end",
    },
  },
  required: ["input", "kind"],
};

// Tool list handed to the Mistral chat API. The agent exposes exactly one
// function tool, process_media.
const tools = [
  {
    type: "function" as const,
    function: {
      name: "process_media",
      description:
        "Create a media processing task via Ittybit. Supports transcoding video, extracting audio, generating thumbnails, and creating adaptive streams.",
      parameters: processMediaParameters,
    },
  },
];
# JSON Schema for the arguments Mistral must extract from the user's request
# when it decides to call process_media. Only "input" and "kind" are mandatory.
_process_media_parameters = {
    "type": "object",
    "properties": {
        "input": {
            "type": "string",
            "description": "URL of the source media file",
        },
        "kind": {
            "type": "string",
            "enum": ["video", "audio", "image", "adaptive_video"],
            "description": "Type of processing task",
        },
        "options": {
            "type": "object",
            "description": (
                "Processing options like format, width, "
                "height, quality, start, end"
            ),
        },
    },
    "required": ["input", "kind"],
}

# Description of the single function tool the agent exposes.
_process_media_function = {
    "name": "process_media",
    "description": (
        "Create a media processing task via Ittybit. "
        "Supports transcoding video, extracting audio, "
        "generating thumbnails, and creating adaptive streams."
    ),
    "parameters": _process_media_parameters,
}

# Tool list handed to the Mistral chat API.
tools = [{"type": "function", "function": _process_media_function}]

Handle tool calls

Send the user message to Mistral with the tool definition. When the model decides to call process_media, parse the arguments and POST to the Ittybit Tasks API.

import { Mistral } from "@mistralai/mistralai";

// Shared Mistral client; the API key is read from the MISTRAL_API_KEY
// environment variable.
const mistralApiKey = process.env.MISTRAL_API_KEY;
const mistral = new Mistral({ apiKey: mistralApiKey });

/**
 * Creates a media processing job by POSTing to the Ittybit jobs endpoint.
 *
 * @param args - Tool-call arguments: the source media URL, the task kind, and
 *   optional processing options (an empty object is sent when omitted).
 * @returns The parsed JSON body of the response, whatever its HTTP status.
 */
async function processMedia(args: {
  input: string;
  kind: string;
  options?: Record<string, unknown>;
}) {
  const payload = {
    input: args.input,
    kind: args.kind,
    options: args.options ?? {},
  };

  const requestHeaders = {
    Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
    "Content-Type": "application/json",
  };

  const response = await fetch("https://api.ittybit.com/jobs", {
    method: "POST",
    headers: requestHeaders,
    body: JSON.stringify(payload),
  });

  return response.json();
}

/**
 * Runs one agent turn: sends the user's request to Mistral, executes every
 * process_media tool call the model issues, then asks the model to summarise
 * the outcome.
 *
 * @param userMessage - Plain-English media processing request.
 * @returns `{ reply, tasks }` where `tasks` holds one processMedia result per
 *   executed tool call (empty when the model answered without tools), or
 *   `undefined` when Mistral returned no choices.
 */
async function runAgent(userMessage: string) {
  const messages: any[] = [
    {
      role: "system",
      content:
        "You are a media processing assistant. Use the process_media tool to handle user requests for transcoding, thumbnails, format conversion, and streaming.",
    },
    { role: "user", content: userMessage },
  ];

  const response = await mistral.chat.complete({
    model: "mistral-large-latest",
    messages,
    tools,
  });

  const choice = response.choices?.[0];
  if (!choice) return;

  const toolCalls = choice.message.toolCalls;
  if (choice.finishReason !== "tool_calls" || !toolCalls) {
    // The model answered directly; nothing to execute.
    return { reply: choice.message.content, tasks: [] };
  }

  // The assistant turn that requested the tools must appear exactly once,
  // before all of its tool results. Pushing it per tool call would duplicate
  // it whenever the model issues several calls in one response.
  messages.push(choice.message);

  const results: any[] = [];

  for (const toolCall of toolCalls) {
    const toolName = toolCall.function.name;

    if (toolName !== "process_media") {
      // Every tool call needs a matching tool message, so answer unknown
      // tools with an error instead of silently skipping them.
      messages.push({
        role: "tool",
        name: toolName,
        content: JSON.stringify({ error: `Unknown tool: ${toolName}` }),
        toolCallId: toolCall.id,
      });
      continue;
    }

    // The SDK may deliver arguments as a JSON string or as a parsed object.
    const rawArgs = toolCall.function.arguments;
    const args = typeof rawArgs === "string" ? JSON.parse(rawArgs) : rawArgs;

    const result = await processMedia(args);
    results.push(result);

    messages.push({
      role: "tool",
      name: "process_media",
      content: JSON.stringify(result),
      toolCallId: toolCall.id,
    });
  }

  // Get the model's summary of what happened
  const followUp = await mistral.chat.complete({
    model: "mistral-large-latest",
    messages,
    tools,
  });

  return {
    reply: followUp.choices?.[0]?.message.content,
    tasks: results,
  };
}
import json
import os
import requests
from mistralai import Mistral

# Shared Mistral client; the API key is read from the MISTRAL_API_KEY
# environment variable (a missing variable raises KeyError here).
_mistral_api_key = os.environ["MISTRAL_API_KEY"]
client = Mistral(api_key=_mistral_api_key)


def process_media(args: dict, timeout: float = 30.0) -> dict:
    """Create a media processing job by POSTing to the Ittybit jobs endpoint.

    Args:
        args: Tool-call arguments with required "input" (source URL) and
            "kind" keys and an optional "options" dict.
        timeout: Seconds to wait for the HTTP request before `requests`
            raises; without it a stalled connection would block forever.

    Returns:
        The parsed JSON body of the response.

    Raises:
        KeyError: If "input" or "kind" is missing from ``args``, or the
            ITTYBIT_API_KEY environment variable is not set.
    """
    res = requests.post(
        "https://api.ittybit.com/jobs",
        headers={
            "Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}",
        },
        json={
            "input": args["input"],
            "kind": args["kind"],
            # `or {}` also covers an explicit null from the model, so the API
            # always receives an object.
            "options": args.get("options") or {},
        },
        timeout=timeout,
    )
    return res.json()


def run_agent(user_message: str) -> dict:
    """Run one agent turn for a plain-English media processing request.

    Sends the request to Mistral, executes every process_media tool call the
    model issues, then asks the model to summarise the outcome.

    Args:
        user_message: The user's request.

    Returns:
        A dict with "reply" (the model's text) and "tasks" (one
        process_media result per executed tool call; empty when the model
        answered without calling the tool).
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are a media processing assistant. "
                "Use the process_media tool to handle user requests "
                "for transcoding, thumbnails, format conversion, "
                "and streaming."
            ),
        },
        {"role": "user", "content": user_message},
    ]

    response = client.chat.complete(
        model="mistral-large-latest",
        messages=messages,
        tools=tools,
    )

    choice = response.choices[0]
    tool_calls = choice.message.tool_calls

    if choice.finish_reason != "tool_calls" or not tool_calls:
        # The model answered directly; nothing to execute.
        return {"reply": choice.message.content, "tasks": []}

    # The assistant turn that requested the tools must appear exactly once,
    # before all of its tool results. Appending it per tool call would
    # duplicate it whenever the model issues several calls in one response.
    messages.append(choice.message)

    results = []

    for tool_call in tool_calls:
        tool_name = tool_call.function.name

        if tool_name != "process_media":
            # Every tool call needs a matching tool message, so answer
            # unknown tools with an error instead of silently skipping them.
            messages.append(
                {
                    "role": "tool",
                    "name": tool_name,
                    "content": json.dumps({"error": f"Unknown tool: {tool_name}"}),
                    "tool_call_id": tool_call.id,
                }
            )
            continue

        # The SDK may deliver arguments as a JSON string or as a parsed dict.
        raw_args = tool_call.function.arguments
        args = json.loads(raw_args) if isinstance(raw_args, str) else raw_args

        result = process_media(args)
        results.append(result)

        messages.append(
            {
                "role": "tool",
                "name": "process_media",
                "content": json.dumps(result),
                "tool_call_id": tool_call.id,
            }
        )

    # Ask the model to summarise what happened.
    follow_up = client.chat.complete(
        model="mistral-large-latest",
        messages=messages,
        tools=tools,
    )

    return {
        "reply": follow_up.choices[0].message.content,
        "tasks": results,
    }

Try it

// Example requests, sent to the agent one after another.
const examplePrompts = [
  // Transcode to 720p
  "Transcode https://example.com/raw.mov to 720p MP4",
  // Generate a thumbnail
  "Grab a thumbnail at the 5 second mark from https://example.com/raw.mov",
  // Create an HLS stream
  "Create an HLS stream from https://example.com/raw.mov",
  // Multiple tasks from one prompt
  "Take https://example.com/raw.mov and make a 720p MP4, a WebP thumbnail, and an HLS stream",
];

for (const prompt of examplePrompts) {
  await runAgent(prompt);
}
# Example requests, sent to the agent one after another.
example_prompts = [
    # Transcode to 720p
    "Transcode https://example.com/raw.mov to 720p MP4",
    # Generate a thumbnail
    "Grab a thumbnail at the 5 second mark from https://example.com/raw.mov",
    # Create an HLS stream
    "Create an HLS stream from https://example.com/raw.mov",
    # Multiple tasks from one prompt
    "Take https://example.com/raw.mov and make a 720p MP4, "
    "a WebP thumbnail, and an HLS stream",
]

for prompt in example_prompts:
    run_agent(prompt)

For the last example, Mistral can issue multiple process_media calls in a single response — one for each task the user asked for — and the agent executes each call before asking the model for a summary.

Polling for results

Tasks run asynchronously. Poll the task endpoint to check status, or use webhooks for push notifications.

/**
 * Polls the Ittybit job endpoint until the task reaches a terminal status.
 *
 * @param taskId - ID of the task to poll.
 * @param options - Optional tuning: `intervalMs` is the delay between polls
 *   (default 2000); `timeoutMs` is the overall deadline (default: none).
 * @returns The task payload once its `status` is "completed" or "error".
 * @throws Error when the API answers with a non-2xx status, or when the
 *   deadline passes before the task finishes.
 */
async function waitForTask(
  taskId: string,
  options: { intervalMs?: number; timeoutMs?: number } = {}
): Promise<any> {
  const { intervalMs = 2000, timeoutMs = Infinity } = options;
  const deadline = Date.now() + timeoutMs;

  while (true) {
    const res = await fetch(`https://api.ittybit.com/jobs/${taskId}`, {
      headers: {
        Authorization: `Bearer ${process.env.ITTYBIT_API_KEY}`,
      },
    });

    // An error response (bad key, unknown task ID, ...) never carries a
    // terminal task status, so without this check the loop would spin forever.
    if (!res.ok) {
      throw new Error(
        `Ittybit returned HTTP ${res.status} while polling task ${taskId}`
      );
    }

    const data = await res.json();

    if (data.status === "completed" || data.status === "error") {
      return data;
    }

    // Give up before sleeping if the next poll would land past the deadline.
    if (Date.now() + intervalMs > deadline) {
      throw new Error(`Timed out waiting for task ${taskId}`);
    }

    await new Promise<void>((resolve) => setTimeout(resolve, intervalMs));
  }
}
import time

def wait_for_task(
    task_id: str,
    poll_interval: float = 2.0,
    max_wait: float = float("inf"),
) -> dict:
    """Poll the Ittybit job endpoint until the task reaches a terminal status.

    Args:
        task_id: ID of the task to poll.
        poll_interval: Seconds to sleep between polls.
        max_wait: Overall deadline in seconds; unlimited by default.

    Returns:
        The task payload once its "status" is "completed" or "error".

    Raises:
        TimeoutError: If ``max_wait`` elapses before the task finishes.
        Exception: Whatever ``raise_for_status()`` raises for a non-2xx
            response.
    """
    deadline = time.monotonic() + max_wait

    while True:
        res = requests.get(
            f"https://api.ittybit.com/jobs/{task_id}",
            headers={
                "Authorization": f"Bearer {os.environ['ITTYBIT_API_KEY']}",
            },
            # Without a timeout a stalled connection would block forever.
            timeout=30,
        )
        # An error response (bad key, unknown task ID, ...) carries no
        # terminal task status, so fail loudly instead of polling on.
        res.raise_for_status()
        data = res.json()

        if data["status"] in ("completed", "error"):
            return data

        # Give up before sleeping if the next poll would land past the deadline.
        if time.monotonic() + poll_interval > deadline:
            raise TimeoutError(f"Timed out waiting for task {task_id}")

        time.sleep(poll_interval)

See also