fern/01-guide/04-baml-basics/multi-modal.mdx
You can use audio, image, pdf, or video input types in BAML prompts. Just create an input argument of that type and render it in the prompt.
Switch from "Prompt Review" to "Raw cURL" in the playground to see how BAML translates multi-modal input into the LLM Request body.
// "image" is a reserved keyword so we name the arg "img"
// Takes a single image input and returns a natural-language description of it.
function DescribeMedia(img: image) -> string {
  client "openai-responses/gpt-5" // GPT-5 has excellent multimodal support
  // Most LLM providers require images or audio to be sent as "user" messages.
  prompt #"
{{_.role("user")}}
Describe this image: {{ img }}
"#
}
// See the "testing functions" Guide for more on testing Multimodal functions
test Test {
  functions [DescribeMedia]
  args {
    img {
      // Image test inputs may be given by url (as here) or as base64 data.
      url "https://upload.wikimedia.org/wikipedia/en/4/4d/Shrek_%28character%29.png"
    }
  }
}
See how to test images in the playground.
Calling a BAML function with an image input argument type (see image types)
The from_url and from_base64 methods create an Image object based on input type.
<CodeBlocks>
from baml_py import Image
from baml_client import b
async def test_image_input():
    """Call TestImageInput with an image supplied by URL, then by base64 data."""
    # from URL
    res = await b.TestImageInput(
        img=Image.from_url(
            "https://upload.wikimedia.org/wikipedia/en/4/4d/Shrek_%28character%29.png"
        )
    )

    # Base64 image
    image_b64 = "iVBORw0K...."
    res = await b.TestImageInput(
        img=Image.from_base64("image/png", image_b64)
    )
import { b } from '../baml_client'
import { Image } from "@boundaryml/baml"
...
// URL
let res = await b.TestImageInput(
  Image.fromUrl('https://upload.wikimedia.org/wikipedia/en/4/4d/Shrek_%28character%29.png'),
)

// Base64 (reuse `res`; redeclaring with `let` in the same scope is a SyntaxError)
const image_b64 = "iVB0R..."
res = await b.TestImageInput(
  Image.fromBase64('image/png', image_b64),
)
package main
import (
"context"
b "example.com/myproject/baml_client"
)
// testImageInput calls TestImageInput twice: once with a URL-backed image and
// once with base64-encoded image data. Returns the first error encountered.
func testImageInput() error {
	ctx := context.Background()

	// From URL (second argument is an optional media type; nil lets BAML infer it)
	img, err := b.NewImageFromUrl(
		"https://upload.wikimedia.org/wikipedia/en/4/4d/Shrek_%28character%29.png",
		nil,
	)
	if err != nil {
		return err
	}
	result, err := b.TestImageInput(ctx, img)
	if err != nil {
		return err
	}
	_ = result // Go rejects unused locals at compile time; consume the description as needed

	// Base64 image
	imageB64 := "iVBORw0K...."
	img2, err := b.NewImageFromBase64(imageB64, stringPtr("image/png"))
	if err != nil {
		return err
	}
	result2, err := b.TestImageInput(ctx, img2)
	if err != nil {
		return err
	}
	_ = result2

	return nil
}
// stringPtr returns a pointer to s; Go has no address-of for string literals,
// so this helper is used to pass optional media-type arguments.
func stringPtr(s string) *string {
	return &s
}
we're working on it!
use myproject::baml_client::sync_client::B;
use myproject::baml_client::new_image_from_url;
use myproject::baml_client::new_image_from_base64;
fn test_image_input() {
    // Build an image value from a remote URL; the second argument is an
    // optional media type, left unset so it can be inferred.
    let image = new_image_from_url(
        "https://upload.wikimedia.org/wikipedia/en/4/4d/Shrek_%28character%29.png",
        None,
    );
    let res = B.TestImageInput.call(&image).unwrap();

    // Build an image value from base64-encoded bytes, stating the media type.
    let image_b64 = "iVBORw0K....";
    let image = new_image_from_base64(image_b64, Some("image/png"));
    let res = B.TestImageInput.call(&image).unwrap();
}
Calling functions that have audio types. See audio types
async def run():
    """Call TestAudioInput with audio supplied by URL, then by base64 data."""
    # From URL — note the keyword is `audio=`, matching the function's parameter
    # (the original example incorrectly used `img=` here).
    res = await b.TestAudioInput(
        audio=Audio.from_url(
            "https://actions.google.com/sounds/v1/emergency/beeper_emergency_call.ogg"
        )
    )

    # Base64 audio
    b64 = "iVBORw0K...."
    res = await b.TestAudioInput(
        audio=Audio.from_base64("audio/ogg", b64)
    )
```typescript TypeScript
import { b } from '../baml_client'
import { Audio } from "@boundaryml/baml"
...
// URL
let res = await b.TestAudioInput(
  Audio.fromUrl('https://actions.google.com/sounds/v1/emergency/beeper_emergency_call.ogg'),
)

// Base64 (reuse `res`; redeclaring with `let` in the same scope is a SyntaxError)
const audio_base64 = ".."
res = await b.TestAudioInput(
  Audio.fromBase64('audio/ogg', audio_base64),
)
package main
import (
"context"
b "example.com/myproject/baml_client"
)
// testAudioInput calls TestAudioInput twice: once with a URL-backed audio
// value and once with base64-encoded audio data. Requires the stringPtr helper.
func testAudioInput() error {
	ctx := context.Background()

	// From URL (nil media type lets BAML infer it)
	aud, err := b.NewAudioFromUrl(
		"https://actions.google.com/sounds/v1/emergency/beeper_emergency_call.ogg",
		nil,
	)
	if err != nil {
		return err
	}
	result, err := b.TestAudioInput(ctx, aud)
	if err != nil {
		return err
	}
	_ = result // Go rejects unused locals at compile time; consume the result as needed

	// Base64 audio
	audioB64 := "iVBORw0K...."
	aud2, err := b.NewAudioFromBase64(audioB64, stringPtr("audio/ogg"))
	if err != nil {
		return err
	}
	result2, err := b.TestAudioInput(ctx, aud2)
	if err != nil {
		return err
	}
	_ = result2

	return nil
}
we're working on it!
use myproject::baml_client::sync_client::B;
use myproject::baml_client::new_audio_from_url;
fn test_audio_input() {
    // Construct an audio value from a remote URL; the second argument is an
    // optional media type, which we leave unset here.
    let url = "https://actions.google.com/sounds/v1/emergency/beeper_emergency_call.ogg";
    let audio = new_audio_from_url(url, None);
    let res = B.TestAudioInput.call(&audio).unwrap();
}
Calling functions that have pdf types. See pdf types
<CodeBlocks>

⚠️ **Warning:** PDF inputs must be provided as Base64 data (e.g. `Pdf.from_base64`); URL-based PDF inputs are not currently supported. Additionally, PDF inputs are only supported by models that explicitly allow document (PDF) modalities, such as Gemini 2.x Flash/Pro or Vertex AI Gemini. Make sure the `client` you select advertises PDF support, otherwise your request will fail.

```python Python
from baml_py import Pdf
from baml_client import b
async def run():
    """Call TestPdfInput with base64 PDF data (URL-based PDFs are unsupported)."""
    # Base64 PDF data
    b64 = "JVBERi0K...."
    res = await b.TestPdfInput(
        pdf=Pdf.from_base64("application/pdf", b64)
    )
```typescript TypeScript
import { b } from '../baml_client'
import { Pdf } from "@boundaryml/baml"
...
// Base64 only — URL-based PDF inputs are not currently supported
const pdf_base64 = ".."
let res = await b.TestPdfInput(
  Pdf.fromBase64('application/pdf', pdf_base64),
)
package main
import (
"context"
b "example.com/myproject/baml_client"
)
// testPdfInput calls TestPdfInput with base64-encoded PDF data
// (URL-based PDF inputs are not currently supported).
func testPdfInput() error {
	ctx := context.Background()

	// Base64 PDF data (nil media type lets BAML infer it)
	pdfB64 := "JVBERi0K...."
	pdf, err := b.NewPDFFromBase64(pdfB64, nil)
	if err != nil {
		return err
	}
	result, err := b.TestPdfInput(ctx, pdf)
	if err != nil {
		return err
	}
	_ = result // Go rejects unused locals at compile time; consume the result as needed

	return nil
}
we're working on it!
use myproject::baml_client::sync_client::B;
use myproject::baml_client::new_pdf_from_base64;
fn test_pdf_input() {
    // PDFs must be passed as base64 data; URL-based PDF inputs are unsupported.
    let b64 = "JVBERi0K....";
    let pdf = new_pdf_from_base64(b64, None);
    let res = B.TestPdfInput.call(&pdf).unwrap();
}
Calling functions that have video types. See video types
<CodeBlocks>

⚠️ **Warning:** Video inputs require a model that supports video understanding (for example Gemini 2.x Flash/Pro). If your chosen model does not list video support, your function call will return an error. When you supply a Video as a URL, the URL is forwarded unchanged to the model; if the model cannot fetch remote content, you must instead pass the bytes via `Video.from_base64`.

```python Python
from baml_py import Video
from baml_client import b
async def run():
    """Call TestVideoInput with video supplied by URL, then by base64 data."""
    # From URL — forwarded unchanged to the model
    res = await b.TestVideoInput(
        video=Video.from_url(
            "https://example.com/sample.mp4"
        )
    )

    # Base64 video
    b64 = "AAAAGGZ0eXBpc29t...."
    res = await b.TestVideoInput(
        video=Video.from_base64("video/mp4", b64)
    )
```typescript TypeScript
import { b } from '../baml_client'
import { Video } from "@boundaryml/baml"
...
// URL
let res = await b.TestVideoInput(
  Video.fromUrl('https://example.com/sample.mp4'),
)

// Base64 (reuse `res`; redeclaring with `let` in the same scope is a SyntaxError)
const video_base64 = ".."
res = await b.TestVideoInput(
  Video.fromBase64('video/mp4', video_base64),
)
package main
import (
"context"
b "example.com/myproject/baml_client"
)
// testVideoInput calls TestVideoInput twice: once with a URL-backed video
// value and once with base64-encoded video data. Requires the stringPtr helper.
func testVideoInput() error {
	ctx := context.Background()

	// From URL (nil media type lets BAML infer it)
	vid, err := b.NewVideoFromUrl("https://example.com/sample.mp4", nil)
	if err != nil {
		return err
	}
	result, err := b.TestVideoInput(ctx, vid)
	if err != nil {
		return err
	}
	_ = result // Go rejects unused locals at compile time; consume the result as needed

	// Base64 video
	videoB64 := "AAAAGGZ0eXBpc29t...."
	vid2, err := b.NewVideoFromBase64(videoB64, stringPtr("video/mp4"))
	if err != nil {
		return err
	}
	result2, err := b.TestVideoInput(ctx, vid2)
	if err != nil {
		return err
	}
	_ = result2

	return nil
}
we're working on it!
use myproject::baml_client::sync_client::B;
use myproject::baml_client::new_video_from_url;
fn test_video_input() {
    // Construct a video value from a remote URL (optional media type omitted).
    let url = "https://example.com/sample.mp4";
    let video = new_video_from_url(url, None);
    let res = B.TestVideoInput.call(&video).unwrap();
}
By default, BAML automatically handles URL-to-base64 conversion based on what each provider supports. However, you can customize this behavior using the media_url_handler configuration:
If you're using Anthropic and want to avoid the latency of URL fetching:
// Anthropic client that sends image URLs through directly rather than
// downloading and re-encoding them, avoiding URL-fetch latency on our side.
client<llm> FastClaude {
  provider anthropic
  options {
    model "claude-3-5-sonnet-20241022"
    api_key env.ANTHROPIC_API_KEY
    media_url_handler {
      image "send_url" // Anthropic can fetch URLs directly
      pdf "send_base64" // Required by Anthropic API (As of October 2025)
    }
  }
}
When using Google AI with images stored in GCS:
// Google AI client configured for images stored in Google Cloud Storage.
client<llm> GeminiWithGCS {
  provider google-ai
  options {
    model "gemini-1.5-pro"
    api_key env.GOOGLE_API_KEY
    media_url_handler {
      image "send_base64_unless_google_url" // Preserve gs:// URLs, convert others
    }
  }
}
For maximum compatibility across providers:
// Client that embeds all media as base64 for maximum cross-provider compatibility.
client<llm> CompatibleClient {
  provider openai
  options {
    model "gpt-4o"
    api_key env.OPENAI_API_KEY
    media_url_handler {
      image "send_base64" // Ensure images are embedded
      audio "send_base64" // OpenAI requires base64 for audio
      pdf "send_base64" // Embed PDFs for reliability
    }
  }
}
- `send_url` — Allows providers to fetch URLs themselves, reducing request payload size.
- `send_base64` — Embeds the content in the request, avoiding external dependencies.
- `send_url_add_mime_type` — Required for proper media handling by some providers (if the mime type is not provided, the media will be downloaded to determine it).
- `send_base64_unless_google_url` — Preserves Google Cloud Storage URLs for Google providers; converts all other URLs to base64.

See the provider documentation for provider-specific defaults and requirements.