Back to Baml

Multi Modal

fern/01-guide/04-baml-basics/multi-modal.mdx

0.222.0 · 11.4 KB
Original Source

Multi-modal input

You can use audio, image, pdf, or video input types in BAML prompts. Just create an input argument of that type and render it in the prompt.

Switch from "Prompt Review" to "Raw cURL" in the playground to see how BAML translates multi-modal input into the LLM Request body.

baml
// "image" is a reserved keyword so we name the arg "img"
function DescribeMedia(img: image) -> string {
  client "openai-responses/gpt-5"  // GPT-5 has excellent multimodal support
  // Most LLM providers require images or audio to be sent as "user" messages.
  prompt #"
    {{_.role("user")}}
    Describe this image: {{ img }}
  "#
}

// See the "testing functions" Guide for more on testing Multimodal functions
test Test {
  functions [DescribeMedia]
  args {
    img {
      url "https://upload.wikimedia.org/wikipedia/en/4/4d/Shrek_%28character%29.png"
    }
  }
}

See how to test images in the playground.

Try it! Press 'Run Test' below!

<div class="resizer">
<iframe class="resized" src="https://promptfiddle.com/embed?id=multimodal" height="640" style="border: none;" resize="both" overflow="auto" msallowfullscreen></iframe>
</div>

Calling Multimodal BAML Functions

Images

Calling a BAML function with an image input argument type (see image types)

The from_url and from_base64 methods create an Image object from a URL or from Base64-encoded data, respectively.

<CodeBlocks>

python
from baml_py import Image
from baml_client import b

async def test_image_input():
  # from URL
  res = await b.TestImageInput(
      img=Image.from_url(
          "https://upload.wikimedia.org/wikipedia/en/4/4d/Shrek_%28character%29.png"
      )
  )

  # Base64 image
  image_b64 = "iVBORw0K...."
  res = await b.TestImageInput(
    img=Image.from_base64("image/png", image_b64)
  )
typescript
import { b } from '../baml_client'
import { Image } from "@boundaryml/baml"
...

  // URL
  let res = await b.TestImageInput(
    Image.fromUrl('https://upload.wikimedia.org/wikipedia/en/4/4d/Shrek_%28character%29.png'),
  )

  // Base64
  const image_b64 = "iVBORw0K...."
  let res = await b.TestImageInput(
    Image.fromBase64('image/png', image_b64),
  )
  
go
package main

import (
    "context"
    
    b "example.com/myproject/baml_client"
)

func testImageInput() error {
    ctx := context.Background()
    
    // From URL
    img, err := b.NewImageFromUrl(
        "https://upload.wikimedia.org/wikipedia/en/4/4d/Shrek_%28character%29.png",
        nil,
    )
    if err != nil {
        return err
    }
    
    result, err := b.TestImageInput(ctx, img)
    if err != nil {
        return err
    }

    // Base64 image
    imageB64 := "iVBORw0K...."
    img2, err := b.NewImageFromBase64(imageB64, stringPtr("image/png"))
    if err != nil {
        return err
    }
    
    result2, err := b.TestImageInput(ctx, img2)
    if err != nil {
        return err
    }
    
    return nil
}

// Helper function for string pointer
func stringPtr(s string) *string {
    return &s
}
ruby
we're working on it!
rust
use myproject::baml_client::sync_client::B;
use myproject::baml_client::new_image_from_url;
use myproject::baml_client::new_image_from_base64;

fn test_image_input() {
    // From URL
    let image = new_image_from_url(
        "https://upload.wikimedia.org/wikipedia/en/4/4d/Shrek_%28character%29.png",
        None,
    );
    let res = B.TestImageInput.call(&image).unwrap();

    // Base64 image
    let image_b64 = "iVBORw0K....";
    let image = new_image_from_base64(image_b64, Some("image/png"));
    let res = B.TestImageInput.call(&image).unwrap();
}
</CodeBlocks>

Audio

Calling functions that have audio types. See audio types

<CodeBlocks>

python
from baml_py import Audio
from baml_client import b

async def run():
  # from URL
  res = await b.TestAudioInput(
      audio=Audio.from_url(
          "https://actions.google.com/sounds/v1/emergency/beeper_emergency_call.ogg"
      )
  )

  # Base64
  b64 = "iVBORw0K...."
  res = await b.TestAudioInput(
    audio=Audio.from_base64("audio/ogg", b64)
  )
typescript
import { b } from '../baml_client'
import { Audio } from "@boundaryml/baml"
...

  // URL
  let res = await b.TestAudioInput(
    Audio.fromUrl('https://actions.google.com/sounds/v1/emergency/beeper_emergency_call.ogg'),
  )

  // Base64
  const audio_base64 = ".."
  let res = await b.TestAudioInput(
    Audio.fromBase64('audio/ogg', audio_base64),
  )
  
go
package main

import (
    "context"
    
    b "example.com/myproject/baml_client"
)

func testAudioInput() error {
    ctx := context.Background()
    
    // From URL
    aud, err := b.NewAudioFromUrl(
        "https://actions.google.com/sounds/v1/emergency/beeper_emergency_call.ogg",
        nil,
    )
    if err != nil {
        return err
    }
    
    result, err := b.TestAudioInput(ctx, aud)
    if err != nil {
        return err
    }

    // Base64 audio
    audioB64 := "iVBORw0K...."
    aud2, err := b.NewAudioFromBase64(audioB64, stringPtr("audio/ogg"))
    if err != nil {
        return err
    }
    
    result2, err := b.TestAudioInput(ctx, aud2)
    if err != nil {
        return err
    }
    
    return nil
}
ruby
we're working on it!
rust
use myproject::baml_client::sync_client::B;
use myproject::baml_client::new_audio_from_url;

fn test_audio_input() {
    // From URL
    let audio = new_audio_from_url(
        "https://actions.google.com/sounds/v1/emergency/beeper_emergency_call.ogg",
        None,
    );
    let res = B.TestAudioInput.call(&audio).unwrap();
}
</CodeBlocks>

Pdf

Calling functions that have pdf types. See pdf types

⚠️ Warning Pdf inputs must be provided as Base64 data (e.g. Pdf.from_base64). URL-based Pdf inputs are not currently supported. Additionally, Pdf inputs are only supported by models that explicitly allow document (Pdf) modalities, such as Gemini 2.x Flash/Pro or VertexAI Gemini. Make sure the client you select advertises Pdf support, otherwise your request will fail.

<CodeBlocks>

python
from baml_py import Pdf
from baml_client import b

async def run():
  # Base64 data
  b64 = "JVBERi0K...."
  res = await b.TestPdfInput(
    pdf=Pdf.from_base64("application/pdf", b64)
  )
typescript
import { b } from '../baml_client'
import { Pdf } from "@boundaryml/baml"
...

  // Base64
  const pdf_base64 = ".."
  let res = await b.TestPdfInput(
    Pdf.fromBase64('application/pdf', pdf_base64),
  )
  
go
package main

import (
    "context"
    
    b "example.com/myproject/baml_client"
)

func testPdfInput() error {
    ctx := context.Background()
    
    // Base64 PDF data
    pdfB64 := "JVBERi0K...."
    pdf, err := b.NewPDFFromBase64(pdfB64, nil)
    if err != nil {
        return err
    }
    
    result, err := b.TestPdfInput(ctx, pdf)
    if err != nil {
        return err
    }
    
    return nil
}
ruby
we're working on it!
rust
use myproject::baml_client::sync_client::B;
use myproject::baml_client::new_pdf_from_base64;

fn test_pdf_input() {
    // Base64 data
    let b64 = "JVBERi0K....";
    let pdf = new_pdf_from_base64(b64, None);
    let res = B.TestPdfInput.call(&pdf).unwrap();
}
</CodeBlocks>

Video

Calling functions that have video types. See video types

⚠️ Warning Video inputs require a model that supports video understanding (for example Gemini 2.x Flash/Pro). If your chosen model does not list video support your function call will return an error. When you supply a Video as a URL the URL is forwarded unchanged to the model; if the model cannot fetch remote content you must instead pass the bytes via Video.from_base64.

<CodeBlocks>

python
from baml_py import Video
from baml_client import b

async def run():
  # from URL
  res = await b.TestVideoInput(
      video=Video.from_url(
          "https://example.com/sample.mp4"
      )
  )

  # Base64
  b64 = "AAAAGGZ0eXBpc29t...."
  res = await b.TestVideoInput(
    video=Video.from_base64("video/mp4", b64)
  )
typescript
import { b } from '../baml_client'
import { Video } from "@boundaryml/baml"
...

  // URL
  let res = await b.TestVideoInput(
    Video.fromUrl('https://example.com/sample.mp4'),
  )

  // Base64
  const video_base64 = ".."
  let res = await b.TestVideoInput(
    Video.fromBase64('video/mp4', video_base64),
  )
  
go
package main

import (
    "context"
    
    b "example.com/myproject/baml_client"
)

func testVideoInput() error {
    ctx := context.Background()
    
    // From URL
    vid, err := b.NewVideoFromUrl("https://example.com/sample.mp4", nil)
    if err != nil {
        return err
    }
    
    result, err := b.TestVideoInput(ctx, vid)
    if err != nil {
        return err
    }

    // Base64 video
    videoB64 := "AAAAGGZ0eXBpc29t...."
    vid2, err := b.NewVideoFromBase64(videoB64, stringPtr("video/mp4"))
    if err != nil {
        return err
    }
    
    result2, err := b.TestVideoInput(ctx, vid2)
    if err != nil {
        return err
    }
    
    return nil
}
ruby
we're working on it!
rust
use myproject::baml_client::sync_client::B;
use myproject::baml_client::new_video_from_url;

fn test_video_input() {
    // From URL
    let video = new_video_from_url("https://example.com/sample.mp4", None);
    let res = B.TestVideoInput.call(&video).unwrap();
}
</CodeBlocks>

Controlling URL Resolution

By default, BAML automatically handles URL-to-base64 conversion based on what each provider supports. However, you can customize this behavior using the media_url_handler configuration:

Example: Optimizing for Performance

If you're using Anthropic and want to avoid the latency of URL fetching:

baml
client<llm> FastClaude {
  provider anthropic
  options {
    model "claude-3-5-sonnet-20241022"
    api_key env.ANTHROPIC_API_KEY
    media_url_handler {
      image "send_url"       // Anthropic can fetch URLs directly
      pdf "send_base64"      // Required by Anthropic API (As of October 2025)
    }
  }
}

Example: Working with Google Cloud Storage

When using Google AI with images stored in GCS:

baml
client<llm> GeminiWithGCS {
  provider google-ai
  options {
    model "gemini-1.5-pro"
    api_key env.GOOGLE_API_KEY
    media_url_handler {
      image "send_base64_unless_google_url"  // Preserve gs:// URLs, convert others
    }
  }
}

Example: Ensuring Compatibility

For maximum compatibility across providers:

baml
client<llm> CompatibleClient {
  provider openai
  options {
    model "gpt-4o"
    api_key env.OPENAI_API_KEY
    media_url_handler {
      image "send_base64"    // Ensure images are embedded
      audio "send_base64"    // OpenAI requires base64 for audio
      pdf "send_base64"      // Embed PDFs for reliability
    }
  }
}

Available Modes

  1. send_url - Allows providers to fetch URLs reducing payload size
  2. send_base64 - Embedding content avoids external dependencies
  3. send_url_add_mime_type - Required for proper media handling for some providers (if the mime type is not provided, it will be downloaded to determine the mime type)
  4. send_base64_unless_google_url - Preserves Google Cloud Storage URLs for Google providers

See the provider documentation for provider-specific defaults and requirements.