Back to Mistral Rs

Multimodal streaming with combined image and audio inputs

docs/src/content/docs/examples/rust/models/multimodal.md

0.8.10 · 2.2 KB
Original Source
<!-- generated by docs/scripts/render_examples.py; edit the source example instead -->

Multimodal streaming with combined image and audio inputs.

Run with: `cargo run --release --example multimodal -p mistralrs`

rust
//! Multimodal streaming with combined image and audio inputs.
//!
//! Run with: `cargo run --release --example multimodal -p mistralrs`

use std::io::Write;

use anyhow::Result;
use mistralrs::{
    AudioInput, ChatCompletionChunkResponse, ChunkChoice, Delta, MultimodalMessages,
    MultimodalModelBuilder, Response, TextMessageRole,
};

#[tokio::main]
async fn main() -> Result<()> {
    let model = MultimodalModelBuilder::new("microsoft/Phi-4-multimodal-instruct")
        .with_logging()
        .build()
        .await?;

    let audio_bytes =
        reqwest::get("https://upload.wikimedia.org/wikipedia/commons/4/42/Bird_singing.ogg")
            .await?
            .bytes()
            .await?
            .to_vec();
    let audio = AudioInput::from_bytes(&audio_bytes)?;

    let image_bytes =
        reqwest::get("https://www.allaboutbirds.org/guide/assets/og/528129121-1200px.jpg")
            .await?
            .bytes()
            .await?
            .to_vec();
    let image = image::load_from_memory(&image_bytes)?;

    let messages = MultimodalMessages::new().add_multimodal_message(
        TextMessageRole::User,
        "Describe in detail what is happening.",
        vec![image],
        vec![audio],
        vec![],
    );

    let mut stream = model.stream_chat_request(messages).await?;

    while let Some(chunk) = stream.next().await {
        if let Response::Chunk(ChatCompletionChunkResponse { choices, .. }) = chunk {
            if let Some(ChunkChoice {
                delta:
                    Delta {
                        content: Some(content),
                        ..
                    },
                ..
            }) = choices.first()
            {
                print!("{content}");
                std::io::stdout().flush()?;
            };
        } else {
            // Handle errors
        }
    }
    Ok(())
}

Source: mistralrs/examples/models/multimodal/main.rs