Back to Mistral Rs

Example demonstrating multi-model usage with mistral.rs Python bindings

docs/src/content/docs/examples/python/multi-model-example.md

0.8.107.1 KB
Original Source
<!-- generated by docs/scripts/render_examples.py; edit the source example instead -->

Example demonstrating multi-model usage with mistral.rs Python bindings.

This example shows how to:

  1. Load a model using Runner
  2. List available models
  3. Use model_id parameter to target specific models
  4. Manage default model selection
  5. Model loading/unloading operations

Models used:

  • Multimodal: google/gemma-4-E4B-it (MultimodalArchitecture.Gemma4)
  • Text: Qwen/Qwen3-4B (Architecture.Qwen3)
python
"""
Example demonstrating multi-model usage with mistral.rs Python bindings.

This example shows how to:
1. Load a model using Runner
2. List available models
3. Use model_id parameter to target specific models
4. Manage default model selection
5. Model loading/unloading operations

Models used:
- Multimodal: google/gemma-4-E4B-it (MultimodalArchitecture.Gemma4)
- Text: Qwen/Qwen3-4B (Architecture.Qwen3)
"""

from mistralrs import (
    Runner,
    Which,
    ChatCompletionRequest,
    Architecture,
    MultimodalArchitecture,
)


# Example 1: Using Runner with model_id parameter for multi-model operations
def example_runner_with_model_id():
    """Demonstrate using Runner with model_id in requests."""

    # Create a runner with Gemma 4 E4B multimodal model
    runner = Runner(
        which=Which.MultimodalPlain(
            model_id="google/gemma-4-E4B-it",
            arch=MultimodalArchitecture.Gemma4,
        ),
        in_situ_quant="Q4K",
    )

    # List available models
    model_ids = runner.list_models()
    print("Available models:", model_ids)

    # Get default model
    default_model = runner.get_default_model_id()
    print(f"Default model: {default_model}")

    # Send request with specific model_id
    messages = [{"role": "user", "content": "Hello, how are you?"}]
    request = ChatCompletionRequest(messages=messages, model="default")

    if model_ids:
        # Request to specific model
        response = runner.send_chat_completion_request(
            request=request, model_id=model_ids[0]
        )
        print(f"Response from {model_ids[0]}:", response.choices[0].message.content)

        # Request without model_id (uses default)
        response = runner.send_chat_completion_request(request=request)
        print("Response from default model:", response.choices[0].message.content)


# Example 2: Model management operations
def example_model_management():
    """Demonstrate model management operations."""

    runner = Runner(
        which=Which.Plain(
            model_id="Qwen/Qwen3-4B",
            arch=Architecture.Qwen3,
        ),
        in_situ_quant="Q4K",
    )

    # List models with their status
    print("Initial models with status:", runner.list_models_with_status())

    # Get default model
    current_default = runner.get_default_model_id()
    print(f"Current default model: {current_default}")

    # Check if a model is loaded
    model_ids = runner.list_models()
    if model_ids:
        is_loaded = runner.is_model_loaded(model_ids[0])
        print(f"Is {model_ids[0]} loaded? {is_loaded}")

    # In a multi-model setup, you could change the default
    if model_ids and len(model_ids) > 1:
        runner.set_default_model_id(model_ids[1])
        print(f"Changed default model to: {model_ids[1]}")


# Example 3: Model unloading and reloading
def example_unload_reload():
    """Demonstrate model unloading and reloading."""

    runner = Runner(
        which=Which.MultimodalPlain(
            model_id="google/gemma-4-E4B-it",
            arch=MultimodalArchitecture.Gemma4,
        ),
        in_situ_quant="Q4K",
    )

    model_ids = runner.list_models()
    if not model_ids:
        print("No models loaded")
        return

    model_id = model_ids[0]
    print(f"Initial status: {runner.list_models_with_status()}")

    # Unload the model to free memory
    # Note: This preserves the model configuration for later reload
    print(f"Unloading model: {model_id}")
    runner.unload_model(model_id)

    # Check status after unload
    print(f"Status after unload: {runner.list_models_with_status()}")
    print(f"Is {model_id} loaded? {runner.is_model_loaded(model_id)}")

    # Reload the model when needed
    print(f"Reloading model: {model_id}")
    runner.reload_model(model_id)

    # Check status after reload
    print(f"Status after reload: {runner.list_models_with_status()}")


# Example 4: Streaming with specific models
def example_streaming_with_models():
    """Demonstrate streaming responses from specific models."""

    runner = Runner(
        which=Which.Plain(
            model_id="Qwen/Qwen3-4B",
            arch=Architecture.Qwen3,
        ),
        in_situ_quant="Q4K",
    )

    messages = [{"role": "user", "content": "Tell me a short story"}]
    request = ChatCompletionRequest(messages=messages, model="default", stream=True)

    model_ids = runner.list_models()
    if model_ids:
        # Stream from specific model
        stream = runner.send_chat_completion_request(
            request=request, model_id=model_ids[0]
        )

        print(f"Streaming from {model_ids[0]}:")
        for chunk in stream:
            if chunk.choices[0].delta.content:
                print(chunk.choices[0].delta.content, end="", flush=True)
        print()  # New line after streaming


# Example 5: Multi-model setup with multimodal and text models
def example_multi_model_setup():
    """
    Example showing a real multi-model setup with multimodal and text models.

    This example loads:
    - Multimodal model: google/gemma-4-E4B-it
    - Text model: Qwen/Qwen3-4B
    """
    # Load a multimodal model first
    runner = Runner(
        which=Which.MultimodalPlain(
            model_id="google/gemma-4-E4B-it",
            arch=MultimodalArchitecture.Gemma4,
        ),
        in_situ_quant="Q4K",
    )

    print("Initial models:", runner.list_models())

    # Add a text model dynamically (if add_model is available)
    # runner.add_model(
    #     model_id="qwen",
    #     which=Which.Plain(
    #         model_id="Qwen/Qwen3-4B",
    #         arch=Architecture.Qwen3,
    #     ),
    #     in_situ_quant="Q4K",
    # )
    # print("After add_model:", runner.list_models())

    # Send a request to gemma
    messages = [{"role": "user", "content": "What is 2 + 2?"}]
    request = ChatCompletionRequest(messages=messages, model="default")
    response = runner.send_chat_completion_request(request)
    print(f"Gemma response: {response.choices[0].message.content}")


if __name__ == "__main__":
    print("=== Multi-Model Example 1: Runner with model_id ===")
    example_runner_with_model_id()
    print("\n" + "=" * 50 + "\n")

    print("=== Multi-Model Example 2: Model Management ===")
    example_model_management()
    print("\n" + "=" * 50 + "\n")

    print("=== Multi-Model Example 3: Unload/Reload ===")
    example_unload_reload()
    print("\n" + "=" * 50 + "\n")

    print("=== Multi-Model Example 4: Streaming ===")
    example_streaming_with_models()
    print("\n" + "=" * 50 + "\n")

    print("=== Multi-Model Example 5: Multi-Model Setup ===")
    example_multi_model_setup()

Source: examples/python/multi_model_example.py