Back to Mistral Rs

online_calibration

docs/src/content/docs/examples/python/online-calibration.md

0.8.13 · 1.3 KB
Original Source
<!-- generated by docs/scripts/render_examples.py; edit the source example instead -->

Runnable Python SDK example: `online_calibration`.

<!-- needs-header -->
python
from mistralrs import Runner, Which, ChatCompletionRequest

# Tunables for this example, kept in one place so they are easy to change.
MODEL_ID = "google/gemma-4-E4B-it"
PROMPT = "Explain how a hash map works, briefly."
CALIBRATION_REQUESTS = 8

# Build the runner for MODEL_ID, requesting Q4K in-situ quantization.
model_spec = Which.Plain(model_id=MODEL_ID)
runner = Runner(which=model_spec, in_situ_quant="Q4K")

# A single short chat request, reused for the calibration traffic and for the
# final generation after the calibration has been applied.
user_turn = {"role": "user", "content": PROMPT}
request = ChatCompletionRequest(
    model="default",
    messages=[user_turn],
    max_tokens=64,
)

# Collect activation statistics while serving normally (~15% decode overhead while on).
runner.begin_calibration()
for _ in range(CALIBRATION_REQUESTS):
    # Responses are discarded here; the requests only generate traffic.
    runner.send_chat_completion_request(request)

# Report how much data the calibration pass has gathered so far.
status = runner.calibration_status()
tracked_layers = status.layers_tracking
all_layers = status.layers
rows_seen = status.total_rows
print(f"Collecting on {tracked_layers}/{all_layers} layers, {rows_seen} token rows seen")

# Requantize from the source weights with the traffic-derived importance matrix and
# hot-swap each layer. The optional path also saves the imatrix for reuse.
runner.apply_calibration(save_cimatrix="traffic.cimatrix")

# Send the same request once more, after apply_calibration, and print the reply text.
res = runner.send_chat_completion_request(request)
first_choice = res.choices[0]
print(first_choice.message.content)

Source: examples/python/online_calibration.py