# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json

import pytest
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
from mistral_common.protocol.instruct.messages import UserMessage
from transformers import VoxtralForConditionalGeneration

from vllm.tokenizers.mistral import MistralTokenizer

from ....conftest import AudioTestAssets
from ....utils import RemoteOpenAIServer
from ...utils import check_logprobs_close
from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
from .vlm_utils import model_utils

MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
MISTRAL_FORMAT_ARGS = [
    "--tokenizer_mode",
    "mistral",
    "--config_format",
    "mistral",
    "--load_format",
    "mistral",
]


def _get_prompt(audio_assets: AudioTestAssets, question: str) -> list[int]:
    """Build a token-ID prompt via mistral_common for vLLM offline inference."""
    tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)
    audios = [
        Audio.from_file(str(asset.get_local_path()), strict=False)
        for asset in audio_assets
    ]
    audio_chunks = [
        AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
    ]
    # A single user message carrying every audio clip followed by the question.
    messages = [
        UserMessage(content=[*audio_chunks, TextChunk(text=question)]).to_openai()
    ]
    return tokenizer.apply_chat_template(messages=messages)


@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(
    vllm_runner,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
    run_multi_audio_test(
        vllm_runner,
        [(vllm_prompt, [a.audio_and_sample_rate for a in audio_assets])],  # type: ignore[list-item]
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tokenizer_mode="mistral",
    )


def test_online_serving(vllm_runner, audio_assets: AudioTestAssets):
    """Two-layer accuracy and serving validation using the Mistral format.

    1. Offline vLLM greedy output (runs first to avoid CUDA fork issues
       with multiprocessing - see vlm_utils/core.py).
    2. Online OpenAI-compatible API output must match the offline output,
       which validates that the serving path (chat template, audio encoding,
       tokenization) does not corrupt anything.

    The steps run sequentially so each releases the GPU before the next starts.
    """
    question = f"What's happening in these {len(audio_assets)} audio clips?"
    max_tokens = 10
    audio_data = [asset.audio_and_sample_rate for asset in audio_assets]
    vllm_prompt = _get_prompt(audio_assets, question)
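
    # Step 1: offline greedy generation; this is the reference output that
    # the online serving path must reproduce exactly.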
    with vllm_runner(
        MODEL_NAME,
        dtype="half",
        enforce_eager=True,
        tokenizer_mode="mistral",
        config_format="mistral",
        load_format="mistral",
        limit_mm_per_prompt={"audio": len(audio_assets)},
    ) as vllm_model:
        offline_outputs = vllm_model.generate_greedy(
            [vllm_prompt],
            max_tokens,
            audios=[audio_data],
        )
        offline_text = offline_outputs[0][1]
        assert offline_text, "Offline vLLM inference produced empty output"
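
    # Encode each asset as a base64 audio chunk in OpenAI chat format; the
    # format is pinned to "wav" so the payload advertises a concrete codec.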
    def _asset_to_openai_chunk(asset):
        audio = Audio.from_file(str(asset.get_local_path()), strict=False)
        audio.format = "wav"
        return AudioChunk.from_audio(audio).to_openai()

    messages = [
        {
            "role": "user",
            "content": [
                *[_asset_to_openai_chunk(a) for a in audio_assets],
                {"type": "text", "text": question},
            ],
        }
    ]
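
    # Step 2: stand up an OpenAI-compatible server with the same Mistral-format
    # flags; VLLM_AUDIO_FETCH_TIMEOUT bounds how long audio fetching may take.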
    server_args = [
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"audio": len(audio_assets)}),
        *MISTRAL_FORMAT_ARGS,
    ]
    with RemoteOpenAIServer(
        MODEL_NAME,
        server_args,
        env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"},
    ) as remote_server:
        client = remote_server.get_client()
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=max_tokens,
            temperature=0,
        )
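
        # temperature=0 makes the server decode greedily, so the online text
        # should reproduce the offline greedy output token-for-token.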
        assert len(completion.choices) == 1
        choice = completion.choices[0]
        assert choice.finish_reason == "length"
        assert choice.message.content == offline_text, (
            f"Online serving output does not match offline inference.\n"
            f" Online: {choice.message.content!r}\n"
            f" Offline: {offline_text!r}"
        )


@pytest.mark.xfail(strict=False)
def test_hf_reference(hf_runner, vllm_runner, audio_assets: AudioTestAssets):
    """Compare vLLM Mistral-format output against an HF Transformers reference.

    Instead of requiring an exact text match (which is brittle across
    attention backends), we compare per-token logprobs using the standard
    check_logprobs_close helper: when tokens diverge at a position, each
    runner's chosen token must appear in the other's top-k logprobs.

    Marked xfail(strict=False) so remaining edge-case mismatches
    don't block CI.
    """
    question = f"What's happening in these {len(audio_assets)} audio clips?"
    max_tokens = 10
    num_logprobs = 5
    audio_data = [asset.audio_and_sample_rate for asset in audio_assets]
    vllm_prompt = _get_prompt(audio_assets, question)
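
    # vLLM side: greedy decoding with top-k logprobs recorded at each step so
    # near-miss tokens can be matched against the HF reference below.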
    with vllm_runner(
        MODEL_NAME,
        dtype="half",
        enforce_eager=True,
        tokenizer_mode="mistral",
        config_format="mistral",
        load_format="mistral",
        limit_mm_per_prompt={"audio": len(audio_assets)},
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            [vllm_prompt],
            max_tokens,
            num_logprobs,
            audios=[audio_data],
        )
        assert vllm_outputs[0][1], "vLLM inference produced empty output"
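
    # HF reference side: the runner is patched for Voxtral's audio chat
    # format (see vlm_utils.model_utils.voxtral_patch_hf_runner).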
    with hf_runner(
        MODEL_NAME,
        dtype="half",
        auto_cls=VoxtralForConditionalGeneration,
    ) as hf_model:
        hf_model = model_utils.voxtral_patch_hf_runner(hf_model)
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            [question],
            max_tokens,
            num_logprobs,
            audios=[audio_data],
        )
        assert hf_outputs[0][1], "HF Transformers produced empty output"

    print(
        f"HF Reference Comparison\n"
        f" vLLM: {vllm_outputs[0][1]!r}\n"
        f" HF: {hf_outputs[0][1]!r}"
    )
    check_logprobs_close(
        outputs_0_lst=vllm_outputs,
        outputs_1_lst=hf_outputs,
        name_0="vllm",
        name_1="hf",
    )