[Voxtral models] Skip audio-preprocessing warm-up to avoid a confusing error message during warm-up (#33576)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
committed by
GitHub
parent
5c4f2dd6ef
commit
f0d5251715
@@ -138,6 +138,9 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
if not supports_transcription(self.model_cls):
|
||||
return
|
||||
|
||||
if getattr(self.model_cls, "skip_warmup_audio_preprocessing", False):
|
||||
return
|
||||
|
||||
try:
|
||||
warmup_start = time.perf_counter()
|
||||
logger.info("Warming up audio preprocessing libraries...")
|
||||
@@ -150,9 +153,7 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
_ = librosa.get_duration(y=dummy_audio, sr=self.asr_config.sample_rate)
|
||||
|
||||
# Warm up mel-spectrogram computation with model-specific parameters
|
||||
from vllm.transformers_utils.processor import (
|
||||
cached_processor_from_config,
|
||||
)
|
||||
from vllm.transformers_utils.processor import cached_processor_from_config
|
||||
|
||||
processor = cached_processor_from_config(self.model_config)
|
||||
feature_extractor = None
|
||||
|
||||
@@ -335,6 +335,9 @@ class VoxtralForConditionalGeneration(
|
||||
nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
|
||||
):
|
||||
supported_languages = ISO639_1_SUPPORTED_LANGS
|
||||
# transformers' currently has limited support for MistralCommon backend
|
||||
# and cached_get_processor. Let's skip until fixed
|
||||
skip_warmup_audio_preprocessing = True
|
||||
|
||||
packed_modules_mapping = {
|
||||
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
|
||||
|
||||
@@ -218,6 +218,9 @@ class VoxtralRealtimeBuffer:
|
||||
@support_torch_compile
|
||||
class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtime):
|
||||
requires_raw_input_tokens = True
|
||||
# transformers' currently has limited support for MistralCommon backend
|
||||
# and cached_get_processor. Let's skip until fixed
|
||||
skip_warmup_audio_preprocessing = True
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
|
||||
Reference in New Issue
Block a user