diff --git a/vllm/entrypoints/openai/translations/speech_to_text.py b/vllm/entrypoints/openai/translations/speech_to_text.py
index e80d6b9a2..58bfb3e97 100644
--- a/vllm/entrypoints/openai/translations/speech_to_text.py
+++ b/vllm/entrypoints/openai/translations/speech_to_text.py
@@ -138,6 +138,9 @@ class OpenAISpeechToText(OpenAIServing):
         if not supports_transcription(self.model_cls):
             return
 
+        if getattr(self.model_cls, "skip_warmup_audio_preprocessing", False):
+            return
+
         try:
             warmup_start = time.perf_counter()
             logger.info("Warming up audio preprocessing libraries...")
@@ -150,9 +153,7 @@ class OpenAISpeechToText(OpenAIServing):
             _ = librosa.get_duration(y=dummy_audio, sr=self.asr_config.sample_rate)
 
             # Warm up mel-spectrogram computation with model-specific parameters
-            from vllm.transformers_utils.processor import (
-                cached_processor_from_config,
-            )
+            from vllm.transformers_utils.processor import cached_processor_from_config
 
             processor = cached_processor_from_config(self.model_config)
             feature_extractor = None
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 86ee98147..942d91e44 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -335,6 +335,9 @@ class VoxtralForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
 ):
     supported_languages = ISO639_1_SUPPORTED_LANGS
+    # transformers currently has limited support for MistralCommon backend
+    # and cached_get_processor. Let's skip until fixed
+    skip_warmup_audio_preprocessing = True
 
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
diff --git a/vllm/model_executor/models/voxtral_realtime.py b/vllm/model_executor/models/voxtral_realtime.py
index 82801b6eb..6c4d20d35 100644
--- a/vllm/model_executor/models/voxtral_realtime.py
+++ b/vllm/model_executor/models/voxtral_realtime.py
@@ -218,6 +218,9 @@ class VoxtralRealtimeBuffer:
 @support_torch_compile
 class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtime):
     requires_raw_input_tokens = True
+    # transformers currently has limited support for MistralCommon backend
+    # and cached_get_processor. Let's skip until fixed
+    skip_warmup_audio_preprocessing = True
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)