diff --git a/vllm/entrypoints/openai/translations/speech_to_text.py b/vllm/entrypoints/openai/translations/speech_to_text.py
index e80d6b9a2..58bfb3e97 100644
--- a/vllm/entrypoints/openai/translations/speech_to_text.py
+++ b/vllm/entrypoints/openai/translations/speech_to_text.py
@@ -138,6 +138,9 @@ class OpenAISpeechToText(OpenAIServing):
         if not supports_transcription(self.model_cls):
             return
 
+        if getattr(self.model_cls, "skip_warmup_audio_preprocessing", False):
+            return
+
         try:
             warmup_start = time.perf_counter()
             logger.info("Warming up audio preprocessing libraries...")
@@ -150,9 +153,7 @@ class OpenAISpeechToText(OpenAIServing):
             _ = librosa.get_duration(y=dummy_audio, sr=self.asr_config.sample_rate)
 
             # Warm up mel-spectrogram computation with model-specific parameters
-            from vllm.transformers_utils.processor import (
-                cached_processor_from_config,
-            )
+            from vllm.transformers_utils.processor import cached_processor_from_config
 
             processor = cached_processor_from_config(self.model_config)
             feature_extractor = None
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 86ee98147..942d91e44 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -335,6 +335,9 @@ class VoxtralForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
 ):
     supported_languages = ISO639_1_SUPPORTED_LANGS
+    # transformers currently has limited support for MistralCommon backend
+    # and cached_get_processor. Let's skip until fixed
+    skip_warmup_audio_preprocessing = True
 
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
diff --git a/vllm/model_executor/models/voxtral_realtime.py b/vllm/model_executor/models/voxtral_realtime.py
index 82801b6eb..6c4d20d35 100644
--- a/vllm/model_executor/models/voxtral_realtime.py
+++ b/vllm/model_executor/models/voxtral_realtime.py
@@ -218,6 +218,9 @@ class VoxtralRealtimeBuffer:
 @support_torch_compile
 class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtime):
     requires_raw_input_tokens = True
+    # transformers currently has limited support for MistralCommon backend
+    # and cached_get_processor. Let's skip until fixed
+    skip_warmup_audio_preprocessing = True
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)