diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md
index fca941acd..7fe010e5f 100644
--- a/docs/contributing/model/transcription.md
+++ b/docs/contributing/model/transcription.md
@@ -251,6 +251,7 @@ No extra registration is required beyond having your model class available via t
 - Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
 - Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`.
 - Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
+- Qwen3-Omni multimodal with audio embeddings: [vllm/model_executor/models/qwen3_omni_moe_thinker.py](../../../vllm/model_executor/models/qwen3_omni_moe_thinker.py)

 ## Test with the API

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index a96abd891..e07e17ec5 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -781,6 +781,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
 | `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ |
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
 | `Qwen3ASRForConditionalGeneration` | Qwen3-ASR | `Qwen/Qwen3-ASR-1.7B`, etc. | | ✅︎ |
+| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, etc. | | ✅︎ |
 | `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ |
 | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | |
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 93a17f0c8..b06503031 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -24,7 +24,7 @@
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
 from functools import partial
-from typing import Any
+from typing import Any, Literal, cast

 import numpy as np
 import torch
@@ -48,8 +48,9 @@ from transformers import __version__ as TRANSFORMERS_VERSION
 # isort: on

 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import VllmConfig
+from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
 from vllm.model_executor.layers.attention.mm_encoder_attention import (
@@ -79,6 +80,7 @@ from vllm.multimodal.processing.processor import (
     PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.processor import cached_processor_from_config
 from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .interfaces import (
@@ -86,6 +88,7 @@ from .interfaces import (
     SupportsMRoPE,
     SupportsMultiModal,
     SupportsPP,
+    SupportsTranscription,
 )
 from .qwen2_5_omni_thinker import (
     Qwen2_5OmniAudioFeatureInputs,
@@ -110,6 +113,29 @@ from .vision import get_vit_attn_backend

 logger = init_logger(__name__)

+# Speech input languages supported by Qwen3-Omni
+# From: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct
+ISO639_1_SUPPORTED_LANGS = {
+    "en": "English",
+    "zh": "Chinese",
+    "ko": "Korean",
+    "ja": "Japanese",
+    "de": "German",
+    "ru": "Russian",
+    "it": "Italian",
+    "fr": "French",
+    "es": "Spanish",
+    "pt": "Portuguese",
+    "ms": "Malay",
+    "nl": "Dutch",
+    "id": "Indonesian",
+    "tr": "Turkish",
+    "vi": "Vietnamese",
+    "yue": "Cantonese",
+    "ar": "Arabic",
+    "ur": "Urdu",
+}
+

 def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
     input_lengths_leave = input_lengths % 100
@@ -1572,6 +1598,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
     SupportsPP,
     SupportsMRoPE,
     Qwen3OmniMoeConditionalGenerationMixin,
+    SupportsTranscription,
 ):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
@@ -1593,6 +1620,8 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         ],
     }

+    supported_languages = ISO639_1_SUPPORTED_LANGS
+
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
@@ -2085,6 +2114,77 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
             total_tokens = num_video + audio_len
         return np.concatenate(pos_ids_list, axis=1), total_tokens

+    @classmethod
+    def get_speech_to_text_config(
+        cls, model_config: ModelConfig, task_type: str
+    ) -> SpeechToTextConfig:
+        processor = cached_processor_from_config(
+            model_config, processor_cls=Qwen3OmniMoeProcessor
+        )
+        return SpeechToTextConfig(
+            max_audio_clip_s=processor.feature_extractor.chunk_length,
+            sample_rate=processor.feature_extractor.sampling_rate,
+            min_energy_split_window_size=None,
+        )
+
+    @classmethod
+    def get_generation_prompt(
+        cls,
+        audio: np.ndarray,
+        stt_config: SpeechToTextConfig,
+        model_config: ModelConfig,
+        language: str | None,
+        task_type: Literal["transcribe", "translate"],
+        request_prompt: str,
+        to_language: str | None,
+    ) -> PromptType:
+        """
+        Construct a transcription/translation prompt for Qwen3-Omni.
+        """
+        # "Transcribe this audio [into <language>]" | for transcription
+        # "Translate this audio [from <language> into <to_language>]" | for translation
+        instruction = "Transcribe" if task_type == "transcribe" else "Translate"
+        instruction += " this audio"
+
+        # Default to_language to English for translation
+        if task_type == "translate" and to_language is None:
+            to_language = "en"
+
+        # Get full language names from the supported_languages mapping
+        full_lang_name = cls.supported_languages.get(language, "")
+        full_lang_name_to = cls.supported_languages.get(to_language, "")
+
+        if task_type == "transcribe" and full_lang_name:
+            instruction += f" into {full_lang_name}"
+        elif task_type == "translate":
+            if full_lang_name:
+                instruction += f" from {full_lang_name}"
+            if full_lang_name_to:
+                instruction += f" into {full_lang_name_to}"
+
+        instruction += "."
+
+        if request_prompt:
+            instruction += f" {request_prompt}"
+
+        processor = cached_processor_from_config(
+            model_config, processor_cls=Qwen3OmniMoeProcessor
+        )
+        # Audio placeholder format: <|audio_start|><|audio_pad|><|audio_end|>
+        audio_placeholder = "<|audio_start|><|audio_pad|><|audio_end|>"
+        user_content = f"{audio_placeholder}{instruction}"
+
+        messages = [{"role": "user", "content": user_content}]
+        prompt = processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+        audio_data = (audio, stt_config.sample_rate)
+        prompts_dict = {"multi_modal_data": {"audio": audio_data}, "prompt": prompt}
+        return cast(PromptType, prompts_dict)
+
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
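For an end-to-end check of the transcription path added above, a request can be sent to a running vLLM OpenAI-compatible server. This is a minimal sketch, not part of the change itself: the serve command, the audio file name, and the chosen language code are illustrative placeholders.

# Smoke test for Qwen3-Omni transcription via the OpenAI-compatible API.
# Assumes the server was started with something like:
#   vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct
from openai import OpenAI

client = OpenAI(
    api_key="EMPTY",                      # no key check unless the server sets one
    base_url="http://localhost:8000/v1",  # local vLLM OpenAI-compatible server
)

# "sample_speech.wav" is a placeholder path to any local audio file.
with open("sample_speech.wav", "rb") as f:
    result = client.audio.transcriptions.create(
        model="Qwen/Qwen3-Omni-30B-A3B-Instruct",
        file=f,
        language="en",  # any ISO 639-1 code listed in ISO639_1_SUPPORTED_LANGS
    )

print(result.text)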