diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index ee2bb837a..f53a0e9bc 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -353,6 +353,39 @@ class Qwen2_5OmniThinkerProcessingInfo(
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None, "video": None}
 
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int] | None = None,
+    ) -> Mapping[str, int] | None:
+        mm_counts = mm_counts or {}
+        requested_modalities = {m for m, c in mm_counts.items() if c > 0}
+        mm_max_tokens: dict[str, int] = {}
+
+        if requested_modalities & {"image", "video"}:
+            vl_tokens = Qwen2_5_VLProcessingInfo.get_mm_max_tokens_per_item(
+                self,
+                seq_len=seq_len,
+                mm_counts=mm_counts,
+            )
+            mm_max_tokens.update(
+                {
+                    m: vl_tokens[m]
+                    for m in ["image", "video"]
+                    if m in requested_modalities
+                }
+            )
+
+        if "audio" in requested_modalities:
+            audio_tokens = Qwen2AudioProcessingInfo.get_mm_max_tokens_per_item(
+                self,
+                seq_len=seq_len,
+                mm_counts=mm_counts,
+            )
+            mm_max_tokens["audio"] = audio_tokens["audio"]
+
+        return mm_max_tokens
+
 
 class Qwen2_5OmniThinkerDummyInputsBuilder(
     BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 053e8bb85..d125570a1 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -179,6 +179,26 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None}
 
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int] | None = None,
+    ) -> Mapping[str, int]:
+        mm_counts = mm_counts or {}
+        if mm_counts.get("audio", 0) <= 0:
+            return {}
+
+        feature_extractor = self.get_feature_extractor()
+        chunk_length = min(feature_extractor.chunk_length, 30)
+        audio_len = int(chunk_length * feature_extractor.sampling_rate)
+        hop_length = feature_extractor.hop_length
+        max_mel_seq_len = audio_len // hop_length
+
+        input_lengths = torch.tensor([max_mel_seq_len], dtype=torch.long)
+        _, output_lengths = _get_feat_extract_output_lengths(input_lengths)
+
+        return {"audio": int(output_lengths.item())}
+
 
 class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 1e6348b72..00335b88b 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -1163,6 +1163,39 @@ class Qwen3OmniMoeThinkerProcessingInfo(
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None, "video": None}
 
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int] | None = None,
+    ) -> Mapping[str, int] | None:
+        mm_counts = mm_counts or {}
+        requested_modalities = {m for m, c in mm_counts.items() if c > 0}
+        mm_max_tokens: dict[str, int] = {}
+
+        if requested_modalities & {"image", "video"}:
+            vl_tokens = Qwen2_5_VLProcessingInfo.get_mm_max_tokens_per_item(
+                self,
+                seq_len=seq_len,
+                mm_counts=mm_counts,
+            )
+            mm_max_tokens.update(
+                {
+                    m: vl_tokens[m]
+                    for m in ["image", "video"]
+                    if m in requested_modalities
+                }
+            )
+
+        if "audio" in requested_modalities:
+            audio_tokens = Qwen2AudioProcessingInfo.get_mm_max_tokens_per_item(
+                self,
+                seq_len=seq_len,
+                mm_counts=mm_counts,
+            )
+            mm_max_tokens["audio"] = audio_tokens["audio"]
+
+        return mm_max_tokens
+
 
 Qwen3OmniMoeThinkerDummyInputsBuilder = Qwen2_5OmniThinkerDummyInputsBuilder