[Bugfix]: avoid overriding audio/text kwargs (Qwen3-Omni) (#31790)

Signed-off-by: Zhongze Jiang <jiangzhongze.jzz@ant-intl.com>
This commit is contained in:
Jzz1943
2026-01-06 20:59:17 +08:00
committed by GitHub
parent 6444824873
commit 2c1a4f2488

View File

@@ -750,18 +750,20 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
# https://github.com/huggingface/transformers/pull/41473
mm_kwargs = dict(mm_kwargs)
tok_kwargs = dict(tok_kwargs)
mm_kwargs["audio_kwargs"] = dict(mm_kwargs.get("audio_kwargs") or {})
mm_kwargs["text_kwargs"] = dict(mm_kwargs.get("text_kwargs") or {})
if Version(TRANSFORMERS_VERSION) < Version("4.58.0"):
# Extract audio_sample_rate before restructuring
audio_sample_rate = mm_kwargs.pop("audio_sample_rate", None)
# move truncation to audio_kwargs level to avoid conflict
# with tok_kwargs
mm_kwargs["audio_kwargs"] = {
"truncation": mm_kwargs.pop("truncation", False)
}
mm_kwargs["text_kwargs"] = {
"truncation": tok_kwargs.pop("truncation", False)
}
mm_kwargs["audio_kwargs"].setdefault(
"truncation", mm_kwargs.pop("truncation", False)
)
mm_kwargs["text_kwargs"].setdefault(
"truncation", tok_kwargs.pop("truncation", False)
)
# Validate and conditionally pass audio_sample_rate
# WhisperFeatureExtractor has a fixed sampling rate, and vLLM's