From 2a8d84e66d19014c44155ca1ee79b4aa0227734d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 3 Feb 2026 13:49:49 +0000 Subject: [PATCH] Fix Gemma3n audio encoder for Transformers v5 (#33673) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/gemma3n_mm.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 1460a4586..8b5e7b8bb 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -621,10 +621,15 @@ class Gemma3nForConditionalGeneration( # Run on padded features to enable batching input_features = audio_input["input_features_padded"].squeeze(1) input_features_mask = audio_input["input_features_mask"].squeeze(1) - audio_outputs, audio_mask = self.audio_tower( - input_features, ~input_features_mask - ) - audio_features = self.embed_audio(inputs_embeds=audio_outputs) + audio_outputs = self.audio_tower(input_features, ~input_features_mask) + if isinstance(audio_outputs, tuple): + # Transformers v4 + audio_encodings, audio_mask = audio_outputs + else: + # Transformers v5 + audio_encodings = audio_outputs.last_hidden_state + audio_mask = audio_outputs.audio_mel_mask + audio_features = self.embed_audio(inputs_embeds=audio_encodings) # The Gemma3nProcessor expects all audio will be 30s in length and # inserts 188 audio soft tokens into the text to account for this.