[Bugfix] Fix Voxtral streaming slot_mapping (#33073)

Signed-off-by: NickLucche <nlucches@redhat.com>
2026-01-26 19:40:40 +01:00
parent 67fe677c53
commit 19ab0f7ce5
1 changed files with 40 additions and 0 deletions
--- a/vllm/model_executor/models/whisper_causal.py
+++ b/vllm/model_executor/models/whisper_causal.py
@@ -105,6 +105,7 @@ def create_whisper_attention_backend_with_block_pooling(
 ) -> type[AttentionBackend]:
    prefix = "WhisperCausalAttentionWithBlockPooling_"
    underlying_builder = underlying_attn_backend.get_builder_cls()
+    underlying_impl = underlying_attn_backend.get_impl_cls()

    class WhisperCausalAttentionWithBlockPoolingBuilder(underlying_builder):  # type: ignore
        def __init__(
@@ -151,6 +152,43 @@ def create_whisper_attention_backend_with_block_pooling(
                common_prefix_len, new_common_attn_metadata, fast_build
            )

+    # NOTE: We need a custom impl so we can use the transformed slot_mapping
+    # computed by `WhisperCausalAttentionWithBlockPoolingBuilder` instead of
+    # the one from `forward_context.slot_mapping` (gpu_model_runner).
+    # This follows the same pattern as CrossAttentionImpl.
+    class WhisperCausalAttentionWithBlockPoolingImpl(underlying_impl):  # type: ignore[valid-type,misc]
+        def forward(
+            self,
+            layer: torch.nn.Module,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            kv_cache: torch.Tensor,
+            attn_metadata: AttentionMetadata,
+            output: torch.Tensor | None = None,
+            output_scale: torch.Tensor | None = None,
+            output_block_scale: torch.Tensor | None = None,
+        ) -> torch.Tensor:
+            if (
+                not underlying_attn_backend.forward_includes_kv_cache_update
+                and attn_metadata is not None
+            ):
+                self.do_kv_cache_update(
+                    layer, key, value, kv_cache, attn_metadata.slot_mapping
+                )
+
+            return super().forward(
+                layer,
+                query,
+                key,
+                value,
+                kv_cache,
+                attn_metadata,
+                output,
+                output_scale,
+                output_block_scale,
+            )
+
    if not issubclass(underlying_attn_backend, FlashAttentionBackend):
        raise NotImplementedError(
            f"{underlying_attn_backend} is not yet supported."
@@ -163,6 +201,7 @@ def create_whisper_attention_backend_with_block_pooling(
        attention_backend_cls=underlying_attn_backend,
        overrides={
            "get_builder_cls": lambda: WhisperCausalAttentionWithBlockPoolingBuilder,
+            "get_impl_cls": lambda: WhisperCausalAttentionWithBlockPoolingImpl,
            "get_kv_cache_shape": lambda num_blocks,
            block_size,
            num_kv_heads,
@@ -175,6 +214,7 @@ def create_whisper_attention_backend_with_block_pooling(
                num_kv_heads // block_pool_size,
                head_size,
            ),  # TODO: generalize to other backends
+            "forward_includes_kv_cache_update": True,
        },
    )