[Bugfix] Zero-init MLA attention output buffers to prevent NaN from CUDA graph padding (#37442)
Signed-off-by: Elvir Crncevic <elvircrn@gmail.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
(cherry picked from commit ef2c4f778d)
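
Context for the fix, in miniature: under CUDA graph padding, batch slots past the real requests have seq_lens=0, so the decode kernel never writes those output rows. When the output buffer comes from an uninitialized allocation, whatever bytes sit in the padding rows (possibly NaN) leak into any per-tensor reduction taken over the full padded batch. The following is an illustrative sketch only, not code from this change; the shapes are made up and an explicit NaN fill stands in for uninitialized memory:

import torch

# Hypothetical sizes: 4 padded slots, only 2 real decode requests.
B_padded, num_real, heads, d = 4, 2, 8, 16

# Emulate an uninitialized output buffer whose padding rows happen to hold NaN.
out = torch.full((B_padded, heads, d), float("nan"))
out[:num_real] = torch.randn(num_real, heads, d)   # the kernel writes only real rows

print(out.abs().amax())        # nan -> contaminates a downstream per-tensor scale

# Zero-initializing the buffer keeps the unwritten padding rows inert.
out_safe = torch.zeros(B_padded, heads, d)
out_safe[:num_real] = torch.randn(num_real, heads, d)
print(out_safe.abs().amax())   # finite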
@@ -162,6 +162,11 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
         # Share workspace buffer across all executions
         self._workspace = g_sm100_workspace
 
+        # Pre-allocated output buffer, lazily sized on first call.
+        # Zero-init once to prevent NaN in padding slots (seq_lens=0)
+        # from contaminating downstream per-tensor reductions.
+        self._decode_out: torch.Tensor | None = None
+
     def _sm100_cutlass_mla_decode(
         self,
         q_nope: torch.Tensor,
@@ -218,7 +223,15 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
             if is_quantized_kv_cache(self.kv_cache_dtype)
             else q_nope.dtype
         )
-        out = q_nope.new_empty((B_q, MAX_HEADS, D_latent), dtype=dtype)
+        # Reuse pre-allocated zero-init output buffer to avoid a memset
+        # kernel on every CUDA graph replay.
+        if (
+            self._decode_out is None
+            or self._decode_out.shape[0] < B_q
+            or self._decode_out.dtype != dtype
+        ):
+            self._decode_out = q_nope.new_zeros((B_q, MAX_HEADS, D_latent), dtype=dtype)
+        out = self._decode_out[:B_q]
         lse = (
             torch.empty((B_q, MAX_HEADS), dtype=torch.float32, device=q_nope.device)
             if self.need_to_return_lse_for_decode
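
The hunk above replaces the per-call new_empty with a cached, zero-initialized buffer that is reallocated only when the batch grows or the dtype changes, so steady-state CUDA graph replays neither re-allocate nor re-zero. A self-contained sketch of that pattern with assumed names (DecodeOutputCache is not part of vLLM):

import torch

class DecodeOutputCache:
    # Illustrative lazily sized, zero-initialized output buffer.
    def __init__(self) -> None:
        self._buf: torch.Tensor | None = None

    def get(
        self, batch: int, heads: int, d: int,
        dtype: torch.dtype, device: torch.device,
    ) -> torch.Tensor:
        # Reallocate (and zero) only if the cached buffer is too small
        # or has the wrong dtype; otherwise reuse the existing storage.
        if (
            self._buf is None
            or self._buf.shape[0] < batch
            or self._buf.dtype != dtype
        ):
            self._buf = torch.zeros(batch, heads, d, dtype=dtype, device=device)
        return self._buf[:batch]

cache = DecodeOutputCache()
a = cache.get(4, 8, 16, torch.float32, torch.device("cpu"))
b = cache.get(2, 8, 16, torch.float32, torch.device("cpu"))
assert a.data_ptr() == b.data_ptr()   # second call reuses the same storage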
@@ -21,6 +21,7 @@ from vllm.v1.attention.backend import (
     AttentionLayer,
     AttentionType,
     MultipleOf,
+    is_quantized_kv_cache,
 )
 from vllm.v1.attention.backends.utils import KVCacheLayoutType
 
@@ -151,6 +152,11 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
         self.bmm1_scale: float | None = None
         self.bmm2_scale: float | None = None
 
+        # Pre-allocated output buffer, lazily sized on first call.
+        # Zero-init once to prevent NaN in padding slots (seq_lens=0)
+        # from contaminating downstream per-tensor reductions.
+        self._decode_out: torch.Tensor | None = None
+
     def forward_mqa(
         self,
         q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
@@ -181,6 +187,37 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
         if self.bmm2_scale is None:
             self.bmm2_scale = layer._v_scale_float
 
+        # Reuse pre-allocated zero-init output buffer to avoid a memset
+        # kernel on every CUDA graph replay.
+        # q is 4D: (batch, q_len_per_req, num_heads, head_dim)
+        # FlashInfer has a bug where out= validation hardcodes 3D shape
+        # (batch, num_heads, kv_lora_rank), but the kernel writes 4D
+        # (batch, q_len, num_heads, kv_lora_rank) when q_len > 1.
+        # So we can only pass out= for single-token decode (q_len == 1).
+        # For q_len > 1, we zero padding slots after the kernel returns.
+        # TODO: upstream fix to FlashInfer
+        B, q_len_per_req = q.shape[0], q.shape[1]
+        out_kwargs: dict[str, torch.Tensor] = {}
+        if q_len_per_req == 1:
+            dtype = (
+                torch.bfloat16
+                if is_quantized_kv_cache(self.kv_cache_dtype)
+                else q.dtype
+            )
+            if (
+                self._decode_out is None
+                or self._decode_out.shape[0] < B
+                or self._decode_out.dtype != dtype
+            ):
+                self._decode_out = torch.zeros(
+                    B,
+                    q.shape[2],
+                    self.kv_lora_rank,
+                    dtype=dtype,
+                    device=q.device,
+                )
+            out_kwargs["out"] = self._decode_out[:B]
+
         o = trtllm_batch_decode_with_kv_cache_mla(
             query=q,
             kv_cache=kv_c_and_k_pe_cache.unsqueeze(1),
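
Because the out= path is only safe for single-token decode, the hunk above builds the out= keyword argument conditionally; the next hunk splats it into the library call. A generic sketch of that idiom, with placeholder names unrelated to FlashInfer:

import torch

def fake_kernel(x: torch.Tensor, out: torch.Tensor | None = None) -> torch.Tensor:
    # Stand-in for a library call that accepts an optional out= buffer.
    result = x * 2
    if out is not None:
        out.copy_(result)
        return out
    return result

def dispatch(x: torch.Tensor, cached: torch.Tensor | None, single_token: bool) -> torch.Tensor:
    out_kwargs: dict[str, torch.Tensor] = {}
    if single_token and cached is not None:
        # Forward out= only on the path where the preconditions hold.
        out_kwargs["out"] = cached[: x.shape[0]]
    return fake_kernel(x, **out_kwargs)

x = torch.randn(2, 4)
buf = torch.zeros(8, 4)
y = dispatch(x, buf, single_token=True)
assert y.data_ptr() == buf.data_ptr()   # result landed in the reusable buffer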
@@ -193,8 +230,15 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
             max_seq_len=attn_metadata.max_seq_len,
             bmm1_scale=self.bmm1_scale,
             bmm2_scale=self.bmm2_scale,
+            **out_kwargs,
         )
 
+        # For q_len > 1, we can't pass out= so we work around by zeroing padding slots
+        if not out_kwargs:
+            num_real = attn_metadata.num_decodes
+            if num_real < o.shape[0]:
+                o[num_real:] = 0
+
         # Flatten the output for consistent shape
         o = o.view(-1, o.shape[-2], o.shape[-1])
 
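
On the multi-token path, where out= cannot be forwarded, the hunk clears the padding slots after the kernel returns instead. A standalone sketch of that fallback (names and shapes are placeholders, not the vLLM code):

import torch

def zero_padding_rows(o: torch.Tensor, num_real: int) -> torch.Tensor:
    # Rows past the real request count were never written by the kernel;
    # clear them so later per-tensor reductions stay finite.
    if num_real < o.shape[0]:
        o[num_real:] = 0
    return o

o = torch.full((4, 2, 8, 16), float("nan"))   # emulate padded, unwritten rows
o[:2] = torch.randn(2, 2, 8, 16)              # kernel output for 2 real requests
zero_padding_rows(o, num_real=2)
assert torch.isfinite(o).all()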