From db5192fe419abf25ce5a7f2fad48fb9cd00b20cb Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Tue, 19 May 2026 03:35:15 +0000
Subject: [PATCH] Patch from Docker image's vLLM (0.20.2rc1) instead of newer
 upstream

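The nightly Docker image ships an older vLLM (0.20.2rc1) that does not
have NormGateLinear, breakable_cudagraph, etc. Basing these patches on
the Docker image's own copies of the files, instead of on newer
upstream, keeps them compatible with the vLLM that is actually
installed in the image.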

- deepseek_v4.py: based on the Docker image's file, plus the NVFP4
  weights mapper and the wo_a BF16 change
- deepseek_v4_attention.py: based on the Docker image's file, plus the
  BF16 inverse RoPE path and weights_proj quantization, with the
  QuantFP8/GroupShape imports removed
---
 vllm/patches/deepseek_v4.py           | 220 +++++++++++---------------
 vllm/patches/deepseek_v4_attention.py |  18 +--
 2 files changed, 95 insertions(+), 143 deletions(-)

diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py
index 6c99cdef..3af73362 100644
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -23,14 +23,11 @@ from vllm.model_executor.layers.deepseek_v4_attention import (
     DeepseekV4MLAModules,
     DeepseekV4MultiHeadLatentAttentionWrapper,
 )
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear
 from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
 from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import (
     fused_topk_bias,
 )
-from vllm.model_executor.layers.fused_moe.router.norm_gate_linear import (
-    NormGateLinear,
-)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -38,12 +35,6 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.mhc import (
-    HCHeadOp,
-    MHCFusedPostPreOp,
-    MHCPostOp,
-    MHCPreOp,
-)
 from vllm.model_executor.layers.quantization import (
     QuantizationConfig,
     QuantizationMethods,
@@ -758,23 +749,23 @@ class DeepseekV4MoE(nn.Module):
                 "deep_gemm_mega_moe for this checkpoint."
             )
 
-        # Fused RMSNorm + gate: owns both ffn_norm and the gate matmul.
-        self.norm_gate = NormGateLinear(
-            hidden_size=config.hidden_size,
-            num_experts=config.n_routed_experts,
-            rms_eps=config.rms_norm_eps,
-            prefix=f"{prefix}.norm_gate",
+        self.gate = GateLinear(
+            config.hidden_size,
+            config.n_routed_experts,
+            out_dtype=torch.float32,
+            bias=False,
+            prefix=f"{prefix}.gate",
         )
-        # Routing-side tensors live on ``norm_gate`` directly (not on the
-        # inner gate); they are initialized to None in NormGatedLinear and
-        # populated below depending on the MoE variant.
+        self.gate.e_score_correction_bias = None
+        self.gate.tid2eid = None
         is_hash_moe = extract_layer_index(prefix) < config.num_hash_layers
         self.hash_indices_dtype = torch.int64 if self.use_mega_moe else torch.int32
+
         if is_hash_moe:
             # hash MoE doesn't use e_score_correction_bias
             # Use randint instead of empty to avoid garbage values causing
             # invalid memory access in dummy mode (--load-format="dummy")
-            self.norm_gate.tid2eid = nn.Parameter(
+            self.gate.tid2eid = nn.Parameter(
                 torch.randint(
                     0,
                     config.n_routed_experts,
@@ -784,7 +775,7 @@ class DeepseekV4MoE(nn.Module):
                 requires_grad=False,
             )
         elif getattr(config, "topk_method", None) == "noaux_tc":
-            self.norm_gate.e_score_correction_bias = nn.Parameter(
+            self.gate.e_score_correction_bias = nn.Parameter(
                 torch.empty(config.n_routed_experts, dtype=torch.float32),
                 requires_grad=False,
             )
@@ -847,9 +838,10 @@ class DeepseekV4MoE(nn.Module):
         self.n_local_experts = config.n_routed_experts // self.tp_size
         self.experts_start_idx = self.tp_rank * self.n_local_experts
         self.experts_end_idx = self.experts_start_idx + self.n_local_experts
-        # We don't pass `gate` into FusedMoE
+
         self.experts = FusedMoE(
             shared_experts=self.shared_experts,
+            gate=self.gate,
             num_experts=config.n_routed_experts,
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
@@ -859,8 +851,8 @@ class DeepseekV4MoE(nn.Module):
             prefix=f"{prefix}.experts",
             scoring_func=self.scoring_func,
             routed_scaling_factor=self.routed_scaling_factor,
-            e_score_correction_bias=self.norm_gate.e_score_correction_bias,
-            hash_indices_table=self.norm_gate.tid2eid,
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+            hash_indices_table=self.gate.tid2eid,
             swiglu_limit=self.swiglu_limit,
             router_logits_dtype=torch.float32,
         )
@@ -868,40 +860,40 @@ class DeepseekV4MoE(nn.Module):
     def forward(
         self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None
     ) -> torch.Tensor:
-        if self.norm_gate.tid2eid is not None and input_ids is None:
+        if self.gate.tid2eid is not None and input_ids is None:
             raise ValueError("DeepSeek V4 hash MoE routing requires input_ids.")
 
         if not self.use_mega_moe:
             return self._forward_fused_moe(hidden_states, input_ids)
 
         org_shape = hidden_states.shape
-        normed_x, router_logits = self.norm_gate(hidden_states)
+        router_logits, _ = self.gate(hidden_states)
         topk_weights, topk_ids = fused_topk_bias(
-            hidden_states=normed_x,
+            hidden_states=hidden_states,
             gating_output=router_logits,
             scoring_func=self.scoring_func,
-            e_score_correction_bias=self.norm_gate.e_score_correction_bias.data
-            if self.norm_gate.e_score_correction_bias is not None
+            e_score_correction_bias=self.gate.e_score_correction_bias.data
+            if self.gate.e_score_correction_bias is not None
             else None,
             topk=self.n_activated_experts,
             renormalize=self.renormalize,
             indices_type=self.hash_indices_dtype,
             input_tokens=input_ids,
-            hash_indices_table=self.norm_gate.tid2eid,
+            hash_indices_table=self.gate.tid2eid,
             routed_scaling_factor=self.routed_scaling_factor,
         )
         activation_clamp = (
             float(self.swiglu_limit) if self.swiglu_limit is not None else None
         )
         final_hidden_states = self.experts(
-            normed_x,
+            hidden_states,
             topk_weights,
             topk_ids,
             activation_clamp=activation_clamp,
         )
 
         if self.shared_experts is not None:
-            shared_output = self.shared_experts(normed_x)
+            shared_output = self.shared_experts(hidden_states)
             final_hidden_states += shared_output
 
         return final_hidden_states.view(org_shape)
@@ -909,14 +901,21 @@ class DeepseekV4MoE(nn.Module):
     def _forward_fused_moe(
         self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None
     ) -> torch.Tensor:
-        assert not self.experts.is_internal_router
         org_shape = hidden_states.shape
-        normed_x, router_logits = self.norm_gate(hidden_states)
-        final_hidden_states = self.experts(
-            hidden_states=normed_x,
-            router_logits=router_logits,
-            input_ids=input_ids,
-        )
+        if self.experts.is_internal_router:
+            # In this case, the gate/router runs inside the FusedMoE class
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states,
+                router_logits=hidden_states,
+                input_ids=input_ids,
+            )
+        else:
+            router_logits, _ = self.gate(hidden_states)
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+                input_ids=input_ids,
+            )
 
         return final_hidden_states.view(org_shape)
 
@@ -1122,8 +1121,7 @@ class DeepseekV4DecoderLayer(nn.Module):
         self.ffn = DeepseekV4MoE(vllm_config, prefix=f"{prefix}.ffn")
 
         self.attn_norm = RMSNorm(self.hidden_size, self.rms_norm_eps)
-        # ``ffn_norm`` is owned by ``self.ffn.norm_gate`` (fused with the
-        # router gate matmul); see ``NormGatedLinear``.
+        self.ffn_norm = RMSNorm(self.hidden_size, self.rms_norm_eps)
         self.hc_mult = config.hc_mult
         self.hc_sinkhorn_iters = config.hc_sinkhorn_iters
         self.hc_eps = config.hc_eps
@@ -1172,9 +1170,6 @@ class DeepseekV4DecoderLayer(nn.Module):
             ),
             requires_grad=False,
         )
-        self.mhc_pre = MHCPreOp()
-        self.mhc_post = MHCPostOp()
-        self.mhc_fused_post_pre = MHCFusedPostPreOp()
 
     def hc_pre(
         self,
@@ -1183,7 +1178,7 @@ class DeepseekV4DecoderLayer(nn.Module):
         hc_scale: torch.Tensor,
         hc_base: torch.Tensor,
     ):
-        post_mix, res_mix, layer_input = self.mhc_pre(
+        post_mix, res_mix, layer_input = torch.ops.vllm.mhc_pre(
             residual=x,
             fn=hc_fn,
             hc_scale=hc_scale,
@@ -1203,17 +1198,17 @@ class DeepseekV4DecoderLayer(nn.Module):
         post: torch.Tensor,
         comb: torch.Tensor,
     ):
-        return self.mhc_post(x, residual, post, comb)
+        return torch.ops.vllm.mhc_post(x, residual, post, comb)
 
-    def _forward_cuda(
+    def forward(
         self,
         x: torch.Tensor,
         positions: torch.Tensor,
         input_ids: torch.Tensor | None,
-        post_mix: torch.Tensor | None = None,
-        res_mix: torch.Tensor | None = None,
-        residual: torch.Tensor | None = None,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        post_mix: torch.Tensor | None,
+        res_mix: torch.Tensor | None,
+        residual: torch.Tensor | None,
+    ) -> torch.Tensor:
         if residual is None:
             # Run standalone hc_pre on first layer
             residual = x
@@ -1221,7 +1216,7 @@ class DeepseekV4DecoderLayer(nn.Module):
                 x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base
             )
         else:
-            residual, post_mix, res_mix, x = self.mhc_fused_post_pre(
+            residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre(
                 x,
                 residual,
                 post_mix,
@@ -1239,7 +1234,7 @@ class DeepseekV4DecoderLayer(nn.Module):
         x = self.attn_norm(x)
         x = self.attn(positions, x, None)
 
-        residual, post_mix, res_mix, x = self.mhc_fused_post_pre(
+        residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre(
             x,
             residual,
             post_mix,
@@ -1253,58 +1248,11 @@ class DeepseekV4DecoderLayer(nn.Module):
             self.hc_post_alpha,
             self.hc_sinkhorn_iters,
         )
-        # ffn_norm is now folded into self.ffn.norm_gate; ffn() takes
-        # the pre-norm activation directly.
+
+        x = self.ffn_norm(x)
         x = self.ffn(x, input_ids)
         return x, residual, post_mix, res_mix
 
-    def _forward_rocm(
-        self,
-        x: torch.Tensor,
-        positions: torch.Tensor,
-        input_ids: torch.Tensor | None,
-        post_mix: torch.Tensor | None = None,
-        res_mix: torch.Tensor | None = None,
-        residual: torch.Tensor | None = None,
-    ) -> tuple[
-        torch.Tensor, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None
-    ]:
-        residual = x
-        x, post, comb = self.hc_pre(
-            x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base
-        )
-        x = self.attn_norm(x)
-        x = self.attn(positions, x, None)
-        x = self.hc_post(x, residual, post, comb)
-
-        residual = x
-        x, post, comb = self.hc_pre(
-            x, self.hc_ffn_fn, self.hc_ffn_scale, self.hc_ffn_base
-        )
-        # ffn_norm is now folded into self.ffn.norm_gate; ffn() takes
-        # the pre-norm activation directly.
-        x = self.ffn(x, input_ids)
-        x = self.hc_post(x, residual, post, comb)
-        return x, None, None, None
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        positions: torch.Tensor,
-        input_ids: torch.Tensor | None,
-        post_mix: torch.Tensor | None = None,
-        res_mix: torch.Tensor | None = None,
-        residual: torch.Tensor | None = None,
-    ) -> tuple[
-        torch.Tensor, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None
-    ]:
-        if current_platform.is_rocm():
-            return self._forward_rocm(
-                x, positions, input_ids, post_mix, res_mix, residual
-            )
-
-        return self._forward_cuda(x, positions, input_ids, post_mix, res_mix, residual)
-
 
 @support_torch_compile
 class DeepseekV4Model(nn.Module):
@@ -1394,7 +1342,7 @@ class DeepseekV4Model(nn.Module):
             torch.empty(1, dtype=torch.float32),
             requires_grad=False,
         )
-        self.hc_head_op = HCHeadOp()
+
         # Pre-hc_head residual stream buffer for the MTP draft. Stable
         # address (outside the cudagraph pool) so the copy_ in forward()
         # refreshes it correctly across captured shapes.
@@ -1464,7 +1412,7 @@ class DeepseekV4Model(nn.Module):
                 res_mix,
                 residual,
             )
-        if layer is not None and current_platform.is_cuda():
+        else:
             hidden_states = layer.hc_post(hidden_states, residual, post_mix, res_mix)
 
         if not get_pp_group().is_last_rank:
@@ -1474,7 +1422,7 @@ class DeepseekV4Model(nn.Module):
         num_tokens = hidden_states.shape[0]
         self._mtp_hidden_buffer[:num_tokens].copy_(hidden_states.flatten(1))
 
-        hidden_states = self.hc_head_op(
+        hidden_states = hc_head(
             hidden_states,
             self.hc_head_fn,
             self.hc_head_scale,
@@ -1603,6 +1551,36 @@ class DeepseekV4Model(nn.Module):
             layer.ffn.finalize_mega_moe_weights()
 
 
+@torch.compile(backend=current_platform.simple_compile_backend)
+def hc_head(
+    hidden_states: torch.Tensor,
+    hc_fn: torch.Tensor,
+    hc_scale: torch.Tensor,
+    hc_base: torch.Tensor,
+    rms_norm_eps: float,
+    hc_eps: float,
+) -> torch.Tensor:
+    hc_mult, hidden_size = hidden_states.shape[-2:]
+    outer_shape = hidden_states.shape[:-2]
+    hs_flat = hidden_states.view(-1, hc_mult, hidden_size)
+    num_tokens = hs_flat.shape[0]
+    out = torch.empty(
+        num_tokens, hidden_size, dtype=torch.bfloat16, device=hidden_states.device
+    )
+    torch.ops.vllm.hc_head_fused_kernel(
+        hs_flat,
+        hc_fn,
+        hc_scale,
+        hc_base,
+        out,
+        hidden_size,
+        rms_norm_eps,
+        hc_eps,
+        hc_mult,
+    )
+    return out.view(*outer_shape, hidden_size)
+
+
 def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
     if expert_dtype == "fp4":
         # MXFP4 experts use Mxfp4MoEMethod, which registers scales as
@@ -1632,13 +1610,7 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
         orig_to_new_suffix={
             "head.weight": "lm_head.weight",
             "embed.weight": "embed_tokens.weight",
-            # Pre-MoE norm + gate are now owned by ``DeepseekV4MoE.norm_gate``
-            # (see NormGatedLinear).
-            ".ffn_norm.weight": ".ffn.norm_gate.norm.weight",
-            ".ffn.gate.weight": ".ffn.norm_gate.gate.weight",
-            ".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias",
-            # Hash MoE table also moved off the inner gate.
-            ".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid",
+            ".ffn.gate.bias": ".ffn.gate.e_score_correction_bias",
         },
         orig_to_new_substr={
             ".attn.compressor.": ".attn.mla_attn.compressor.",
@@ -1650,7 +1622,7 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
 def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
     """Weight mapper for NVFP4 (ModelOpt) DeepSeek-V4 checkpoints.
 
-    NVFP4 checkpoints use different key naming than the upstream MXFP4 format:
+    NVFP4 checkpoints use different key naming than the default MXFP4 format:
     - ``self_attn`` prefix instead of ``attn``
     - ``mlp`` prefix instead of ``ffn``
     - Expert weights: gate_proj/up_proj/down_proj (not w1/w3/w2)
@@ -1663,7 +1635,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
         re.compile(r"(\.experts\.\d+\.)up_proj\."): r"\1w3.",
         re.compile(r"(\.experts\.\d+\.)down_proj\."): r"\1w2.",
     }
-
     return WeightsMapper(
         orig_to_new_prefix={
             "layers.": "model.layers.",
@@ -1673,22 +1644,13 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
             "mtp.": "model.mtp.",
         },
         orig_to_new_regex=expert_rename_regex,
-        # No suffix renames needed — NVFP4 checkpoint uses
-        # .weight_scale / .weight_scale_2 / .input_scale directly.
         orig_to_new_suffix={
             "head.weight": "lm_head.weight",
             "embed.weight": "embed_tokens.weight",
-            # Pre-MoE norm + gate are now owned by DeepseekV4MoE.norm_gate
-            ".ffn_norm.weight": ".ffn.norm_gate.norm.weight",
-            ".ffn.gate.weight": ".ffn.norm_gate.gate.weight",
-            ".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias",
-            ".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid",
+            ".ffn.gate.bias": ".ffn.gate.e_score_correction_bias",
         },
-        # Specific renames MUST come before general ones (applied in order).
         orig_to_new_substr={
-            # Indexer params (MUST come before .self_attn.compressor.
-            # so indexer keys are captured before the compressor prefix
-            # rewrite moves them under mla_attn.compressor).
+            # Indexer params (MUST come before general compressor renames)
             ".self_attn.compressor.indexer.q_b_proj.":
                 ".attn.indexer.wq_b.",
             ".self_attn.compressor.indexer.weights_proj.":
@@ -1701,14 +1663,13 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
                 ".attn.indexer.compressor.wgate.",
             ".self_attn.compressor.indexer.position_bias":
                 ".attn.indexer.compressor.ape",
-            # Compressor (non-indexer) renames
+            # Compressor renames (non-indexer)
             "compressor.kv_proj.": "compressor.wkv.",
             "compressor.gate_proj.": "compressor.wgate.",
             "compressor.kv_norm.": "compressor.norm.",
             "compressor.position_bias": "compressor.ape",
-            # Attention compressor (after indexer renames)
             ".self_attn.compressor.": ".attn.compressor.",
-            # Attention projections (specific before .self_attn. → .attn.)
+            # Attention projections
             ".self_attn.q_a_proj.": ".attn.wq_a.",
             ".self_attn.kv_proj.": ".attn.wkv.",
             ".self_attn.q_b_proj.": ".attn.wq_b.",
@@ -1717,7 +1678,7 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
             ".self_attn.q_a_norm.": ".attn.q_norm.",
             ".self_attn.kv_norm.": ".attn.kv_norm.",
             ".self_attn.sinks": ".attn.attn_sink",
-            # Shared expert projections (specific before .mlp. → .ffn.)
+            # Shared experts
             ".mlp.shared_experts.gate_proj.":
                 ".ffn.shared_experts.w1.",
             ".mlp.shared_experts.up_proj.":
@@ -1727,7 +1688,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
             # General renames
             ".mlp.": ".ffn.",
             ".self_attn.": ".attn.",
-            # Layer norms
             "input_layernorm.": "attn_norm.",
             "post_attention_layernorm.": "ffn_norm.",
             # HC params
diff --git a/vllm/patches/deepseek_v4_attention.py b/vllm/patches/deepseek_v4_attention.py
index 252d9234..8cac925f 100644
--- a/vllm/patches/deepseek_v4_attention.py
+++ b/vllm/patches/deepseek_v4_attention.py
@@ -14,12 +14,6 @@ import torch.nn.functional as F
 from transformers import DeepseekV2Config, DeepseekV3Config
 
 import vllm.envs as envs
-try:
-    from vllm.compilation.breakable_cudagraph import eager_break_during_capture
-except ImportError:
-    # Older vLLM versions don't have this module; use identity decorator
-    def eager_break_during_capture(fn):
-        return fn
 from vllm.model_executor.layers.linear import (
     ReplicatedLinear,
 )
@@ -52,7 +46,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.custom_op import PluggableLayer
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.deepseek_compressor import DeepseekCompressor
-from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm
 from vllm.model_executor.layers.quantization import QuantizationConfig
 
 from vllm.platforms import current_platform
@@ -327,7 +321,7 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
             z = z.view(num_tokens, self.n_local_groups, self.o_lora_rank)
             return self.wo_b(z.flatten(1))
 
-        # FP8 wo_a path: fused inverse RoPE + FP8 quant + einsum (SM90 only)
+        # FP8 wo_a path: fused inverse RoPE + FP8 quant + einsum
         o_fp8, o_scale = fused_inv_rope_fp8_quant(
             o,
             positions,
@@ -572,7 +566,7 @@ def _apply_inv_rope_bf16(
 ) -> torch.Tensor:
     """Apply inverse RoPE to attention output in BF16.
 
-    Inverse RoPE is just RoPE with cos → cos, sin → -sin.
+    Inverse RoPE is just RoPE with sin -> -sin.
     Uses GPT-J style (interleaved) rotary embedding.
     """
     if rope_dim == 0 or o.numel() == 0:
@@ -588,7 +582,7 @@ def _apply_inv_rope_bf16(
     rope = o_f32[..., nope_dim:]
     y_even = rope[..., 0::2]
     y_odd = rope[..., 1::2]
-    # Inverse: sin → -sin (swap signs on the cross terms)
+    # Inverse: sin → -sin (swap signs on cross terms)
     rope_out = torch.stack(
         (y_even * cos + y_odd * sin, y_odd * cos - y_even * sin),
         dim=-1,
@@ -598,7 +592,6 @@ def _apply_inv_rope_bf16(
     return o_f32.to(o.dtype)
 
 
-@eager_break_during_capture
 def deepseek_v4_attention(
     hidden_states: torch.Tensor,
     positions: torch.Tensor,
@@ -1148,8 +1141,6 @@ class DeepseekV4Indexer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.wq_b",
         )
-        # weights_proj is NVFP4-quantized in the modelopt checkpoint.
-        # Upstream uses quant_config=None for the MXFP4 checkpoint.
         self.weights_proj = ReplicatedLinear(
             hidden_size,
             self.n_head,
@@ -1157,6 +1148,7 @@ class DeepseekV4Indexer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.weights_proj",
         )
+        self.k_norm = LayerNorm(self.head_dim, eps=1e-6)
         self.softmax_scale = self.head_dim**-0.5
 
         self.scale_fmt = "ue8m0"