Patch from Docker image's vLLM (0.20.2rc1) instead of newer upstream

The nightly Docker image ships an older vLLM that does not have NormGateLinear, breakable_cudagraph, etc. Patching the Docker image's own files ensures compatibility.

- deepseek_v4.py: patches based on the Docker image's file + NVFP4 mapper + wo_a BF16
- deepseek_v4_attention.py: patches based on the Docker image's file + inv rope BF16 + weights_proj quant + removed QuantFP8/GroupShape imports
2026-05-19 03:35:15 +00:00
parent df5a496f5d
commit db5192fe41
2 changed files with 95 additions and 143 deletions
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -23,14 +23,11 @@ from vllm.model_executor.layers.deepseek_v4_attention import (
    DeepseekV4MLAModules,
    DeepseekV4MultiHeadLatentAttentionWrapper,
 )
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear
 from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
 from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import (
    fused_topk_bias,
 )
-from vllm.model_executor.layers.fused_moe.router.norm_gate_linear import (
-    NormGateLinear,
-)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
@@ -38,12 +35,6 @@ from vllm.model_executor.layers.linear import (
    RowParallelLinear,
 )
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.mhc import (
-    HCHeadOp,
-    MHCFusedPostPreOp,
-    MHCPostOp,
-    MHCPreOp,
-)
 from vllm.model_executor.layers.quantization import (
    QuantizationConfig,
    QuantizationMethods,
@@ -758,23 +749,23 @@ class DeepseekV4MoE(nn.Module):
                "deep_gemm_mega_moe for this checkpoint."
            )

-        # Fused RMSNorm + gate: owns both ffn_norm and the gate matmul.
-        self.norm_gate = NormGateLinear(
-            hidden_size=config.hidden_size,
-            num_experts=config.n_routed_experts,
-            rms_eps=config.rms_norm_eps,
-            prefix=f"{prefix}.norm_gate",
+        self.gate = GateLinear(
+            config.hidden_size,
+            config.n_routed_experts,
+            out_dtype=torch.float32,
+            bias=False,
+            prefix=f"{prefix}.gate",
        )
-        # Routing-side tensors live on ``norm_gate`` directly (not on the
-        # inner gate); they are initialized to None in NormGatedLinear and
-        # populated below depending on the MoE variant.
+        self.gate.e_score_correction_bias = None
+        self.gate.tid2eid = None
        is_hash_moe = extract_layer_index(prefix) < config.num_hash_layers
        self.hash_indices_dtype = torch.int64 if self.use_mega_moe else torch.int32
+
        if is_hash_moe:
            # hash MoE doesn't use e_score_correction_bias
            # Use randint instead of empty to avoid garbage values causing
            # invalid memory access in dummy mode (--load-format="dummy")
-            self.norm_gate.tid2eid = nn.Parameter(
+            self.gate.tid2eid = nn.Parameter(
                torch.randint(
                    0,
                    config.n_routed_experts,
@@ -784,7 +775,7 @@ class DeepseekV4MoE(nn.Module):
                requires_grad=False,
            )
        elif getattr(config, "topk_method", None) == "noaux_tc":
-            self.norm_gate.e_score_correction_bias = nn.Parameter(
+            self.gate.e_score_correction_bias = nn.Parameter(
                torch.empty(config.n_routed_experts, dtype=torch.float32),
                requires_grad=False,
            )
@@ -847,9 +838,10 @@ class DeepseekV4MoE(nn.Module):
        self.n_local_experts = config.n_routed_experts // self.tp_size
        self.experts_start_idx = self.tp_rank * self.n_local_experts
        self.experts_end_idx = self.experts_start_idx + self.n_local_experts
-        # We don't pass `gate` into FusedMoE
+
        self.experts = FusedMoE(
            shared_experts=self.shared_experts,
+            gate=self.gate,
            num_experts=config.n_routed_experts,
            top_k=config.num_experts_per_tok,
            hidden_size=config.hidden_size,
@@ -859,8 +851,8 @@ class DeepseekV4MoE(nn.Module):
            prefix=f"{prefix}.experts",
            scoring_func=self.scoring_func,
            routed_scaling_factor=self.routed_scaling_factor,
-            e_score_correction_bias=self.norm_gate.e_score_correction_bias,
-            hash_indices_table=self.norm_gate.tid2eid,
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+            hash_indices_table=self.gate.tid2eid,
            swiglu_limit=self.swiglu_limit,
            router_logits_dtype=torch.float32,
        )
@@ -868,40 +860,40 @@ class DeepseekV4MoE(nn.Module):
    def forward(
        self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None
    ) -> torch.Tensor:
-        if self.norm_gate.tid2eid is not None and input_ids is None:
+        if self.gate.tid2eid is not None and input_ids is None:
            raise ValueError("DeepSeek V4 hash MoE routing requires input_ids.")

        if not self.use_mega_moe:
            return self._forward_fused_moe(hidden_states, input_ids)

        org_shape = hidden_states.shape
-        normed_x, router_logits = self.norm_gate(hidden_states)
+        router_logits, _ = self.gate(hidden_states)
        topk_weights, topk_ids = fused_topk_bias(
-            hidden_states=normed_x,
+            hidden_states=hidden_states,
            gating_output=router_logits,
            scoring_func=self.scoring_func,
-            e_score_correction_bias=self.norm_gate.e_score_correction_bias.data
-            if self.norm_gate.e_score_correction_bias is not None
+            e_score_correction_bias=self.gate.e_score_correction_bias.data
+            if self.gate.e_score_correction_bias is not None
            else None,
            topk=self.n_activated_experts,
            renormalize=self.renormalize,
            indices_type=self.hash_indices_dtype,
            input_tokens=input_ids,
-            hash_indices_table=self.norm_gate.tid2eid,
+            hash_indices_table=self.gate.tid2eid,
            routed_scaling_factor=self.routed_scaling_factor,
        )
        activation_clamp = (
            float(self.swiglu_limit) if self.swiglu_limit is not None else None
        )
        final_hidden_states = self.experts(
-            normed_x,
+            hidden_states,
            topk_weights,
            topk_ids,
            activation_clamp=activation_clamp,
        )

        if self.shared_experts is not None:
-            shared_output = self.shared_experts(normed_x)
+            shared_output = self.shared_experts(hidden_states)
            final_hidden_states += shared_output

        return final_hidden_states.view(org_shape)
@@ -909,14 +901,21 @@ class DeepseekV4MoE(nn.Module):
    def _forward_fused_moe(
        self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None
    ) -> torch.Tensor:
-        assert not self.experts.is_internal_router
        org_shape = hidden_states.shape
-        normed_x, router_logits = self.norm_gate(hidden_states)
-        final_hidden_states = self.experts(
-            hidden_states=normed_x,
-            router_logits=router_logits,
-            input_ids=input_ids,
-        )
+        if self.experts.is_internal_router:
+            # In this case, the gate/router runs inside the FusedMoE class
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states,
+                router_logits=hidden_states,
+                input_ids=input_ids,
+            )
+        else:
+            router_logits, _ = self.gate(hidden_states)
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+                input_ids=input_ids,
+            )

        return final_hidden_states.view(org_shape)

@@ -1122,8 +1121,7 @@ class DeepseekV4DecoderLayer(nn.Module):
        self.ffn = DeepseekV4MoE(vllm_config, prefix=f"{prefix}.ffn")

        self.attn_norm = RMSNorm(self.hidden_size, self.rms_norm_eps)
-        # ``ffn_norm`` is owned by ``self.ffn.norm_gate`` (fused with the
-        # router gate matmul); see ``NormGatedLinear``.
+        self.ffn_norm = RMSNorm(self.hidden_size, self.rms_norm_eps)
        self.hc_mult = config.hc_mult
        self.hc_sinkhorn_iters = config.hc_sinkhorn_iters
        self.hc_eps = config.hc_eps
@@ -1172,9 +1170,6 @@ class DeepseekV4DecoderLayer(nn.Module):
            ),
            requires_grad=False,
        )
-        self.mhc_pre = MHCPreOp()
-        self.mhc_post = MHCPostOp()
-        self.mhc_fused_post_pre = MHCFusedPostPreOp()

    def hc_pre(
        self,
@@ -1183,7 +1178,7 @@ class DeepseekV4DecoderLayer(nn.Module):
        hc_scale: torch.Tensor,
        hc_base: torch.Tensor,
    ):
-        post_mix, res_mix, layer_input = self.mhc_pre(
+        post_mix, res_mix, layer_input = torch.ops.vllm.mhc_pre(
            residual=x,
            fn=hc_fn,
            hc_scale=hc_scale,
@@ -1203,17 +1198,17 @@ class DeepseekV4DecoderLayer(nn.Module):
        post: torch.Tensor,
        comb: torch.Tensor,
    ):
-        return self.mhc_post(x, residual, post, comb)
+        return torch.ops.vllm.mhc_post(x, residual, post, comb)

-    def _forward_cuda(
+    def forward(
        self,
        x: torch.Tensor,
        positions: torch.Tensor,
        input_ids: torch.Tensor | None,
-        post_mix: torch.Tensor | None = None,
-        res_mix: torch.Tensor | None = None,
-        residual: torch.Tensor | None = None,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        post_mix: torch.Tensor | None,
+        res_mix: torch.Tensor | None,
+        residual: torch.Tensor | None,
+    ) -> torch.Tensor:
        if residual is None:
            # Run standalone hc_pre on first layer
            residual = x
@@ -1221,7 +1216,7 @@ class DeepseekV4DecoderLayer(nn.Module):
                x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base
            )
        else:
-            residual, post_mix, res_mix, x = self.mhc_fused_post_pre(
+            residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre(
                x,
                residual,
                post_mix,
@@ -1239,7 +1234,7 @@ class DeepseekV4DecoderLayer(nn.Module):
        x = self.attn_norm(x)
        x = self.attn(positions, x, None)

-        residual, post_mix, res_mix, x = self.mhc_fused_post_pre(
+        residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre(
            x,
            residual,
            post_mix,
@@ -1253,58 +1248,11 @@ class DeepseekV4DecoderLayer(nn.Module):
            self.hc_post_alpha,
            self.hc_sinkhorn_iters,
        )
-        # ffn_norm is now folded into self.ffn.norm_gate; ffn() takes
-        # the pre-norm activation directly.
+
+        x = self.ffn_norm(x)
        x = self.ffn(x, input_ids)
        return x, residual, post_mix, res_mix

-    def _forward_rocm(
-        self,
-        x: torch.Tensor,
-        positions: torch.Tensor,
-        input_ids: torch.Tensor | None,
-        post_mix: torch.Tensor | None = None,
-        res_mix: torch.Tensor | None = None,
-        residual: torch.Tensor | None = None,
-    ) -> tuple[
-        torch.Tensor, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None
-    ]:
-        residual = x
-        x, post, comb = self.hc_pre(
-            x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base
-        )
-        x = self.attn_norm(x)
-        x = self.attn(positions, x, None)
-        x = self.hc_post(x, residual, post, comb)
-
-        residual = x
-        x, post, comb = self.hc_pre(
-            x, self.hc_ffn_fn, self.hc_ffn_scale, self.hc_ffn_base
-        )
-        # ffn_norm is now folded into self.ffn.norm_gate; ffn() takes
-        # the pre-norm activation directly.
-        x = self.ffn(x, input_ids)
-        x = self.hc_post(x, residual, post, comb)
-        return x, None, None, None
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        positions: torch.Tensor,
-        input_ids: torch.Tensor | None,
-        post_mix: torch.Tensor | None = None,
-        res_mix: torch.Tensor | None = None,
-        residual: torch.Tensor | None = None,
-    ) -> tuple[
-        torch.Tensor, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None
-    ]:
-        if current_platform.is_rocm():
-            return self._forward_rocm(
-                x, positions, input_ids, post_mix, res_mix, residual
-            )
-
-        return self._forward_cuda(x, positions, input_ids, post_mix, res_mix, residual)
-

@support_torch_compile
 class DeepseekV4Model(nn.Module):
@@ -1394,7 +1342,7 @@ class DeepseekV4Model(nn.Module):
            torch.empty(1, dtype=torch.float32),
            requires_grad=False,
        )
-        self.hc_head_op = HCHeadOp()
+
        # Pre-hc_head residual stream buffer for the MTP draft. Stable
        # address (outside the cudagraph pool) so the copy_ in forward()
        # refreshes it correctly across captured shapes.
@@ -1464,7 +1412,7 @@ class DeepseekV4Model(nn.Module):
                res_mix,
                residual,
            )
-        if layer is not None and current_platform.is_cuda():
+        else:
            hidden_states = layer.hc_post(hidden_states, residual, post_mix, res_mix)

        if not get_pp_group().is_last_rank:
@@ -1474,7 +1422,7 @@ class DeepseekV4Model(nn.Module):
        num_tokens = hidden_states.shape[0]
        self._mtp_hidden_buffer[:num_tokens].copy_(hidden_states.flatten(1))

-        hidden_states = self.hc_head_op(
+        hidden_states = hc_head(
            hidden_states,
            self.hc_head_fn,
            self.hc_head_scale,
@@ -1603,6 +1551,36 @@ class DeepseekV4Model(nn.Module):
            layer.ffn.finalize_mega_moe_weights()


+@torch.compile(backend=current_platform.simple_compile_backend)
+def hc_head(
+    hidden_states: torch.Tensor,
+    hc_fn: torch.Tensor,
+    hc_scale: torch.Tensor,
+    hc_base: torch.Tensor,
+    rms_norm_eps: float,
+    hc_eps: float,
+) -> torch.Tensor:
+    hc_mult, hidden_size = hidden_states.shape[-2:]
+    outer_shape = hidden_states.shape[:-2]
+    hs_flat = hidden_states.view(-1, hc_mult, hidden_size)
+    num_tokens = hs_flat.shape[0]
+    out = torch.empty(
+        num_tokens, hidden_size, dtype=torch.bfloat16, device=hidden_states.device
+    )
+    torch.ops.vllm.hc_head_fused_kernel(
+        hs_flat,
+        hc_fn,
+        hc_scale,
+        hc_base,
+        out,
+        hidden_size,
+        rms_norm_eps,
+        hc_eps,
+        hc_mult,
+    )
+    return out.view(*outer_shape, hidden_size)
+
+
 def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
    if expert_dtype == "fp4":
        # MXFP4 experts use Mxfp4MoEMethod, which registers scales as
@@ -1632,13 +1610,7 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
        orig_to_new_suffix={
            "head.weight": "lm_head.weight",
            "embed.weight": "embed_tokens.weight",
-            # Pre-MoE norm + gate are now owned by ``DeepseekV4MoE.norm_gate``
-            # (see NormGatedLinear).
-            ".ffn_norm.weight": ".ffn.norm_gate.norm.weight",
-            ".ffn.gate.weight": ".ffn.norm_gate.gate.weight",
-            ".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias",
-            # Hash MoE table also moved off the inner gate.
-            ".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid",
+            ".ffn.gate.bias": ".ffn.gate.e_score_correction_bias",
        },
        orig_to_new_substr={
            ".attn.compressor.": ".attn.mla_attn.compressor.",
@@ -1650,7 +1622,7 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
 def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
    """Weight mapper for NVFP4 (ModelOpt) DeepSeek-V4 checkpoints.

-    NVFP4 checkpoints use different key naming than the upstream MXFP4 format:
+    NVFP4 checkpoints use different key naming than the default MXFP4 format:
    - ``self_attn`` prefix instead of ``attn``
    - ``mlp`` prefix instead of ``ffn``
    - Expert weights: gate_proj/up_proj/down_proj (not w1/w3/w2)
@@ -1663,7 +1635,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
        re.compile(r"(\.experts\.\d+\.)up_proj\."): r"\1w3.",
        re.compile(r"(\.experts\.\d+\.)down_proj\."): r"\1w2.",
    }
-
    return WeightsMapper(
        orig_to_new_prefix={
            "layers.": "model.layers.",
@@ -1673,22 +1644,13 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
            "mtp.": "model.mtp.",
        },
        orig_to_new_regex=expert_rename_regex,
-        # No suffix renames needed — NVFP4 checkpoint uses
-        # .weight_scale / .weight_scale_2 / .input_scale directly.
        orig_to_new_suffix={
            "head.weight": "lm_head.weight",
            "embed.weight": "embed_tokens.weight",
-            # Pre-MoE norm + gate are now owned by DeepseekV4MoE.norm_gate
-            ".ffn_norm.weight": ".ffn.norm_gate.norm.weight",
-            ".ffn.gate.weight": ".ffn.norm_gate.gate.weight",
-            ".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias",
-            ".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid",
+            ".ffn.gate.bias": ".ffn.gate.e_score_correction_bias",
        },
-        # Specific renames MUST come before general ones (applied in order).
        orig_to_new_substr={
-            # Indexer params (MUST come before .self_attn.compressor.
-            # so indexer keys are captured before the compressor prefix
-            # rewrite moves them under mla_attn.compressor).
+            # Indexer params (MUST come before general compressor renames)
            ".self_attn.compressor.indexer.q_b_proj.":
                ".attn.indexer.wq_b.",
            ".self_attn.compressor.indexer.weights_proj.":
@@ -1701,14 +1663,13 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
                ".attn.indexer.compressor.wgate.",
            ".self_attn.compressor.indexer.position_bias":
                ".attn.indexer.compressor.ape",
-            # Compressor (non-indexer) renames
+            # Compressor renames (non-indexer)
            "compressor.kv_proj.": "compressor.wkv.",
            "compressor.gate_proj.": "compressor.wgate.",
            "compressor.kv_norm.": "compressor.norm.",
            "compressor.position_bias": "compressor.ape",
-            # Attention compressor (after indexer renames)
            ".self_attn.compressor.": ".attn.compressor.",
-            # Attention projections (specific before .self_attn. → .attn.)
+            # Attention projections
            ".self_attn.q_a_proj.": ".attn.wq_a.",
            ".self_attn.kv_proj.": ".attn.wkv.",
            ".self_attn.q_b_proj.": ".attn.wq_b.",
@@ -1717,7 +1678,7 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
            ".self_attn.q_a_norm.": ".attn.q_norm.",
            ".self_attn.kv_norm.": ".attn.kv_norm.",
            ".self_attn.sinks": ".attn.attn_sink",
-            # Shared expert projections (specific before .mlp. → .ffn.)
+            # Shared experts
            ".mlp.shared_experts.gate_proj.":
                ".ffn.shared_experts.w1.",
            ".mlp.shared_experts.up_proj.":
@@ -1727,7 +1688,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
            # General renames
            ".mlp.": ".ffn.",
            ".self_attn.": ".attn.",
-            # Layer norms
            "input_layernorm.": "attn_norm.",
            "post_attention_layernorm.": "ffn_norm.",
            # HC params
--- a/vllm/patches/deepseek_v4_attention.py
+++ b/vllm/patches/deepseek_v4_attention.py
@@ -14,12 +14,6 @@ import torch.nn.functional as F
 from transformers import DeepseekV2Config, DeepseekV3Config

 import vllm.envs as envs
-try:
-    from vllm.compilation.breakable_cudagraph import eager_break_during_capture
-except ImportError:
-    # Older vLLM versions don't have this module; use identity decorator
-    def eager_break_during_capture(fn):
-        return fn
 from vllm.model_executor.layers.linear import (
    ReplicatedLinear,
 )
@@ -52,7 +46,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.custom_op import PluggableLayer
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.deepseek_compressor import DeepseekCompressor
-from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm
 from vllm.model_executor.layers.quantization import QuantizationConfig

 from vllm.platforms import current_platform
@@ -327,7 +321,7 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
            z = z.view(num_tokens, self.n_local_groups, self.o_lora_rank)
            return self.wo_b(z.flatten(1))

-        # FP8 wo_a path: fused inverse RoPE + FP8 quant + einsum (SM90 only)
+        # FP8 wo_a path: fused inverse RoPE + FP8 quant + einsum
        o_fp8, o_scale = fused_inv_rope_fp8_quant(
            o,
            positions,
@@ -572,7 +566,7 @@ def _apply_inv_rope_bf16(
 ) -> torch.Tensor:
    """Apply inverse RoPE to attention output in BF16.

-    Inverse RoPE is just RoPE with cos → cos, sin → -sin.
+    Inverse RoPE is just RoPE with sin -> -sin.
    Uses GPT-J style (interleaved) rotary embedding.
    """
    if rope_dim == 0 or o.numel() == 0:
@@ -588,7 +582,7 @@ def _apply_inv_rope_bf16(
    rope = o_f32[..., nope_dim:]
    y_even = rope[..., 0::2]
    y_odd = rope[..., 1::2]
-    # Inverse: sin → -sin (swap signs on the cross terms)
+    # Inverse: sin → -sin (swap signs on cross terms)
    rope_out = torch.stack(
        (y_even * cos + y_odd * sin, y_odd * cos - y_even * sin),
        dim=-1,
@@ -598,7 +592,6 @@ def _apply_inv_rope_bf16(
    return o_f32.to(o.dtype)


-@eager_break_during_capture
 def deepseek_v4_attention(
    hidden_states: torch.Tensor,
    positions: torch.Tensor,
@@ -1148,8 +1141,6 @@ class DeepseekV4Indexer(nn.Module):
            quant_config=quant_config,
            prefix=f"{prefix}.wq_b",
        )
-        # weights_proj is NVFP4-quantized in the modelopt checkpoint.
-        # Upstream uses quant_config=None for the MXFP4 checkpoint.
        self.weights_proj = ReplicatedLinear(
            hidden_size,
            self.n_head,
@@ -1157,6 +1148,7 @@ class DeepseekV4Indexer(nn.Module):
            quant_config=quant_config,
            prefix=f"{prefix}.weights_proj",
        )
+        self.k_norm = LayerNorm(self.head_dim, eps=1e-6)
        self.softmax_scale = self.head_dim**-0.5

        self.scale_fmt = "ue8m0"