From db5192fe419abf25ce5a7f2fad48fb9cd00b20cb Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 19 May 2026 03:35:15 +0000 Subject: [PATCH] Patch from Docker image's vLLM (0.20.2rc1) instead of newer upstream The nightly Docker image uses an older vLLM that doesn't have NormGateLinear, breakable_cudagraph, etc. Patching the Docker image's own files ensures compatibility. - deepseek_v4.py: Patches from Docker image + NVFP4 mapper + wo_a BF16 - deepseek_v4_attention.py: Patches from Docker image + inv rope BF16 + weights_proj quant + removed QuantFP8/GroupShape imports --- vllm/patches/deepseek_v4.py | 220 +++++++++++--------------- vllm/patches/deepseek_v4_attention.py | 18 +-- 2 files changed, 95 insertions(+), 143 deletions(-) diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py index 6c99cdef..3af73362 100644 --- a/vllm/patches/deepseek_v4.py +++ b/vllm/patches/deepseek_v4.py @@ -23,14 +23,11 @@ from vllm.model_executor.layers.deepseek_v4_attention import ( DeepseekV4MLAModules, DeepseekV4MultiHeadLatentAttentionWrapper, ) -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import ( fused_topk_bias, ) -from vllm.model_executor.layers.fused_moe.router.norm_gate_linear import ( - NormGateLinear, -) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -38,12 +35,6 @@ from vllm.model_executor.layers.linear import ( RowParallelLinear, ) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mhc import ( - HCHeadOp, - MHCFusedPostPreOp, - MHCPostOp, - MHCPreOp, -) from vllm.model_executor.layers.quantization import ( QuantizationConfig, QuantizationMethods, @@ -758,23 +749,23 @@ class DeepseekV4MoE(nn.Module): "deep_gemm_mega_moe for this checkpoint." ) - # Fused RMSNorm + gate: owns both ffn_norm and the gate matmul. - self.norm_gate = NormGateLinear( - hidden_size=config.hidden_size, - num_experts=config.n_routed_experts, - rms_eps=config.rms_norm_eps, - prefix=f"{prefix}.norm_gate", + self.gate = GateLinear( + config.hidden_size, + config.n_routed_experts, + out_dtype=torch.float32, + bias=False, + prefix=f"{prefix}.gate", ) - # Routing-side tensors live on ``norm_gate`` directly (not on the - # inner gate); they are initialized to None in NormGatedLinear and - # populated below depending on the MoE variant. + self.gate.e_score_correction_bias = None + self.gate.tid2eid = None is_hash_moe = extract_layer_index(prefix) < config.num_hash_layers self.hash_indices_dtype = torch.int64 if self.use_mega_moe else torch.int32 + if is_hash_moe: # hash MoE doesn't use e_score_correction_bias # Use randint instead of empty to avoid garbage values causing # invalid memory access in dummy mode (--load-format="dummy") - self.norm_gate.tid2eid = nn.Parameter( + self.gate.tid2eid = nn.Parameter( torch.randint( 0, config.n_routed_experts, @@ -784,7 +775,7 @@ class DeepseekV4MoE(nn.Module): requires_grad=False, ) elif getattr(config, "topk_method", None) == "noaux_tc": - self.norm_gate.e_score_correction_bias = nn.Parameter( + self.gate.e_score_correction_bias = nn.Parameter( torch.empty(config.n_routed_experts, dtype=torch.float32), requires_grad=False, ) @@ -847,9 +838,10 @@ class DeepseekV4MoE(nn.Module): self.n_local_experts = config.n_routed_experts // self.tp_size self.experts_start_idx = self.tp_rank * self.n_local_experts self.experts_end_idx = self.experts_start_idx + self.n_local_experts - # We don't pass `gate` into FusedMoE + self.experts = FusedMoE( shared_experts=self.shared_experts, + gate=self.gate, num_experts=config.n_routed_experts, top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, @@ -859,8 +851,8 @@ class DeepseekV4MoE(nn.Module): prefix=f"{prefix}.experts", scoring_func=self.scoring_func, routed_scaling_factor=self.routed_scaling_factor, - e_score_correction_bias=self.norm_gate.e_score_correction_bias, - hash_indices_table=self.norm_gate.tid2eid, + e_score_correction_bias=self.gate.e_score_correction_bias, + hash_indices_table=self.gate.tid2eid, swiglu_limit=self.swiglu_limit, router_logits_dtype=torch.float32, ) @@ -868,40 +860,40 @@ class DeepseekV4MoE(nn.Module): def forward( self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None ) -> torch.Tensor: - if self.norm_gate.tid2eid is not None and input_ids is None: + if self.gate.tid2eid is not None and input_ids is None: raise ValueError("DeepSeek V4 hash MoE routing requires input_ids.") if not self.use_mega_moe: return self._forward_fused_moe(hidden_states, input_ids) org_shape = hidden_states.shape - normed_x, router_logits = self.norm_gate(hidden_states) + router_logits, _ = self.gate(hidden_states) topk_weights, topk_ids = fused_topk_bias( - hidden_states=normed_x, + hidden_states=hidden_states, gating_output=router_logits, scoring_func=self.scoring_func, - e_score_correction_bias=self.norm_gate.e_score_correction_bias.data - if self.norm_gate.e_score_correction_bias is not None + e_score_correction_bias=self.gate.e_score_correction_bias.data + if self.gate.e_score_correction_bias is not None else None, topk=self.n_activated_experts, renormalize=self.renormalize, indices_type=self.hash_indices_dtype, input_tokens=input_ids, - hash_indices_table=self.norm_gate.tid2eid, + hash_indices_table=self.gate.tid2eid, routed_scaling_factor=self.routed_scaling_factor, ) activation_clamp = ( float(self.swiglu_limit) if self.swiglu_limit is not None else None ) final_hidden_states = self.experts( - normed_x, + hidden_states, topk_weights, topk_ids, activation_clamp=activation_clamp, ) if self.shared_experts is not None: - shared_output = self.shared_experts(normed_x) + shared_output = self.shared_experts(hidden_states) final_hidden_states += shared_output return final_hidden_states.view(org_shape) @@ -909,14 +901,21 @@ class DeepseekV4MoE(nn.Module): def _forward_fused_moe( self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None ) -> torch.Tensor: - assert not self.experts.is_internal_router org_shape = hidden_states.shape - normed_x, router_logits = self.norm_gate(hidden_states) - final_hidden_states = self.experts( - hidden_states=normed_x, - router_logits=router_logits, - input_ids=input_ids, - ) + if self.experts.is_internal_router: + # In this case, the gate/router runs inside the FusedMoE class + final_hidden_states = self.experts( + hidden_states=hidden_states, + router_logits=hidden_states, + input_ids=input_ids, + ) + else: + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts( + hidden_states=hidden_states, + router_logits=router_logits, + input_ids=input_ids, + ) return final_hidden_states.view(org_shape) @@ -1122,8 +1121,7 @@ class DeepseekV4DecoderLayer(nn.Module): self.ffn = DeepseekV4MoE(vllm_config, prefix=f"{prefix}.ffn") self.attn_norm = RMSNorm(self.hidden_size, self.rms_norm_eps) - # ``ffn_norm`` is owned by ``self.ffn.norm_gate`` (fused with the - # router gate matmul); see ``NormGatedLinear``. + self.ffn_norm = RMSNorm(self.hidden_size, self.rms_norm_eps) self.hc_mult = config.hc_mult self.hc_sinkhorn_iters = config.hc_sinkhorn_iters self.hc_eps = config.hc_eps @@ -1172,9 +1170,6 @@ class DeepseekV4DecoderLayer(nn.Module): ), requires_grad=False, ) - self.mhc_pre = MHCPreOp() - self.mhc_post = MHCPostOp() - self.mhc_fused_post_pre = MHCFusedPostPreOp() def hc_pre( self, @@ -1183,7 +1178,7 @@ class DeepseekV4DecoderLayer(nn.Module): hc_scale: torch.Tensor, hc_base: torch.Tensor, ): - post_mix, res_mix, layer_input = self.mhc_pre( + post_mix, res_mix, layer_input = torch.ops.vllm.mhc_pre( residual=x, fn=hc_fn, hc_scale=hc_scale, @@ -1203,17 +1198,17 @@ class DeepseekV4DecoderLayer(nn.Module): post: torch.Tensor, comb: torch.Tensor, ): - return self.mhc_post(x, residual, post, comb) + return torch.ops.vllm.mhc_post(x, residual, post, comb) - def _forward_cuda( + def forward( self, x: torch.Tensor, positions: torch.Tensor, input_ids: torch.Tensor | None, - post_mix: torch.Tensor | None = None, - res_mix: torch.Tensor | None = None, - residual: torch.Tensor | None = None, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + post_mix: torch.Tensor | None, + res_mix: torch.Tensor | None, + residual: torch.Tensor | None, + ) -> torch.Tensor: if residual is None: # Run standalone hc_pre on first layer residual = x @@ -1221,7 +1216,7 @@ class DeepseekV4DecoderLayer(nn.Module): x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base ) else: - residual, post_mix, res_mix, x = self.mhc_fused_post_pre( + residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre( x, residual, post_mix, @@ -1239,7 +1234,7 @@ class DeepseekV4DecoderLayer(nn.Module): x = self.attn_norm(x) x = self.attn(positions, x, None) - residual, post_mix, res_mix, x = self.mhc_fused_post_pre( + residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre( x, residual, post_mix, @@ -1253,58 +1248,11 @@ class DeepseekV4DecoderLayer(nn.Module): self.hc_post_alpha, self.hc_sinkhorn_iters, ) - # ffn_norm is now folded into self.ffn.norm_gate; ffn() takes - # the pre-norm activation directly. + + x = self.ffn_norm(x) x = self.ffn(x, input_ids) return x, residual, post_mix, res_mix - def _forward_rocm( - self, - x: torch.Tensor, - positions: torch.Tensor, - input_ids: torch.Tensor | None, - post_mix: torch.Tensor | None = None, - res_mix: torch.Tensor | None = None, - residual: torch.Tensor | None = None, - ) -> tuple[ - torch.Tensor, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None - ]: - residual = x - x, post, comb = self.hc_pre( - x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base - ) - x = self.attn_norm(x) - x = self.attn(positions, x, None) - x = self.hc_post(x, residual, post, comb) - - residual = x - x, post, comb = self.hc_pre( - x, self.hc_ffn_fn, self.hc_ffn_scale, self.hc_ffn_base - ) - # ffn_norm is now folded into self.ffn.norm_gate; ffn() takes - # the pre-norm activation directly. - x = self.ffn(x, input_ids) - x = self.hc_post(x, residual, post, comb) - return x, None, None, None - - def forward( - self, - x: torch.Tensor, - positions: torch.Tensor, - input_ids: torch.Tensor | None, - post_mix: torch.Tensor | None = None, - res_mix: torch.Tensor | None = None, - residual: torch.Tensor | None = None, - ) -> tuple[ - torch.Tensor, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None - ]: - if current_platform.is_rocm(): - return self._forward_rocm( - x, positions, input_ids, post_mix, res_mix, residual - ) - - return self._forward_cuda(x, positions, input_ids, post_mix, res_mix, residual) - @support_torch_compile class DeepseekV4Model(nn.Module): @@ -1394,7 +1342,7 @@ class DeepseekV4Model(nn.Module): torch.empty(1, dtype=torch.float32), requires_grad=False, ) - self.hc_head_op = HCHeadOp() + # Pre-hc_head residual stream buffer for the MTP draft. Stable # address (outside the cudagraph pool) so the copy_ in forward() # refreshes it correctly across captured shapes. @@ -1464,7 +1412,7 @@ class DeepseekV4Model(nn.Module): res_mix, residual, ) - if layer is not None and current_platform.is_cuda(): + else: hidden_states = layer.hc_post(hidden_states, residual, post_mix, res_mix) if not get_pp_group().is_last_rank: @@ -1474,7 +1422,7 @@ class DeepseekV4Model(nn.Module): num_tokens = hidden_states.shape[0] self._mtp_hidden_buffer[:num_tokens].copy_(hidden_states.flatten(1)) - hidden_states = self.hc_head_op( + hidden_states = hc_head( hidden_states, self.hc_head_fn, self.hc_head_scale, @@ -1603,6 +1551,36 @@ class DeepseekV4Model(nn.Module): layer.ffn.finalize_mega_moe_weights() +@torch.compile(backend=current_platform.simple_compile_backend) +def hc_head( + hidden_states: torch.Tensor, + hc_fn: torch.Tensor, + hc_scale: torch.Tensor, + hc_base: torch.Tensor, + rms_norm_eps: float, + hc_eps: float, +) -> torch.Tensor: + hc_mult, hidden_size = hidden_states.shape[-2:] + outer_shape = hidden_states.shape[:-2] + hs_flat = hidden_states.view(-1, hc_mult, hidden_size) + num_tokens = hs_flat.shape[0] + out = torch.empty( + num_tokens, hidden_size, dtype=torch.bfloat16, device=hidden_states.device + ) + torch.ops.vllm.hc_head_fused_kernel( + hs_flat, + hc_fn, + hc_scale, + hc_base, + out, + hidden_size, + rms_norm_eps, + hc_eps, + hc_mult, + ) + return out.view(*outer_shape, hidden_size) + + def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper: if expert_dtype == "fp4": # MXFP4 experts use Mxfp4MoEMethod, which registers scales as @@ -1632,13 +1610,7 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper: orig_to_new_suffix={ "head.weight": "lm_head.weight", "embed.weight": "embed_tokens.weight", - # Pre-MoE norm + gate are now owned by ``DeepseekV4MoE.norm_gate`` - # (see NormGatedLinear). - ".ffn_norm.weight": ".ffn.norm_gate.norm.weight", - ".ffn.gate.weight": ".ffn.norm_gate.gate.weight", - ".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias", - # Hash MoE table also moved off the inner gate. - ".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid", + ".ffn.gate.bias": ".ffn.gate.e_score_correction_bias", }, orig_to_new_substr={ ".attn.compressor.": ".attn.mla_attn.compressor.", @@ -1650,7 +1622,7 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper: def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper: """Weight mapper for NVFP4 (ModelOpt) DeepSeek-V4 checkpoints. - NVFP4 checkpoints use different key naming than the upstream MXFP4 format: + NVFP4 checkpoints use different key naming than the default MXFP4 format: - ``self_attn`` prefix instead of ``attn`` - ``mlp`` prefix instead of ``ffn`` - Expert weights: gate_proj/up_proj/down_proj (not w1/w3/w2) @@ -1663,7 +1635,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper: re.compile(r"(\.experts\.\d+\.)up_proj\."): r"\1w3.", re.compile(r"(\.experts\.\d+\.)down_proj\."): r"\1w2.", } - return WeightsMapper( orig_to_new_prefix={ "layers.": "model.layers.", @@ -1673,22 +1644,13 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper: "mtp.": "model.mtp.", }, orig_to_new_regex=expert_rename_regex, - # No suffix renames needed — NVFP4 checkpoint uses - # .weight_scale / .weight_scale_2 / .input_scale directly. orig_to_new_suffix={ "head.weight": "lm_head.weight", "embed.weight": "embed_tokens.weight", - # Pre-MoE norm + gate are now owned by DeepseekV4MoE.norm_gate - ".ffn_norm.weight": ".ffn.norm_gate.norm.weight", - ".ffn.gate.weight": ".ffn.norm_gate.gate.weight", - ".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias", - ".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid", + ".ffn.gate.bias": ".ffn.gate.e_score_correction_bias", }, - # Specific renames MUST come before general ones (applied in order). orig_to_new_substr={ - # Indexer params (MUST come before .self_attn.compressor. - # so indexer keys are captured before the compressor prefix - # rewrite moves them under mla_attn.compressor). + # Indexer params (MUST come before general compressor renames) ".self_attn.compressor.indexer.q_b_proj.": ".attn.indexer.wq_b.", ".self_attn.compressor.indexer.weights_proj.": @@ -1701,14 +1663,13 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper: ".attn.indexer.compressor.wgate.", ".self_attn.compressor.indexer.position_bias": ".attn.indexer.compressor.ape", - # Compressor (non-indexer) renames + # Compressor renames (non-indexer) "compressor.kv_proj.": "compressor.wkv.", "compressor.gate_proj.": "compressor.wgate.", "compressor.kv_norm.": "compressor.norm.", "compressor.position_bias": "compressor.ape", - # Attention compressor (after indexer renames) ".self_attn.compressor.": ".attn.compressor.", - # Attention projections (specific before .self_attn. → .attn.) + # Attention projections ".self_attn.q_a_proj.": ".attn.wq_a.", ".self_attn.kv_proj.": ".attn.wkv.", ".self_attn.q_b_proj.": ".attn.wq_b.", @@ -1717,7 +1678,7 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper: ".self_attn.q_a_norm.": ".attn.q_norm.", ".self_attn.kv_norm.": ".attn.kv_norm.", ".self_attn.sinks": ".attn.attn_sink", - # Shared expert projections (specific before .mlp. → .ffn.) + # Shared experts ".mlp.shared_experts.gate_proj.": ".ffn.shared_experts.w1.", ".mlp.shared_experts.up_proj.": @@ -1727,7 +1688,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper: # General renames ".mlp.": ".ffn.", ".self_attn.": ".attn.", - # Layer norms "input_layernorm.": "attn_norm.", "post_attention_layernorm.": "ffn_norm.", # HC params diff --git a/vllm/patches/deepseek_v4_attention.py b/vllm/patches/deepseek_v4_attention.py index 252d9234..8cac925f 100644 --- a/vllm/patches/deepseek_v4_attention.py +++ b/vllm/patches/deepseek_v4_attention.py @@ -14,12 +14,6 @@ import torch.nn.functional as F from transformers import DeepseekV2Config, DeepseekV3Config import vllm.envs as envs -try: - from vllm.compilation.breakable_cudagraph import eager_break_during_capture -except ImportError: - # Older vLLM versions don't have this module; use identity decorator - def eager_break_during_capture(fn): - return fn from vllm.model_executor.layers.linear import ( ReplicatedLinear, ) @@ -52,7 +46,7 @@ from vllm.logger import init_logger from vllm.model_executor.custom_op import PluggableLayer from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.deepseek_compressor import DeepseekCompressor -from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.platforms import current_platform @@ -327,7 +321,7 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer): z = z.view(num_tokens, self.n_local_groups, self.o_lora_rank) return self.wo_b(z.flatten(1)) - # FP8 wo_a path: fused inverse RoPE + FP8 quant + einsum (SM90 only) + # FP8 wo_a path: fused inverse RoPE + FP8 quant + einsum o_fp8, o_scale = fused_inv_rope_fp8_quant( o, positions, @@ -572,7 +566,7 @@ def _apply_inv_rope_bf16( ) -> torch.Tensor: """Apply inverse RoPE to attention output in BF16. - Inverse RoPE is just RoPE with cos → cos, sin → -sin. + Inverse RoPE is just RoPE with sin -> -sin. Uses GPT-J style (interleaved) rotary embedding. """ if rope_dim == 0 or o.numel() == 0: @@ -588,7 +582,7 @@ def _apply_inv_rope_bf16( rope = o_f32[..., nope_dim:] y_even = rope[..., 0::2] y_odd = rope[..., 1::2] - # Inverse: sin → -sin (swap signs on the cross terms) + # Inverse: sin → -sin (swap signs on cross terms) rope_out = torch.stack( (y_even * cos + y_odd * sin, y_odd * cos - y_even * sin), dim=-1, @@ -598,7 +592,6 @@ def _apply_inv_rope_bf16( return o_f32.to(o.dtype) -@eager_break_during_capture def deepseek_v4_attention( hidden_states: torch.Tensor, positions: torch.Tensor, @@ -1148,8 +1141,6 @@ class DeepseekV4Indexer(nn.Module): quant_config=quant_config, prefix=f"{prefix}.wq_b", ) - # weights_proj is NVFP4-quantized in the modelopt checkpoint. - # Upstream uses quant_config=None for the MXFP4 checkpoint. self.weights_proj = ReplicatedLinear( hidden_size, self.n_head, @@ -1157,6 +1148,7 @@ class DeepseekV4Indexer(nn.Module): quant_config=quant_config, prefix=f"{prefix}.weights_proj", ) + self.k_norm = LayerNorm(self.head_dim, eps=1e-6) self.softmax_scale = self.head_dim**-0.5 self.scale_fmt = "ue8m0"