Patch from Docker image's vLLM (0.20.2rc1) instead of newer upstream

The nightly Docker image uses an older vLLM that doesn't have
NormGateLinear, breakable_cudagraph, etc. Patching the Docker
image's own files ensures compatibility.

- deepseek_v4.py: Patches from Docker image + NVFP4 mapper + wo_a BF16
- deepseek_v4_attention.py: Patches from Docker image + inv rope BF16
  + weights_proj quant + removed QuantFP8/GroupShape imports
This commit is contained in:
2026-05-19 03:35:15 +00:00
parent df5a496f5d
commit db5192fe41
2 changed files with 95 additions and 143 deletions

View File

@@ -23,14 +23,11 @@ from vllm.model_executor.layers.deepseek_v4_attention import (
DeepseekV4MLAModules,
DeepseekV4MultiHeadLatentAttentionWrapper,
)
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear
from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import (
fused_topk_bias,
)
from vllm.model_executor.layers.fused_moe.router.norm_gate_linear import (
NormGateLinear,
)
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
@@ -38,12 +35,6 @@ from vllm.model_executor.layers.linear import (
RowParallelLinear,
)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.mhc import (
HCHeadOp,
MHCFusedPostPreOp,
MHCPostOp,
MHCPreOp,
)
from vllm.model_executor.layers.quantization import (
QuantizationConfig,
QuantizationMethods,
@@ -758,23 +749,23 @@ class DeepseekV4MoE(nn.Module):
"deep_gemm_mega_moe for this checkpoint."
)
# Router gate matmul only; ``ffn_norm`` is applied by the decoder layer before this MoE is called.
self.norm_gate = NormGateLinear(
hidden_size=config.hidden_size,
num_experts=config.n_routed_experts,
rms_eps=config.rms_norm_eps,
prefix=f"{prefix}.norm_gate",
self.gate = GateLinear(
config.hidden_size,
config.n_routed_experts,
out_dtype=torch.float32,
bias=False,
prefix=f"{prefix}.gate",
)
# Routing-side tensors live on ``norm_gate`` directly (not on the
# inner gate); they are initialized to None in NormGatedLinear and
# populated below depending on the MoE variant.
self.gate.e_score_correction_bias = None
self.gate.tid2eid = None
is_hash_moe = extract_layer_index(prefix) < config.num_hash_layers
self.hash_indices_dtype = torch.int64 if self.use_mega_moe else torch.int32
if is_hash_moe:
# hash MoE doesn't use e_score_correction_bias
# Use randint instead of empty to avoid garbage values causing
# invalid memory access in dummy mode (--load-format="dummy")
self.norm_gate.tid2eid = nn.Parameter(
self.gate.tid2eid = nn.Parameter(
torch.randint(
0,
config.n_routed_experts,
@@ -784,7 +775,7 @@ class DeepseekV4MoE(nn.Module):
requires_grad=False,
)
elif getattr(config, "topk_method", None) == "noaux_tc":
self.norm_gate.e_score_correction_bias = nn.Parameter(
self.gate.e_score_correction_bias = nn.Parameter(
torch.empty(config.n_routed_experts, dtype=torch.float32),
requires_grad=False,
)
@@ -847,9 +838,10 @@ class DeepseekV4MoE(nn.Module):
self.n_local_experts = config.n_routed_experts // self.tp_size
self.experts_start_idx = self.tp_rank * self.n_local_experts
self.experts_end_idx = self.experts_start_idx + self.n_local_experts
# We don't pass `gate` into FusedMoE
self.experts = FusedMoE(
shared_experts=self.shared_experts,
gate=self.gate,
num_experts=config.n_routed_experts,
top_k=config.num_experts_per_tok,
hidden_size=config.hidden_size,
@@ -859,8 +851,8 @@ class DeepseekV4MoE(nn.Module):
prefix=f"{prefix}.experts",
scoring_func=self.scoring_func,
routed_scaling_factor=self.routed_scaling_factor,
e_score_correction_bias=self.norm_gate.e_score_correction_bias,
hash_indices_table=self.norm_gate.tid2eid,
e_score_correction_bias=self.gate.e_score_correction_bias,
hash_indices_table=self.gate.tid2eid,
swiglu_limit=self.swiglu_limit,
router_logits_dtype=torch.float32,
)
@@ -868,40 +860,40 @@ class DeepseekV4MoE(nn.Module):
def forward(
self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None
) -> torch.Tensor:
if self.norm_gate.tid2eid is not None and input_ids is None:
if self.gate.tid2eid is not None and input_ids is None:
raise ValueError("DeepSeek V4 hash MoE routing requires input_ids.")
if not self.use_mega_moe:
return self._forward_fused_moe(hidden_states, input_ids)
org_shape = hidden_states.shape
normed_x, router_logits = self.norm_gate(hidden_states)
router_logits, _ = self.gate(hidden_states)
topk_weights, topk_ids = fused_topk_bias(
hidden_states=normed_x,
hidden_states=hidden_states,
gating_output=router_logits,
scoring_func=self.scoring_func,
e_score_correction_bias=self.norm_gate.e_score_correction_bias.data
if self.norm_gate.e_score_correction_bias is not None
e_score_correction_bias=self.gate.e_score_correction_bias.data
if self.gate.e_score_correction_bias is not None
else None,
topk=self.n_activated_experts,
renormalize=self.renormalize,
indices_type=self.hash_indices_dtype,
input_tokens=input_ids,
hash_indices_table=self.norm_gate.tid2eid,
hash_indices_table=self.gate.tid2eid,
routed_scaling_factor=self.routed_scaling_factor,
)
activation_clamp = (
float(self.swiglu_limit) if self.swiglu_limit is not None else None
)
final_hidden_states = self.experts(
normed_x,
hidden_states,
topk_weights,
topk_ids,
activation_clamp=activation_clamp,
)
if self.shared_experts is not None:
shared_output = self.shared_experts(normed_x)
shared_output = self.shared_experts(hidden_states)
final_hidden_states += shared_output
return final_hidden_states.view(org_shape)
@@ -909,14 +901,21 @@ class DeepseekV4MoE(nn.Module):
def _forward_fused_moe(
self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None
) -> torch.Tensor:
assert not self.experts.is_internal_router
org_shape = hidden_states.shape
normed_x, router_logits = self.norm_gate(hidden_states)
final_hidden_states = self.experts(
hidden_states=normed_x,
router_logits=router_logits,
input_ids=input_ids,
)
if self.experts.is_internal_router:
# In this case, the gate/router runs inside the FusedMoE class
final_hidden_states = self.experts(
hidden_states=hidden_states,
router_logits=hidden_states,
input_ids=input_ids,
)
else:
router_logits, _ = self.gate(hidden_states)
final_hidden_states = self.experts(
hidden_states=hidden_states,
router_logits=router_logits,
input_ids=input_ids,
)
return final_hidden_states.view(org_shape)
@@ -1122,8 +1121,7 @@ class DeepseekV4DecoderLayer(nn.Module):
self.ffn = DeepseekV4MoE(vllm_config, prefix=f"{prefix}.ffn")
self.attn_norm = RMSNorm(self.hidden_size, self.rms_norm_eps)
# ``ffn_norm`` is owned by ``self.ffn.norm_gate`` (fused with the
# router gate matmul); see ``NormGatedLinear``.
self.ffn_norm = RMSNorm(self.hidden_size, self.rms_norm_eps)
self.hc_mult = config.hc_mult
self.hc_sinkhorn_iters = config.hc_sinkhorn_iters
self.hc_eps = config.hc_eps
@@ -1172,9 +1170,6 @@ class DeepseekV4DecoderLayer(nn.Module):
),
requires_grad=False,
)
self.mhc_pre = MHCPreOp()
self.mhc_post = MHCPostOp()
self.mhc_fused_post_pre = MHCFusedPostPreOp()
def hc_pre(
self,
@@ -1183,7 +1178,7 @@ class DeepseekV4DecoderLayer(nn.Module):
hc_scale: torch.Tensor,
hc_base: torch.Tensor,
):
post_mix, res_mix, layer_input = self.mhc_pre(
post_mix, res_mix, layer_input = torch.ops.vllm.mhc_pre(
residual=x,
fn=hc_fn,
hc_scale=hc_scale,
@@ -1203,17 +1198,17 @@ class DeepseekV4DecoderLayer(nn.Module):
post: torch.Tensor,
comb: torch.Tensor,
):
return self.mhc_post(x, residual, post, comb)
return torch.ops.vllm.mhc_post(x, residual, post, comb)
def _forward_cuda(
def forward(
self,
x: torch.Tensor,
positions: torch.Tensor,
input_ids: torch.Tensor | None,
post_mix: torch.Tensor | None = None,
res_mix: torch.Tensor | None = None,
residual: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
post_mix: torch.Tensor | None,
res_mix: torch.Tensor | None,
residual: torch.Tensor | None,
) -> torch.Tensor:
if residual is None:
# Run standalone hc_pre on first layer
residual = x
@@ -1221,7 +1216,7 @@ class DeepseekV4DecoderLayer(nn.Module):
x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base
)
else:
residual, post_mix, res_mix, x = self.mhc_fused_post_pre(
residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre(
x,
residual,
post_mix,
@@ -1239,7 +1234,7 @@ class DeepseekV4DecoderLayer(nn.Module):
x = self.attn_norm(x)
x = self.attn(positions, x, None)
residual, post_mix, res_mix, x = self.mhc_fused_post_pre(
residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre(
x,
residual,
post_mix,
@@ -1253,58 +1248,11 @@ class DeepseekV4DecoderLayer(nn.Module):
self.hc_post_alpha,
self.hc_sinkhorn_iters,
)
# ffn_norm is now folded into self.ffn.norm_gate; ffn() takes
# the pre-norm activation directly.
x = self.ffn_norm(x)
x = self.ffn(x, input_ids)
return x, residual, post_mix, res_mix
def _forward_rocm(
self,
x: torch.Tensor,
positions: torch.Tensor,
input_ids: torch.Tensor | None,
post_mix: torch.Tensor | None = None,
res_mix: torch.Tensor | None = None,
residual: torch.Tensor | None = None,
) -> tuple[
torch.Tensor, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None
]:
residual = x
x, post, comb = self.hc_pre(
x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base
)
x = self.attn_norm(x)
x = self.attn(positions, x, None)
x = self.hc_post(x, residual, post, comb)
residual = x
x, post, comb = self.hc_pre(
x, self.hc_ffn_fn, self.hc_ffn_scale, self.hc_ffn_base
)
# ffn_norm is now folded into self.ffn.norm_gate; ffn() takes
# the pre-norm activation directly.
x = self.ffn(x, input_ids)
x = self.hc_post(x, residual, post, comb)
return x, None, None, None
def forward(
self,
x: torch.Tensor,
positions: torch.Tensor,
input_ids: torch.Tensor | None,
post_mix: torch.Tensor | None = None,
res_mix: torch.Tensor | None = None,
residual: torch.Tensor | None = None,
) -> tuple[
torch.Tensor, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None
]:
if current_platform.is_rocm():
return self._forward_rocm(
x, positions, input_ids, post_mix, res_mix, residual
)
return self._forward_cuda(x, positions, input_ids, post_mix, res_mix, residual)
@support_torch_compile
class DeepseekV4Model(nn.Module):
@@ -1394,7 +1342,7 @@ class DeepseekV4Model(nn.Module):
torch.empty(1, dtype=torch.float32),
requires_grad=False,
)
self.hc_head_op = HCHeadOp()
# Pre-hc_head residual stream buffer for the MTP draft. Stable
# address (outside the cudagraph pool) so the copy_ in forward()
# refreshes it correctly across captured shapes.
@@ -1464,7 +1412,7 @@ class DeepseekV4Model(nn.Module):
res_mix,
residual,
)
if layer is not None and current_platform.is_cuda():
else:
hidden_states = layer.hc_post(hidden_states, residual, post_mix, res_mix)
if not get_pp_group().is_last_rank:
@@ -1474,7 +1422,7 @@ class DeepseekV4Model(nn.Module):
num_tokens = hidden_states.shape[0]
self._mtp_hidden_buffer[:num_tokens].copy_(hidden_states.flatten(1))
hidden_states = self.hc_head_op(
hidden_states = hc_head(
hidden_states,
self.hc_head_fn,
self.hc_head_scale,
@@ -1603,6 +1551,36 @@ class DeepseekV4Model(nn.Module):
layer.ffn.finalize_mega_moe_weights()
@torch.compile(backend=current_platform.simple_compile_backend)
def hc_head(
    hidden_states: torch.Tensor,
    hc_fn: torch.Tensor,
    hc_scale: torch.Tensor,
    hc_base: torch.Tensor,
    rms_norm_eps: float,
    hc_eps: float,
) -> torch.Tensor:
    """Run the fused ``hc_head`` kernel over ``hidden_states``.

    The trailing two dimensions of ``hidden_states`` are read as
    ``(hc_mult, hidden_size)``. All leading dimensions are flattened into a
    single token axis for the kernel call and restored on the way out.

    Args:
        hidden_states: Input whose last two dims are ``(hc_mult, hidden_size)``.
        hc_fn: Forwarded unchanged to the fused kernel.
        hc_scale: Forwarded unchanged to the fused kernel.
        hc_base: Forwarded unchanged to the fused kernel.
        rms_norm_eps: Forwarded unchanged to the fused kernel.
        hc_eps: Forwarded unchanged to the fused kernel.

    Returns:
        A bfloat16 tensor on the input's device, shaped
        ``(*leading_dims, hidden_size)``.
    """
    lead_dims = hidden_states.shape[:-2]
    n_streams = hidden_states.shape[-2]
    width = hidden_states.shape[-1]

    # ``view`` (not ``reshape``) on purpose: it never copies.
    flat_states = hidden_states.view(-1, n_streams, width)

    result = torch.empty(
        (flat_states.shape[0], width),
        dtype=torch.bfloat16,
        device=hidden_states.device,
    )
    # NOTE(review): the op's return value is ignored, so the kernel is
    # presumably expected to fill ``result`` in place -- confirm against the
    # op registration.
    torch.ops.vllm.hc_head_fused_kernel(
        flat_states,
        hc_fn,
        hc_scale,
        hc_base,
        result,
        width,
        rms_norm_eps,
        hc_eps,
        n_streams,
    )
    return result.view(*lead_dims, width)
def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
if expert_dtype == "fp4":
# MXFP4 experts use Mxfp4MoEMethod, which registers scales as
@@ -1632,13 +1610,7 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
orig_to_new_suffix={
"head.weight": "lm_head.weight",
"embed.weight": "embed_tokens.weight",
# Pre-MoE norm + gate are now owned by ``DeepseekV4MoE.norm_gate``
# (see NormGatedLinear).
".ffn_norm.weight": ".ffn.norm_gate.norm.weight",
".ffn.gate.weight": ".ffn.norm_gate.gate.weight",
".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias",
# Hash MoE table also moved off the inner gate.
".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid",
".ffn.gate.bias": ".ffn.gate.e_score_correction_bias",
},
orig_to_new_substr={
".attn.compressor.": ".attn.mla_attn.compressor.",
@@ -1650,7 +1622,7 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
"""Weight mapper for NVFP4 (ModelOpt) DeepSeek-V4 checkpoints.
NVFP4 checkpoints use different key naming than the upstream MXFP4 format:
NVFP4 checkpoints use different key naming than the default MXFP4 format:
- ``self_attn`` prefix instead of ``attn``
- ``mlp`` prefix instead of ``ffn``
- Expert weights: gate_proj/up_proj/down_proj (not w1/w3/w2)
@@ -1663,7 +1635,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
re.compile(r"(\.experts\.\d+\.)up_proj\."): r"\1w3.",
re.compile(r"(\.experts\.\d+\.)down_proj\."): r"\1w2.",
}
return WeightsMapper(
orig_to_new_prefix={
"layers.": "model.layers.",
@@ -1673,22 +1644,13 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
"mtp.": "model.mtp.",
},
orig_to_new_regex=expert_rename_regex,
# No suffix renames needed — NVFP4 checkpoint uses
# .weight_scale / .weight_scale_2 / .input_scale directly.
orig_to_new_suffix={
"head.weight": "lm_head.weight",
"embed.weight": "embed_tokens.weight",
# Pre-MoE norm + gate are now owned by DeepseekV4MoE.norm_gate
".ffn_norm.weight": ".ffn.norm_gate.norm.weight",
".ffn.gate.weight": ".ffn.norm_gate.gate.weight",
".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias",
".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid",
".ffn.gate.bias": ".ffn.gate.e_score_correction_bias",
},
# Specific renames MUST come before general ones (applied in order).
orig_to_new_substr={
# Indexer params (MUST come before .self_attn.compressor.
# so indexer keys are captured before the compressor prefix
# rewrite moves them under mla_attn.compressor).
# Indexer params (MUST come before general compressor renames)
".self_attn.compressor.indexer.q_b_proj.":
".attn.indexer.wq_b.",
".self_attn.compressor.indexer.weights_proj.":
@@ -1701,14 +1663,13 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
".attn.indexer.compressor.wgate.",
".self_attn.compressor.indexer.position_bias":
".attn.indexer.compressor.ape",
# Compressor (non-indexer) renames
# Compressor renames (non-indexer)
"compressor.kv_proj.": "compressor.wkv.",
"compressor.gate_proj.": "compressor.wgate.",
"compressor.kv_norm.": "compressor.norm.",
"compressor.position_bias": "compressor.ape",
# Attention compressor (after indexer renames)
".self_attn.compressor.": ".attn.compressor.",
# Attention projections (specific before .self_attn. → .attn.)
# Attention projections
".self_attn.q_a_proj.": ".attn.wq_a.",
".self_attn.kv_proj.": ".attn.wkv.",
".self_attn.q_b_proj.": ".attn.wq_b.",
@@ -1717,7 +1678,7 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
".self_attn.q_a_norm.": ".attn.q_norm.",
".self_attn.kv_norm.": ".attn.kv_norm.",
".self_attn.sinks": ".attn.attn_sink",
# Shared expert projections (specific before .mlp. → .ffn.)
# Shared experts
".mlp.shared_experts.gate_proj.":
".ffn.shared_experts.w1.",
".mlp.shared_experts.up_proj.":
@@ -1727,7 +1688,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
# General renames
".mlp.": ".ffn.",
".self_attn.": ".attn.",
# Layer norms
"input_layernorm.": "attn_norm.",
"post_attention_layernorm.": "ffn_norm.",
# HC params

View File

@@ -14,12 +14,6 @@ import torch.nn.functional as F
from transformers import DeepseekV2Config, DeepseekV3Config
import vllm.envs as envs
try:
from vllm.compilation.breakable_cudagraph import eager_break_during_capture
except ImportError:
# Older vLLM versions don't have this module; use identity decorator
def eager_break_during_capture(fn):
return fn
from vllm.model_executor.layers.linear import (
ReplicatedLinear,
)
@@ -52,7 +46,7 @@ from vllm.logger import init_logger
from vllm.model_executor.custom_op import PluggableLayer
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.deepseek_compressor import DeepseekCompressor
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.platforms import current_platform
@@ -327,7 +321,7 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
z = z.view(num_tokens, self.n_local_groups, self.o_lora_rank)
return self.wo_b(z.flatten(1))
# FP8 wo_a path: fused inverse RoPE + FP8 quant + einsum (SM90 only)
# FP8 wo_a path: fused inverse RoPE + FP8 quant + einsum
o_fp8, o_scale = fused_inv_rope_fp8_quant(
o,
positions,
@@ -572,7 +566,7 @@ def _apply_inv_rope_bf16(
) -> torch.Tensor:
"""Apply inverse RoPE to attention output in BF16.
Inverse RoPE is just RoPE with cos → cos, sin -sin.
Inverse RoPE is just RoPE with sin -> -sin.
Uses GPT-J style (interleaved) rotary embedding.
"""
if rope_dim == 0 or o.numel() == 0:
@@ -588,7 +582,7 @@ def _apply_inv_rope_bf16(
rope = o_f32[..., nope_dim:]
y_even = rope[..., 0::2]
y_odd = rope[..., 1::2]
# Inverse: sin → -sin (swap signs on the cross terms)
# Inverse: sin → -sin (swap signs on cross terms)
rope_out = torch.stack(
(y_even * cos + y_odd * sin, y_odd * cos - y_even * sin),
dim=-1,
@@ -598,7 +592,6 @@ def _apply_inv_rope_bf16(
return o_f32.to(o.dtype)
@eager_break_during_capture
def deepseek_v4_attention(
hidden_states: torch.Tensor,
positions: torch.Tensor,
@@ -1148,8 +1141,6 @@ class DeepseekV4Indexer(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.wq_b",
)
# weights_proj is NVFP4-quantized in the modelopt checkpoint.
# Upstream uses quant_config=None for the MXFP4 checkpoint.
self.weights_proj = ReplicatedLinear(
hidden_size,
self.n_head,
@@ -1157,6 +1148,7 @@ class DeepseekV4Indexer(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.weights_proj",
)
self.k_norm = LayerNorm(self.head_dim, eps=1e-6)
self.softmax_scale = self.head_dim**-0.5
self.scale_fmt = "ue8m0"