Patch from the Docker image's vLLM (0.20.2rc1) instead of newer upstream
The nightly Docker image uses an older vLLM that doesn't have NormGateLinear, breakable_cudagraph, etc. Patching the Docker image's own files ensures compatibility.
- deepseek_v4.py: patches from the Docker image + NVFP4 weights mapper + wo_a in BF16
- deepseek_v4_attention.py: patches from the Docker image + inverse RoPE in BF16 + weights_proj quantization + removed QuantFP8/GroupShape imports
This commit is contained in:
@@ -23,14 +23,11 @@ from vllm.model_executor.layers.deepseek_v4_attention import (
|
||||
DeepseekV4MLAModules,
|
||||
DeepseekV4MultiHeadLatentAttentionWrapper,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear
|
||||
from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
|
||||
from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import (
|
||||
fused_topk_bias,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.router.norm_gate_linear import (
|
||||
NormGateLinear,
|
||||
)
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
@@ -38,12 +35,6 @@ from vllm.model_executor.layers.linear import (
|
||||
RowParallelLinear,
|
||||
)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.mhc import (
|
||||
HCHeadOp,
|
||||
MHCFusedPostPreOp,
|
||||
MHCPostOp,
|
||||
MHCPreOp,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization import (
|
||||
QuantizationConfig,
|
||||
QuantizationMethods,
|
||||
@@ -758,23 +749,23 @@ class DeepseekV4MoE(nn.Module):
|
||||
"deep_gemm_mega_moe for this checkpoint."
|
||||
)
|
||||
|
||||
# Fused RMSNorm + gate: owns both ffn_norm and the gate matmul.
|
||||
self.norm_gate = NormGateLinear(
|
||||
hidden_size=config.hidden_size,
|
||||
num_experts=config.n_routed_experts,
|
||||
rms_eps=config.rms_norm_eps,
|
||||
prefix=f"{prefix}.norm_gate",
|
||||
self.gate = GateLinear(
|
||||
config.hidden_size,
|
||||
config.n_routed_experts,
|
||||
out_dtype=torch.float32,
|
||||
bias=False,
|
||||
prefix=f"{prefix}.gate",
|
||||
)
|
||||
# Routing-side tensors live on ``norm_gate`` directly (not on the
|
||||
# inner gate); they are initialized to None in NormGatedLinear and
|
||||
# populated below depending on the MoE variant.
|
||||
self.gate.e_score_correction_bias = None
|
||||
self.gate.tid2eid = None
|
||||
is_hash_moe = extract_layer_index(prefix) < config.num_hash_layers
|
||||
self.hash_indices_dtype = torch.int64 if self.use_mega_moe else torch.int32
|
||||
|
||||
if is_hash_moe:
|
||||
# hash MoE doesn't use e_score_correction_bias
|
||||
# Use randint instead of empty to avoid garbage values causing
|
||||
# invalid memory access in dummy mode (--load-format="dummy")
|
||||
self.norm_gate.tid2eid = nn.Parameter(
|
||||
self.gate.tid2eid = nn.Parameter(
|
||||
torch.randint(
|
||||
0,
|
||||
config.n_routed_experts,
|
||||
@@ -784,7 +775,7 @@ class DeepseekV4MoE(nn.Module):
|
||||
requires_grad=False,
|
||||
)
|
||||
elif getattr(config, "topk_method", None) == "noaux_tc":
|
||||
self.norm_gate.e_score_correction_bias = nn.Parameter(
|
||||
self.gate.e_score_correction_bias = nn.Parameter(
|
||||
torch.empty(config.n_routed_experts, dtype=torch.float32),
|
||||
requires_grad=False,
|
||||
)
|
||||
@@ -847,9 +838,10 @@ class DeepseekV4MoE(nn.Module):
|
||||
self.n_local_experts = config.n_routed_experts // self.tp_size
|
||||
self.experts_start_idx = self.tp_rank * self.n_local_experts
|
||||
self.experts_end_idx = self.experts_start_idx + self.n_local_experts
|
||||
# We don't pass `gate` into FusedMoE
|
||||
|
||||
self.experts = FusedMoE(
|
||||
shared_experts=self.shared_experts,
|
||||
gate=self.gate,
|
||||
num_experts=config.n_routed_experts,
|
||||
top_k=config.num_experts_per_tok,
|
||||
hidden_size=config.hidden_size,
|
||||
@@ -859,8 +851,8 @@ class DeepseekV4MoE(nn.Module):
|
||||
prefix=f"{prefix}.experts",
|
||||
scoring_func=self.scoring_func,
|
||||
routed_scaling_factor=self.routed_scaling_factor,
|
||||
e_score_correction_bias=self.norm_gate.e_score_correction_bias,
|
||||
hash_indices_table=self.norm_gate.tid2eid,
|
||||
e_score_correction_bias=self.gate.e_score_correction_bias,
|
||||
hash_indices_table=self.gate.tid2eid,
|
||||
swiglu_limit=self.swiglu_limit,
|
||||
router_logits_dtype=torch.float32,
|
||||
)
|
||||
@@ -868,40 +860,40 @@ class DeepseekV4MoE(nn.Module):
|
||||
def forward(
|
||||
self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None
|
||||
) -> torch.Tensor:
|
||||
if self.norm_gate.tid2eid is not None and input_ids is None:
|
||||
if self.gate.tid2eid is not None and input_ids is None:
|
||||
raise ValueError("DeepSeek V4 hash MoE routing requires input_ids.")
|
||||
|
||||
if not self.use_mega_moe:
|
||||
return self._forward_fused_moe(hidden_states, input_ids)
|
||||
|
||||
org_shape = hidden_states.shape
|
||||
normed_x, router_logits = self.norm_gate(hidden_states)
|
||||
router_logits, _ = self.gate(hidden_states)
|
||||
topk_weights, topk_ids = fused_topk_bias(
|
||||
hidden_states=normed_x,
|
||||
hidden_states=hidden_states,
|
||||
gating_output=router_logits,
|
||||
scoring_func=self.scoring_func,
|
||||
e_score_correction_bias=self.norm_gate.e_score_correction_bias.data
|
||||
if self.norm_gate.e_score_correction_bias is not None
|
||||
e_score_correction_bias=self.gate.e_score_correction_bias.data
|
||||
if self.gate.e_score_correction_bias is not None
|
||||
else None,
|
||||
topk=self.n_activated_experts,
|
||||
renormalize=self.renormalize,
|
||||
indices_type=self.hash_indices_dtype,
|
||||
input_tokens=input_ids,
|
||||
hash_indices_table=self.norm_gate.tid2eid,
|
||||
hash_indices_table=self.gate.tid2eid,
|
||||
routed_scaling_factor=self.routed_scaling_factor,
|
||||
)
|
||||
activation_clamp = (
|
||||
float(self.swiglu_limit) if self.swiglu_limit is not None else None
|
||||
)
|
||||
final_hidden_states = self.experts(
|
||||
normed_x,
|
||||
hidden_states,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
activation_clamp=activation_clamp,
|
||||
)
|
||||
|
||||
if self.shared_experts is not None:
|
||||
shared_output = self.shared_experts(normed_x)
|
||||
shared_output = self.shared_experts(hidden_states)
|
||||
final_hidden_states += shared_output
|
||||
|
||||
return final_hidden_states.view(org_shape)
|
||||
@@ -909,14 +901,21 @@ class DeepseekV4MoE(nn.Module):
|
||||
def _forward_fused_moe(
|
||||
self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None
|
||||
) -> torch.Tensor:
|
||||
assert not self.experts.is_internal_router
|
||||
org_shape = hidden_states.shape
|
||||
normed_x, router_logits = self.norm_gate(hidden_states)
|
||||
final_hidden_states = self.experts(
|
||||
hidden_states=normed_x,
|
||||
router_logits=router_logits,
|
||||
input_ids=input_ids,
|
||||
)
|
||||
if self.experts.is_internal_router:
|
||||
# In this case, the gate/router runs inside the FusedMoE class
|
||||
final_hidden_states = self.experts(
|
||||
hidden_states=hidden_states,
|
||||
router_logits=hidden_states,
|
||||
input_ids=input_ids,
|
||||
)
|
||||
else:
|
||||
router_logits, _ = self.gate(hidden_states)
|
||||
final_hidden_states = self.experts(
|
||||
hidden_states=hidden_states,
|
||||
router_logits=router_logits,
|
||||
input_ids=input_ids,
|
||||
)
|
||||
|
||||
return final_hidden_states.view(org_shape)
|
||||
|
||||
@@ -1122,8 +1121,7 @@ class DeepseekV4DecoderLayer(nn.Module):
|
||||
self.ffn = DeepseekV4MoE(vllm_config, prefix=f"{prefix}.ffn")
|
||||
|
||||
self.attn_norm = RMSNorm(self.hidden_size, self.rms_norm_eps)
|
||||
# ``ffn_norm`` is owned by ``self.ffn.norm_gate`` (fused with the
|
||||
# router gate matmul); see ``NormGatedLinear``.
|
||||
self.ffn_norm = RMSNorm(self.hidden_size, self.rms_norm_eps)
|
||||
self.hc_mult = config.hc_mult
|
||||
self.hc_sinkhorn_iters = config.hc_sinkhorn_iters
|
||||
self.hc_eps = config.hc_eps
|
||||
@@ -1172,9 +1170,6 @@ class DeepseekV4DecoderLayer(nn.Module):
|
||||
),
|
||||
requires_grad=False,
|
||||
)
|
||||
self.mhc_pre = MHCPreOp()
|
||||
self.mhc_post = MHCPostOp()
|
||||
self.mhc_fused_post_pre = MHCFusedPostPreOp()
|
||||
|
||||
def hc_pre(
|
||||
self,
|
||||
@@ -1183,7 +1178,7 @@ class DeepseekV4DecoderLayer(nn.Module):
|
||||
hc_scale: torch.Tensor,
|
||||
hc_base: torch.Tensor,
|
||||
):
|
||||
post_mix, res_mix, layer_input = self.mhc_pre(
|
||||
post_mix, res_mix, layer_input = torch.ops.vllm.mhc_pre(
|
||||
residual=x,
|
||||
fn=hc_fn,
|
||||
hc_scale=hc_scale,
|
||||
@@ -1203,17 +1198,17 @@ class DeepseekV4DecoderLayer(nn.Module):
|
||||
post: torch.Tensor,
|
||||
comb: torch.Tensor,
|
||||
):
|
||||
return self.mhc_post(x, residual, post, comb)
|
||||
return torch.ops.vllm.mhc_post(x, residual, post, comb)
|
||||
|
||||
def _forward_cuda(
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
input_ids: torch.Tensor | None,
|
||||
post_mix: torch.Tensor | None = None,
|
||||
res_mix: torch.Tensor | None = None,
|
||||
residual: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
post_mix: torch.Tensor | None,
|
||||
res_mix: torch.Tensor | None,
|
||||
residual: torch.Tensor | None,
|
||||
) -> torch.Tensor:
|
||||
if residual is None:
|
||||
# Run standalone hc_pre on first layer
|
||||
residual = x
|
||||
@@ -1221,7 +1216,7 @@ class DeepseekV4DecoderLayer(nn.Module):
|
||||
x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base
|
||||
)
|
||||
else:
|
||||
residual, post_mix, res_mix, x = self.mhc_fused_post_pre(
|
||||
residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre(
|
||||
x,
|
||||
residual,
|
||||
post_mix,
|
||||
@@ -1239,7 +1234,7 @@ class DeepseekV4DecoderLayer(nn.Module):
|
||||
x = self.attn_norm(x)
|
||||
x = self.attn(positions, x, None)
|
||||
|
||||
residual, post_mix, res_mix, x = self.mhc_fused_post_pre(
|
||||
residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre(
|
||||
x,
|
||||
residual,
|
||||
post_mix,
|
||||
@@ -1253,58 +1248,11 @@ class DeepseekV4DecoderLayer(nn.Module):
|
||||
self.hc_post_alpha,
|
||||
self.hc_sinkhorn_iters,
|
||||
)
|
||||
# ffn_norm is now folded into self.ffn.norm_gate; ffn() takes
|
||||
# the pre-norm activation directly.
|
||||
|
||||
x = self.ffn_norm(x)
|
||||
x = self.ffn(x, input_ids)
|
||||
return x, residual, post_mix, res_mix
|
||||
|
||||
def _forward_rocm(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
input_ids: torch.Tensor | None,
|
||||
post_mix: torch.Tensor | None = None,
|
||||
res_mix: torch.Tensor | None = None,
|
||||
residual: torch.Tensor | None = None,
|
||||
) -> tuple[
|
||||
torch.Tensor, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None
|
||||
]:
|
||||
residual = x
|
||||
x, post, comb = self.hc_pre(
|
||||
x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base
|
||||
)
|
||||
x = self.attn_norm(x)
|
||||
x = self.attn(positions, x, None)
|
||||
x = self.hc_post(x, residual, post, comb)
|
||||
|
||||
residual = x
|
||||
x, post, comb = self.hc_pre(
|
||||
x, self.hc_ffn_fn, self.hc_ffn_scale, self.hc_ffn_base
|
||||
)
|
||||
# ffn_norm is now folded into self.ffn.norm_gate; ffn() takes
|
||||
# the pre-norm activation directly.
|
||||
x = self.ffn(x, input_ids)
|
||||
x = self.hc_post(x, residual, post, comb)
|
||||
return x, None, None, None
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
input_ids: torch.Tensor | None,
|
||||
post_mix: torch.Tensor | None = None,
|
||||
res_mix: torch.Tensor | None = None,
|
||||
residual: torch.Tensor | None = None,
|
||||
) -> tuple[
|
||||
torch.Tensor, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None
|
||||
]:
|
||||
if current_platform.is_rocm():
|
||||
return self._forward_rocm(
|
||||
x, positions, input_ids, post_mix, res_mix, residual
|
||||
)
|
||||
|
||||
return self._forward_cuda(x, positions, input_ids, post_mix, res_mix, residual)
|
||||
|
||||
|
||||
@support_torch_compile
|
||||
class DeepseekV4Model(nn.Module):
|
||||
@@ -1394,7 +1342,7 @@ class DeepseekV4Model(nn.Module):
|
||||
torch.empty(1, dtype=torch.float32),
|
||||
requires_grad=False,
|
||||
)
|
||||
self.hc_head_op = HCHeadOp()
|
||||
|
||||
# Pre-hc_head residual stream buffer for the MTP draft. Stable
|
||||
# address (outside the cudagraph pool) so the copy_ in forward()
|
||||
# refreshes it correctly across captured shapes.
|
||||
@@ -1464,7 +1412,7 @@ class DeepseekV4Model(nn.Module):
|
||||
res_mix,
|
||||
residual,
|
||||
)
|
||||
if layer is not None and current_platform.is_cuda():
|
||||
else:
|
||||
hidden_states = layer.hc_post(hidden_states, residual, post_mix, res_mix)
|
||||
|
||||
if not get_pp_group().is_last_rank:
|
||||
@@ -1474,7 +1422,7 @@ class DeepseekV4Model(nn.Module):
|
||||
num_tokens = hidden_states.shape[0]
|
||||
self._mtp_hidden_buffer[:num_tokens].copy_(hidden_states.flatten(1))
|
||||
|
||||
hidden_states = self.hc_head_op(
|
||||
hidden_states = hc_head(
|
||||
hidden_states,
|
||||
self.hc_head_fn,
|
||||
self.hc_head_scale,
|
||||
@@ -1603,6 +1551,36 @@ class DeepseekV4Model(nn.Module):
|
||||
layer.ffn.finalize_mega_moe_weights()
|
||||
|
||||
|
||||
@torch.compile(backend=current_platform.simple_compile_backend)
|
||||
def hc_head(
|
||||
hidden_states: torch.Tensor,
|
||||
hc_fn: torch.Tensor,
|
||||
hc_scale: torch.Tensor,
|
||||
hc_base: torch.Tensor,
|
||||
rms_norm_eps: float,
|
||||
hc_eps: float,
|
||||
) -> torch.Tensor:
|
||||
hc_mult, hidden_size = hidden_states.shape[-2:]
|
||||
outer_shape = hidden_states.shape[:-2]
|
||||
hs_flat = hidden_states.view(-1, hc_mult, hidden_size)
|
||||
num_tokens = hs_flat.shape[0]
|
||||
out = torch.empty(
|
||||
num_tokens, hidden_size, dtype=torch.bfloat16, device=hidden_states.device
|
||||
)
|
||||
torch.ops.vllm.hc_head_fused_kernel(
|
||||
hs_flat,
|
||||
hc_fn,
|
||||
hc_scale,
|
||||
hc_base,
|
||||
out,
|
||||
hidden_size,
|
||||
rms_norm_eps,
|
||||
hc_eps,
|
||||
hc_mult,
|
||||
)
|
||||
return out.view(*outer_shape, hidden_size)
|
||||
|
||||
|
||||
def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
|
||||
if expert_dtype == "fp4":
|
||||
# MXFP4 experts use Mxfp4MoEMethod, which registers scales as
|
||||
@@ -1632,13 +1610,7 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
|
||||
orig_to_new_suffix={
|
||||
"head.weight": "lm_head.weight",
|
||||
"embed.weight": "embed_tokens.weight",
|
||||
# Pre-MoE norm + gate are now owned by ``DeepseekV4MoE.norm_gate``
|
||||
# (see NormGatedLinear).
|
||||
".ffn_norm.weight": ".ffn.norm_gate.norm.weight",
|
||||
".ffn.gate.weight": ".ffn.norm_gate.gate.weight",
|
||||
".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias",
|
||||
# Hash MoE table also moved off the inner gate.
|
||||
".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid",
|
||||
".ffn.gate.bias": ".ffn.gate.e_score_correction_bias",
|
||||
},
|
||||
orig_to_new_substr={
|
||||
".attn.compressor.": ".attn.mla_attn.compressor.",
|
||||
@@ -1650,7 +1622,7 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
|
||||
def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
|
||||
"""Weight mapper for NVFP4 (ModelOpt) DeepSeek-V4 checkpoints.
|
||||
|
||||
NVFP4 checkpoints use different key naming than the upstream MXFP4 format:
|
||||
NVFP4 checkpoints use different key naming than the default MXFP4 format:
|
||||
- ``self_attn`` prefix instead of ``attn``
|
||||
- ``mlp`` prefix instead of ``ffn``
|
||||
- Expert weights: gate_proj/up_proj/down_proj (not w1/w3/w2)
|
||||
@@ -1663,7 +1635,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
|
||||
re.compile(r"(\.experts\.\d+\.)up_proj\."): r"\1w3.",
|
||||
re.compile(r"(\.experts\.\d+\.)down_proj\."): r"\1w2.",
|
||||
}
|
||||
|
||||
return WeightsMapper(
|
||||
orig_to_new_prefix={
|
||||
"layers.": "model.layers.",
|
||||
@@ -1673,22 +1644,13 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
|
||||
"mtp.": "model.mtp.",
|
||||
},
|
||||
orig_to_new_regex=expert_rename_regex,
|
||||
# No suffix renames needed — NVFP4 checkpoint uses
|
||||
# .weight_scale / .weight_scale_2 / .input_scale directly.
|
||||
orig_to_new_suffix={
|
||||
"head.weight": "lm_head.weight",
|
||||
"embed.weight": "embed_tokens.weight",
|
||||
# Pre-MoE norm + gate are now owned by DeepseekV4MoE.norm_gate
|
||||
".ffn_norm.weight": ".ffn.norm_gate.norm.weight",
|
||||
".ffn.gate.weight": ".ffn.norm_gate.gate.weight",
|
||||
".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias",
|
||||
".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid",
|
||||
".ffn.gate.bias": ".ffn.gate.e_score_correction_bias",
|
||||
},
|
||||
# Specific renames MUST come before general ones (applied in order).
|
||||
orig_to_new_substr={
|
||||
# Indexer params (MUST come before .self_attn.compressor.
|
||||
# so indexer keys are captured before the compressor prefix
|
||||
# rewrite moves them under mla_attn.compressor).
|
||||
# Indexer params (MUST come before general compressor renames)
|
||||
".self_attn.compressor.indexer.q_b_proj.":
|
||||
".attn.indexer.wq_b.",
|
||||
".self_attn.compressor.indexer.weights_proj.":
|
||||
@@ -1701,14 +1663,13 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
|
||||
".attn.indexer.compressor.wgate.",
|
||||
".self_attn.compressor.indexer.position_bias":
|
||||
".attn.indexer.compressor.ape",
|
||||
# Compressor (non-indexer) renames
|
||||
# Compressor renames (non-indexer)
|
||||
"compressor.kv_proj.": "compressor.wkv.",
|
||||
"compressor.gate_proj.": "compressor.wgate.",
|
||||
"compressor.kv_norm.": "compressor.norm.",
|
||||
"compressor.position_bias": "compressor.ape",
|
||||
# Attention compressor (after indexer renames)
|
||||
".self_attn.compressor.": ".attn.compressor.",
|
||||
# Attention projections (specific before .self_attn. → .attn.)
|
||||
# Attention projections
|
||||
".self_attn.q_a_proj.": ".attn.wq_a.",
|
||||
".self_attn.kv_proj.": ".attn.wkv.",
|
||||
".self_attn.q_b_proj.": ".attn.wq_b.",
|
||||
@@ -1717,7 +1678,7 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
|
||||
".self_attn.q_a_norm.": ".attn.q_norm.",
|
||||
".self_attn.kv_norm.": ".attn.kv_norm.",
|
||||
".self_attn.sinks": ".attn.attn_sink",
|
||||
# Shared expert projections (specific before .mlp. → .ffn.)
|
||||
# Shared experts
|
||||
".mlp.shared_experts.gate_proj.":
|
||||
".ffn.shared_experts.w1.",
|
||||
".mlp.shared_experts.up_proj.":
|
||||
@@ -1727,7 +1688,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
|
||||
# General renames
|
||||
".mlp.": ".ffn.",
|
||||
".self_attn.": ".attn.",
|
||||
# Layer norms
|
||||
"input_layernorm.": "attn_norm.",
|
||||
"post_attention_layernorm.": "ffn_norm.",
|
||||
# HC params
|
||||
|
||||
@@ -14,12 +14,6 @@ import torch.nn.functional as F
|
||||
from transformers import DeepseekV2Config, DeepseekV3Config
|
||||
|
||||
import vllm.envs as envs
|
||||
try:
|
||||
from vllm.compilation.breakable_cudagraph import eager_break_during_capture
|
||||
except ImportError:
|
||||
# Older vLLM versions don't have this module; use identity decorator
|
||||
def eager_break_during_capture(fn):
|
||||
return fn
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ReplicatedLinear,
|
||||
)
|
||||
@@ -52,7 +46,7 @@ from vllm.logger import init_logger
|
||||
from vllm.model_executor.custom_op import PluggableLayer
|
||||
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
||||
from vllm.model_executor.layers.deepseek_compressor import DeepseekCompressor
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
@@ -327,7 +321,7 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
|
||||
z = z.view(num_tokens, self.n_local_groups, self.o_lora_rank)
|
||||
return self.wo_b(z.flatten(1))
|
||||
|
||||
# FP8 wo_a path: fused inverse RoPE + FP8 quant + einsum (SM90 only)
|
||||
# FP8 wo_a path: fused inverse RoPE + FP8 quant + einsum
|
||||
o_fp8, o_scale = fused_inv_rope_fp8_quant(
|
||||
o,
|
||||
positions,
|
||||
@@ -572,7 +566,7 @@ def _apply_inv_rope_bf16(
|
||||
) -> torch.Tensor:
|
||||
"""Apply inverse RoPE to attention output in BF16.
|
||||
|
||||
Inverse RoPE is just RoPE with cos → cos, sin → -sin.
|
||||
Inverse RoPE is just RoPE with sin -> -sin.
|
||||
Uses GPT-J style (interleaved) rotary embedding.
|
||||
"""
|
||||
if rope_dim == 0 or o.numel() == 0:
|
||||
@@ -588,7 +582,7 @@ def _apply_inv_rope_bf16(
|
||||
rope = o_f32[..., nope_dim:]
|
||||
y_even = rope[..., 0::2]
|
||||
y_odd = rope[..., 1::2]
|
||||
# Inverse: sin → -sin (swap signs on the cross terms)
|
||||
# Inverse: sin → -sin (swap signs on cross terms)
|
||||
rope_out = torch.stack(
|
||||
(y_even * cos + y_odd * sin, y_odd * cos - y_even * sin),
|
||||
dim=-1,
|
||||
@@ -598,7 +592,6 @@ def _apply_inv_rope_bf16(
|
||||
return o_f32.to(o.dtype)
|
||||
|
||||
|
||||
@eager_break_during_capture
|
||||
def deepseek_v4_attention(
|
||||
hidden_states: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
@@ -1148,8 +1141,6 @@ class DeepseekV4Indexer(nn.Module):
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.wq_b",
|
||||
)
|
||||
# weights_proj is NVFP4-quantized in the modelopt checkpoint.
|
||||
# Upstream uses quant_config=None for the MXFP4 checkpoint.
|
||||
self.weights_proj = ReplicatedLinear(
|
||||
hidden_size,
|
||||
self.n_head,
|
||||
@@ -1157,6 +1148,7 @@ class DeepseekV4Indexer(nn.Module):
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.weights_proj",
|
||||
)
|
||||
self.k_norm = LayerNorm(self.head_dim, eps=1e-6)
|
||||
self.softmax_scale = self.head_dim**-0.5
|
||||
|
||||
self.scale_fmt = "ue8m0"
|
||||
|
||||
Reference in New Issue
Block a user