Weight mapper fixes: - Reorder substr renames: compressor renames first, then .self_attn.compressor. → .attn.mla_attn.compressor., then indexer renames (so indexer keys end up under mla_attn after the compressor rename already fired) - Add compressor param renames: kv_proj→wkv, gate_proj→wgate, kv_norm→norm, position_bias→ape (checkpoint uses NVFP4 naming, model uses internal names) - Add indexer param renames: q_b_proj→wq_b, kv_proj→compressor.wkv, gate_proj→compressor.wgate, kv_norm→k_norm, position_bias→compressor.ape, weights_proj stays (structural: compressor.indexer → indexer.compressor) - Remove broken suffix renames (already fixed in prior commit) Model architecture fixes: - Patch deepseek_compressor.py to pass quant_config (was None, but NVFP4 checkpoint has quantized compressor weights with input_scale/weight_scale) - Patch deepseek_v4_attention.py indexer: weights_proj now uses quant_config (was None, but checkpoint has quantized weights) - Add indexer.compressor.fused_wkv_wgate stacking in load_weights Infrastructure: - Add deepseek_compressor.py to Dockerfile - Force MoE backend to flashinfer_cutedsl (was auto-selecting FLASHINFER_TRTLLM) - Update unit test to 50 cases (compressor + indexer + quantization scales)
205 lines
9.5 KiB
Python
205 lines
9.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Unit test for the NVFP4 weights mapper.
|
|
|
|
Validates that checkpoint key names from our ModelOpt-quantized
|
|
DeepSeek-V4-Pro checkpoint are correctly mapped to vLLM model
|
|
parameter names.
|
|
|
|
This can run WITHOUT vLLM or CUDA — it only tests the mapper logic.
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
from typing import Optional
|
|
|
|
|
|
class WeightsMapper:
    """Simplified WeightsMapper for testing.

    Stores four rename tables and applies them to a checkpoint key in a
    fixed order: prefix, regex, suffix, then substring.

    Args:
        orig_to_new_prefix: ``{old_prefix: new_prefix}``. Only the first
            entry (in dict order) matching the start of the name is applied.
        orig_to_new_regex: ``{compiled_pattern: replacement}``. Keys must
            expose ``.sub`` (i.e. be compiled patterns); every pattern is
            applied in dict order.
        orig_to_new_suffix: ``{old_suffix: new_suffix}``. Only the first
            entry matching the end of the name is applied.
        orig_to_new_substr: ``{old: new}``. Every entry is tried in dict
            order and rewrites the first occurrence of ``old``.

    A missing (``None``) or empty table is treated as "no rules".
    """

    def __init__(
        self,
        orig_to_new_prefix: Optional[dict] = None,
        orig_to_new_regex: Optional[dict] = None,
        orig_to_new_suffix: Optional[dict] = None,
        orig_to_new_substr: Optional[dict] = None,
    ):
        self.prefix_map = orig_to_new_prefix or {}
        self.regex_map = orig_to_new_regex or {}
        self.suffix_map = orig_to_new_suffix or {}
        self.substr_map = orig_to_new_substr or {}

    def map_name(self, name: str) -> str:
        """Return ``name`` after applying all four rename stages in order.

        Args:
            name: Checkpoint key to translate.

        Returns:
            The translated key; unchanged if no rule matches.
        """
        # 1. Prefix — first matching rule wins, later rules are skipped.
        for old, new in self.prefix_map.items():
            if name.startswith(old):
                name = new + name[len(old):]
                break

        # 2. Regex — every pattern is applied, each to the previous output.
        for pattern, replacement in self.regex_map.items():
            name = pattern.sub(replacement, name)

        # 3. Suffix — first matching rule wins.
        for old, new in self.suffix_map.items():
            if name.endswith(old):
                # Explicit end index: ``name[:-len(old)]`` would collapse to
                # ``name[:0]`` (empty string) when ``old`` is "".
                name = name[: len(name) - len(old)] + new
                break

        # 4. Substr (ordered dict — specific before general).
        # No ``break`` here: rules chain, so a later rule sees the output of
        # an earlier one. Only the first occurrence of ``old`` is rewritten.
        for old, new in self.substr_map.items():
            if old in name:
                name = name.replace(old, new, 1)

        return name
|
|
|
|
|
|
def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
    """Exact copy of the mapper from deepseek_v4.py.

    Builds the WeightsMapper under test from four rule tables: top-level
    prefixes, per-expert projection regexes, an (empty) suffix table, and an
    ordered substring table.

    The substring table is order-sensitive because the mapper in this file
    applies every matching substring rule in dict order:

    * The plain compressor renames are keyed on ``compressor.<param>``, so
      they do not fire on indexer keys (``compressor.indexer.<param>``).
    * ``.self_attn.compressor.`` is rewritten next, which places both
      compressor and indexer keys under ``.attn.mla_attn.``.
    * The indexer renames then drop the leading ``compressor.`` so indexer
      keys end up at ``.attn.mla_attn.indexer...``.
    * The catch-all ``.mlp.`` / ``.self_attn.`` rules come last so the more
      specific rules above get the first chance to match.

    Returns:
        A WeightsMapper configured with the rules described above.
    """
    # Routed experts only: the ``\d+`` requires a numeric expert index, so
    # ``shared_experts.gate_proj`` is left for the substring table.
    expert_rename_regex = {
        re.compile(r"(\.experts\.\d+\.)gate_proj\."): r"\1w1.",
        re.compile(r"(\.experts\.\d+\.)up_proj\."): r"\1w3.",
        re.compile(r"(\.experts\.\d+\.)down_proj\."): r"\1w2.",
    }

    # Intentionally empty; still passed through to the mapper below.
    suffix_renames = {}

    substr_renames = {
        # === Compressor (non-indexer) NVFP4 renames ===
        "compressor.kv_proj.": "compressor.wkv.",
        "compressor.gate_proj.": "compressor.wgate.",
        "compressor.kv_norm.": "compressor.norm.",
        "compressor.position_bias": "compressor.ape",
        # === Attention compressor (before indexer renames) ===
        ".self_attn.compressor.": ".attn.mla_attn.compressor.",
        # === Indexer params ===
        "compressor.indexer.q_b_proj.": "indexer.wq_b.",
        "compressor.indexer.weights_proj.": "indexer.weights_proj.",
        "compressor.indexer.kv_norm.": "indexer.k_norm.",
        "compressor.indexer.kv_proj.": "indexer.compressor.wkv.",
        "compressor.indexer.gate_proj.": "indexer.compressor.wgate.",
        "compressor.indexer.position_bias": "indexer.compressor.ape",
        # === Attention projections ===
        ".self_attn.q_a_proj.": ".attn.wq_a.",
        ".self_attn.kv_proj.": ".attn.wkv.",
        ".self_attn.q_b_proj.": ".attn.wq_b.",
        ".self_attn.o_a_proj.": ".attn.wo_a.",
        ".self_attn.o_b_proj.": ".attn.wo_b.",
        ".self_attn.q_a_norm.": ".attn.q_a_norm.",
        ".self_attn.kv_norm.": ".attn.kv_norm.",
        ".self_attn.sinks": ".attn.sinks",
        # Shared expert projections
        ".mlp.shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
        ".mlp.shared_experts.up_proj.": ".ffn.shared_experts.w3.",
        ".mlp.shared_experts.down_proj.": ".ffn.shared_experts.down_proj.",
        # General renames
        ".mlp.": ".ffn.",
        ".self_attn.": ".attn.",
    }

    return WeightsMapper(
        orig_to_new_prefix={
            "layers.": "model.layers.",
            "embed_tokens.": "model.embed_tokens.",
            "norm.": "model.norm.",
            "hc_head": "model.hc_head",
            "mtp.": "model.mtp.",
        },
        orig_to_new_regex=expert_rename_regex,
        orig_to_new_suffix=suffix_renames,
        orig_to_new_substr=substr_renames,
    )
|
|
|
|
|
|
# (checkpoint_key, expected_model_parameter_name) pairs fed to the mapper.
# 50 cases in total, grouped by the part of the model they exercise.
TEST_CASES = [
    # Embedding & top-level
    ("embed_tokens.weight", "model.embed_tokens.weight"),
    ("norm.weight", "model.norm.weight"),
    ("hc_head.hc_fn", "model.hc_head.hc_fn"),
    ("hc_head.hc_base", "model.hc_head.hc_base"),
    ("hc_head.hc_scale", "model.hc_head.hc_scale"),
    # No rule matches lm_head, so it must pass through unchanged.
    ("lm_head.weight", "lm_head.weight"),

    # Attention — self_attn → attn
    ("layers.0.self_attn.q_a_proj.weight", "model.layers.0.attn.wq_a.weight"),
    ("layers.0.self_attn.q_a_proj.input_scale", "model.layers.0.attn.wq_a.input_scale"),
    ("layers.0.self_attn.kv_proj.weight", "model.layers.0.attn.wkv.weight"),
    ("layers.0.self_attn.q_b_proj.weight", "model.layers.0.attn.wq_b.weight"),
    ("layers.0.self_attn.o_a_proj.weight", "model.layers.0.attn.wo_a.weight"),
    ("layers.0.self_attn.o_b_proj.weight", "model.layers.0.attn.wo_b.weight"),
    ("layers.0.self_attn.o_b_proj.input_scale", "model.layers.0.attn.wo_b.input_scale"),
    ("layers.0.self_attn.q_a_norm.weight", "model.layers.0.attn.q_a_norm.weight"),
    ("layers.0.self_attn.kv_norm.weight", "model.layers.0.attn.kv_norm.weight"),
    ("layers.0.self_attn.sinks", "model.layers.0.attn.sinks"),

    # Compressor (non-indexer): kv_proj → wkv, gate_proj → wgate
    ("layers.0.self_attn.compressor.kv_proj.weight", "model.layers.0.attn.mla_attn.compressor.wkv.weight"),
    ("layers.0.self_attn.compressor.kv_proj.input_scale", "model.layers.0.attn.mla_attn.compressor.wkv.input_scale"),
    ("layers.0.self_attn.compressor.kv_proj.weight_scale", "model.layers.0.attn.mla_attn.compressor.wkv.weight_scale"),
    ("layers.0.self_attn.compressor.kv_proj.weight_scale_2", "model.layers.0.attn.mla_attn.compressor.wkv.weight_scale_2"),
    ("layers.0.self_attn.compressor.gate_proj.weight", "model.layers.0.attn.mla_attn.compressor.wgate.weight"),
    ("layers.0.self_attn.compressor.gate_proj.input_scale", "model.layers.0.attn.mla_attn.compressor.wgate.input_scale"),
    ("layers.0.self_attn.compressor.gate_proj.weight_scale", "model.layers.0.attn.mla_attn.compressor.wgate.weight_scale"),
    ("layers.0.self_attn.compressor.gate_proj.weight_scale_2", "model.layers.0.attn.mla_attn.compressor.wgate.weight_scale_2"),
    ("layers.0.self_attn.compressor.kv_norm.weight", "model.layers.0.attn.mla_attn.compressor.norm.weight"),
    ("layers.0.self_attn.compressor.position_bias", "model.layers.0.attn.mla_attn.compressor.ape"),

    # Indexer own params
    ("layers.10.self_attn.compressor.indexer.q_b_proj.weight", "model.layers.10.attn.mla_attn.indexer.wq_b.weight"),
    ("layers.10.self_attn.compressor.indexer.q_b_proj.input_scale", "model.layers.10.attn.mla_attn.indexer.wq_b.input_scale"),
    ("layers.10.self_attn.compressor.indexer.weights_proj.weight", "model.layers.10.attn.mla_attn.indexer.weights_proj.weight"),
    ("layers.10.self_attn.compressor.indexer.weights_proj.input_scale", "model.layers.10.attn.mla_attn.indexer.weights_proj.input_scale"),
    ("layers.10.self_attn.compressor.indexer.kv_norm.weight", "model.layers.10.attn.mla_attn.indexer.k_norm.weight"),

    # Indexer's compressor
    ("layers.10.self_attn.compressor.indexer.kv_proj.weight", "model.layers.10.attn.mla_attn.indexer.compressor.wkv.weight"),
    ("layers.10.self_attn.compressor.indexer.kv_proj.input_scale", "model.layers.10.attn.mla_attn.indexer.compressor.wkv.input_scale"),
    ("layers.10.self_attn.compressor.indexer.gate_proj.weight", "model.layers.10.attn.mla_attn.indexer.compressor.wgate.weight"),
    ("layers.10.self_attn.compressor.indexer.gate_proj.input_scale", "model.layers.10.attn.mla_attn.indexer.compressor.wgate.input_scale"),
    ("layers.10.self_attn.compressor.indexer.position_bias", "model.layers.10.attn.mla_attn.indexer.compressor.ape"),

    # MoE gate
    ("layers.0.mlp.gate.tid2eid", "model.layers.0.ffn.gate.tid2eid"),
    ("layers.0.mlp.gate.weight", "model.layers.0.ffn.gate.weight"),

    # Expert weights — gate_proj → w1, up_proj → w3, down_proj → w2
    ("layers.0.mlp.experts.0.gate_proj.weight", "model.layers.0.ffn.experts.0.w1.weight"),
    ("layers.0.mlp.experts.0.up_proj.weight", "model.layers.0.ffn.experts.0.w3.weight"),
    ("layers.0.mlp.experts.0.down_proj.weight", "model.layers.0.ffn.experts.0.w2.weight"),
    ("layers.0.mlp.experts.0.gate_proj.input_scale", "model.layers.0.ffn.experts.0.w1.input_scale"),
    ("layers.0.mlp.experts.0.gate_proj.weight_scale", "model.layers.0.ffn.experts.0.w1.weight_scale"),
    ("layers.0.mlp.experts.0.gate_proj.weight_scale_2", "model.layers.0.ffn.experts.0.w1.weight_scale_2"),

    # Shared experts
    ("layers.0.mlp.shared_experts.gate_proj.weight", "model.layers.0.ffn.shared_experts.w1.weight"),
    ("layers.0.mlp.shared_experts.up_proj.weight", "model.layers.0.ffn.shared_experts.w3.weight"),
    ("layers.0.mlp.shared_experts.down_proj.weight", "model.layers.0.ffn.shared_experts.down_proj.weight"),
    ("layers.0.mlp.shared_experts.gate_proj.input_scale", "model.layers.0.ffn.shared_experts.w1.input_scale"),
    ("layers.0.mlp.shared_experts.down_proj.weight_scale", "model.layers.0.ffn.shared_experts.down_proj.weight_scale"),

    # Layer norm
    ("layers.0.post_attention_layernorm.weight", "model.layers.0.post_attention_layernorm.weight"),
]
|
|
|
|
|
|
def main() -> int:
    """Run every TEST_CASES pair through the mapper and print a summary.

    For each ``(checkpoint_key, expected)`` pair the key is mapped and
    compared with ``expected``; mismatches are printed with both the
    expected and the actual result.

    Returns:
        0 if every case matched, 1 if at least one case failed. The value is
        suitable for passing to ``sys.exit``.
    """
    mapper = _make_deepseek_v4_nvfp4_weights_mapper()
    passed = 0
    failed = 0

    for ckpt_key, expected in TEST_CASES:
        result = mapper.map_name(ckpt_key)
        if result == expected:
            passed += 1
        else:
            failed += 1
            # Show all three values so a broken rule can be located quickly.
            print(f"FAIL: {ckpt_key}")
            print(f" expected: {expected}")
            print(f" got: {result}")

    print(f"\n{passed} passed, {failed} failed")
    return 0 if failed == 0 else 1
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: main()'s 0/1 result becomes the process exit status.
    sys.exit(main())
|