# deepseek-v4-quant/patches/patch_vllm_weights.py

#!/usr/bin/env python3
"""
Patch vllm's DeepSeek V4 weight mapper to handle modelopt's NVFP4 export naming.

modelopt exports weights with naming differences from what vllm's
_make_deepseek_v4_weights_mapper + load_weights code expects:

1. Expert projections: modelopt uses gate_proj/up_proj/down_proj, vllm expects w1/w3/w2
2. Shared expert projections: same gate_proj/up_proj naming, needs w1/w3 for stacking
3. Compressor projections: kv_proj→wkv, gate_proj→wgate for fused stacking
4. Attention projections: self_attn prefix, kv_proj→wkv for fused stacking, etc.
5. Expert NVFP4 scales: weight_scale_2 and input_scale have no matching mega_moe params

CRITICAL: DeepseekV4ForCausalLM.hf_to_vllm_mapper is a CLASS attribute set at
module import time. Simply patching _make_deepseek_v4_weights_mapper doesn't help
because the class already cached the old mapper. We must also update the class
attribute directly. Since expert_dtype=="fp4", __init__ doesn't recreate the mapper.

Drop into container as:
    python3 /patches/patch_vllm_weights.py

Or add to docker-compose.yml command before vllm serve.
"""

import re
import sys


# Save original function BEFORE patching
_original_make_mapper = None


def make_patched_mapper(expert_dtype: str):
    """Create a WeightsMapper with modelopt NVFP4 naming patches applied."""
    global _original_make_mapper
    # Use the saved original, not the (possibly patched) module attribute
    mapper = _original_make_mapper(expert_dtype)

    # ══════════════════════════════════════════════════════════════════
    # Regex mappings (applied FIRST by WeightsMapper, before substr)
    # Order matters: skip patterns must come before rename patterns.
    # ══════════════════════════════════════════════════════════════════
    ordered_regexes = {}

    # Skip expert NVFP4 scales that have no mega_moe params.
    # MUST come before gate_proj→w1 etc. because after renaming,
    # the key has "w1." not "gate_proj." and these patterns wouldn't match.
    #
    # modelopt's NVFP4 export includes weight_scale_2 (global scale) and
    # input_scale (activation scale) for each expert projection. But the
    # DeepseekV4MegaMoEExperts module only registers w13_weight_scale and
    # w2_weight_scale (E8M0 block scales) — no weight_scale_2 or input_scale.
    # Mapping to None tells WeightsMapper to skip these weights entirely.
    ordered_regexes[re.compile(r"\.experts\.\d+\.\w+_proj\.weight_scale_2$")] = None
    ordered_regexes[re.compile(r"\.experts\.\d+\.\w+_proj\.input_scale$")] = None

    # Routed expert projections: gate_proj→w1, up_proj→w3, down_proj→w2
    # We use regex (not substr) to match ONLY .experts.N. — NOT .shared_experts.
    # Using substr ".down_proj." → ".w2." would also affect
    # shared_experts.down_proj, breaking shared expert loading
    # (vllm model uses down_proj, not w2, for shared experts).
    ordered_regexes[re.compile(r"(\.experts\.\d+\.)gate_proj\.")] = r"\1w1."
    ordered_regexes[re.compile(r"(\.experts\.\d+\.)up_proj\.")] = r"\1w3."
    ordered_regexes[re.compile(r"(\.experts\.\d+\.)down_proj\.")] = r"\1w2."

    # Preserve any existing regex mappings from the original mapper
    if mapper.orig_to_new_regex:
        ordered_regexes.update(mapper.orig_to_new_regex)

    mapper.orig_to_new_regex = ordered_regexes

    # ══════════════════════════════════════════════════════════════════
    # Substr mappings (applied AFTER regex by WeightsMapper)
    # ══════════════════════════════════════════════════════════════════

    # 1. Attention: self_attn → attn.mla_attn mappings
    # modelopt uses "self_attn" but vllm expects "attn" (mapped to "attn.mla_attn")
    mapper.orig_to_new_substr[".self_attn.q_a_proj."] = ".attn.mla_attn.wq_a."
    mapper.orig_to_new_substr[".self_attn.q_b_proj."] = ".attn.mla_attn.wq_b."
    mapper.orig_to_new_substr[".self_attn.q_a_norm."] = ".attn.mla_attn.q_norm."
    mapper.orig_to_new_substr[".self_attn.o_a_proj."] = ".attn.mla_attn.wo_a."
    mapper.orig_to_new_substr[".self_attn.o_b_proj."] = ".attn.mla_attn.wo_b."
    mapper.orig_to_new_substr[".self_attn.sinks"] = ".attn.mla_attn.attn_sink"

    # CRITICAL: kv_proj must map to wkv (not kv_proj) because the stacking
    # code looks for "attn.wkv" to stack into fused_wqa_wkv.
    mapper.orig_to_new_substr[".self_attn.kv_proj."] = ".attn.mla_attn.wkv."
    mapper.orig_to_new_substr[".self_attn.kv_norm."] = ".attn.mla_attn.kv_norm."

    # Compressor: self_attn.compressor → attn.mla_attn.compressor
    mapper.orig_to_new_substr[".self_attn.compressor."] = ".attn.mla_attn.compressor."

    # Compressor projection renaming for stacking:
    # vllm stacks compressor.wkv + compressor.wgate → compressor.fused_wkv_wgate
    # modelopt exports as compressor.kv_proj and compressor.gate_proj
    mapper.orig_to_new_substr[".compressor.kv_proj."] = ".compressor.wkv."
    mapper.orig_to_new_substr[".compressor.gate_proj."] = ".compressor.wgate."

    # 2. Shared expert projections: gate_proj→w1, up_proj→w3
    # vllm stacks shared_experts.w1 + shared_experts.w3 into
    # shared_experts.gate_up_proj. modelopt uses gate_proj/up_proj naming.
    # down_proj stays as-is (vllm model uses down_proj directly).
    mapper.orig_to_new_substr[".shared_experts.gate_proj."] = ".shared_experts.w1."
    mapper.orig_to_new_substr[".shared_experts.up_proj."] = ".shared_experts.w3."

    return mapper


def patch():
    """Install the modelopt NVFP4 weight-name patches into vllm's deepseek_v4.

    Steps:
      1. Capture the original ``_make_deepseek_v4_weights_mapper`` in the
         module-level ``_original_make_mapper``.
      2. Rebind the module attribute to ``make_patched_mapper``.
      3. Overwrite ``DeepseekV4ForCausalLM.hf_to_vllm_mapper`` with a
         freshly built patched mapper, when that class exists.

    Safe to call more than once: the original factory is only captured when
    the module attribute is not already ``make_patched_mapper``. Without that
    check a second call would save the patched function as the "original"
    and every later mapper build would recurse until RecursionError.

    The changes are in-memory attribute rebinds only; they affect the
    interpreter that runs this function.
    """
    global _original_make_mapper
    from vllm.model_executor.models import deepseek_v4

    current_factory = deepseek_v4._make_deepseek_v4_weights_mapper
    if current_factory is not make_patched_mapper:
        # 1. Save the original function BEFORE replacing it
        _original_make_mapper = current_factory

        # 2. Patch the function so __init__ calls also get our mapper
        deepseek_v4._make_deepseek_v4_weights_mapper = make_patched_mapper
    print("✓ Patched _make_deepseek_v4_weights_mapper function")

    # 3. CRITICAL: Also update the CLASS attribute directly.
    # DeepseekV4ForCausalLM.hf_to_vllm_mapper is set at class definition
    # time (module import). Our function patch above doesn't retroactively
    # update it. Since expert_dtype=="fp4", __init__ won't recreate it either.
    # We MUST update the class attribute directly.
    if hasattr(deepseek_v4, 'DeepseekV4ForCausalLM'):
        deepseek_v4.DeepseekV4ForCausalLM.hf_to_vllm_mapper = make_patched_mapper("fp4")
        print("✓ Updated DeepseekV4ForCausalLM.hf_to_vllm_mapper class attribute")
    else:
        print("⚠ DeepseekV4ForCausalLM not found (will be patched at import time)")

    print("✓ All modelopt NVFP4 weight mapping patches applied")


# Script entry point: apply the patches when this file is executed directly.
# NOTE(review): patch() only rebinds attributes in the running interpreter
# and this script does nothing else afterwards, so executing it as a
# standalone process presumably does not affect a separately started
# `vllm serve` process — confirm how the patch is meant to be injected.
if __name__ == "__main__":
    patch()