Add vllm weight mapper patch and docker-compose

2026-05-10 09:33:48 +00:00
parent 30608e3834
commit d88793dee6
2 changed files with 109 additions and 0 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,46 @@
+services:
+  vllm:
+    image: atl.vultrcr.com/vllm/vllm-with-lmcache:dream-build
+    pull_policy: always
+    entrypoint:
+      - bash
+      - -c
+      - |
+        python3 /patches/patch_vllm_weights.py
+        exec vllm serve "$$@"
+      - --
+    environment:
+      - HF_TOKEN=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
+    command:
+      - /model
+      - --trust-remote-code
+      - --kv-cache-dtype=fp8
+      - --block-size=256
+      - --enable-expert-parallel
+      - --tensor-parallel-size=8
+      - --compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}
+      - --attention_config.use_fp4_indexer_cache=True
+      - --moe-backend=deep_gemm_mega_moe
+      - --tokenizer-mode=deepseek_v4
+      - --tool-call-parser=deepseek_v4
+      - --enable-auto-tool-choice
+      - --reasoning-parser=deepseek_v4
+      - --speculative_config={"method":"mtp","num_speculative_tokens":2}
+      - --host=0.0.0.0
+      - --port=8000
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    ipc: host
+    security_opt:
+      - seccomp:unconfined
+    tty: true
+    stdin_open: true
+    volumes:
+      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro
+      - /root/nvidia-meeting/deepseek-v4-quant/patches:/patches:ro
+    network_mode: host
--- a/patches/patch_vllm_weights.py
+++ b/patches/patch_vllm_weights.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+"""
+Patch vllm's DeepSeek V4 weight mapper to handle modelopt's NVFP4 export naming.
+
+modelopt exports weights with `self_attn` prefix and other naming differences
+that vllm's _make_deepseek_v4_weights_mapper doesn't account for.
+
+This patch adds the missing substring mappings so modelopt-exported NVFP4
+checkpoints load correctly.
+
+Drop into container as:
+    python3 /patches/patch_vllm_weights.py
+
+Or add to docker-compose.yml command before vllm serve.
+"""
+
+import sys
+import importlib
+
+def patch():
+    from vllm.model_executor.models import deepseek_v4
+
+    original_make_mapper = deepseek_v4._make_deepseek_v4_weights_mapper
+
+    def patched_make_mapper(expert_dtype: str):
+        mapper = original_make_mapper(expert_dtype)
+
+        # modelopt uses "self_attn" but vllm expects "attn" (which it then
+        # maps to "attn.mla_attn" via the substr mapper)
+        # We need: self_attn -> attn.mla_attn (skip the intermediate step)
+        mapper.orig_to_new_substr[".self_attn.compressor."] = ".attn.mla_attn.compressor."
+        mapper.orig_to_new_substr[".self_attn.kv_norm."] = ".attn.mla_attn.kv_norm."
+        mapper.orig_to_new_substr[".self_attn.kv_proj."] = ".attn.mla_attn.kv_proj."
+        mapper.orig_to_new_substr[".self_attn.o_a_proj."] = ".attn.mla_attn.wo_a."
+        mapper.orig_to_new_substr[".self_attn.o_b_proj."] = ".attn.mla_attn.wo_b."
+        mapper.orig_to_new_substr[".self_attn.q_a_proj."] = ".attn.mla_attn.wq_a."
+        mapper.orig_to_new_substr[".self_attn.q_a_norm."] = ".attn.mla_attn.q_norm."
+        mapper.orig_to_new_substr[".self_attn.q_b_proj."] = ".attn.mla_attn.wq_b."
+        mapper.orig_to_new_substr[".self_attn.sinks"] = ".attn.mla_attn.attn_sink"
+
+        # modelopt names the indexer's sub-projects differently
+        mapper.orig_to_new_substr[".self_attn.compressor.indexer.q_b_proj."] = ".attn.mla_attn.indexer.wq_b."
+        mapper.orig_to_new_substr[".self_attn.compressor.indexer.kv_proj."] = ".attn.mla_attn.indexer.wkv."
+        mapper.orig_to_new_substr[".self_attn.compressor.indexer.gate_proj."] = ".attn.mla_attn.indexer.gate."
+        mapper.orig_to_new_substr[".self_attn.compressor.indexer.weights_proj."] = ".attn.mla_attn.indexer.wo_a."
+        mapper.orig_to_new_substr[".self_attn.compressor.indexer.kv_norm."] = ".attn.mla_attn.indexer.kv_norm."
+        mapper.orig_to_new_substr[".self_attn.compressor.indexer.position_bias"] = ".attn.mla_attn.indexer.position_bias"
+
+        # modelopt puts shared experts under mlp.shared_experts with correct names
+        # but the mapper may try to rename .shared_experts. differently
+        # Our model already has model.layers.N.mlp.shared_experts.down_proj etc.
+
+        # modelopt adds hc_head as a separate module (hc = hidden compression)
+        # vllm doesn't have this in the mapper, but it should be handled by
+        # the general weight loading if we don't filter it out
+
+        return mapper
+
+    deepseek_v4._make_deepseek_v4_weights_mapper = patched_make_mapper
+    print("✓ Patched _make_deepseek_v4_weights_mapper for modelopt NVFP4 naming")
+
+if __name__ == "__main__":
+    patch()