Add vllm weight mapper patch and docker-compose

This commit is contained in:
2026-05-10 09:33:48 +00:00
parent 30608e3834
commit d88793dee6
2 changed files with 109 additions and 0 deletions

46
docker-compose.yml Normal file
View File

@@ -0,0 +1,46 @@
# Compose definition for a single vLLM server that serves the model mounted
# at /model, using every NVIDIA GPU on the host.
services:
  vllm:
    image: atl.vultrcr.com/vllm/vllm-with-lmcache:dream-build
    pull_policy: always
    # bash -c receives the inline script, then "--" (which becomes $0), then
    # the items of `command` as $1..$N. "$$@" is Compose's escape for a
    # literal "$@", so the script forwards every `command` item to vllm serve.
    #
    # NOTE(review): patch_vllm_weights.py reassigns a function on the imported
    # vllm module inside its own Python process. That process exits before
    # `exec vllm serve` starts a new interpreter, so the serving process will
    # not see the in-memory patch. Confirm how the patch is meant to reach the
    # server (e.g. a vllm plugin, or applying it inside the serving process).
    entrypoint:
      - bash
      - -c
      - |
        python3 /patches/patch_vllm_weights.py
        exec vllm serve "$$@"
      - --
    environment:
      # Passed through from the host environment (or an .env file next to this
      # compose file). Never commit the token value; the token that was
      # previously hard-coded here must be treated as leaked and revoked.
      - HF_TOKEN
    command:
      - /model
      - --trust-remote-code
      - --kv-cache-dtype=fp8
      - --block-size=256
      - --enable-expert-parallel
      - --tensor-parallel-size=8
      - --compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}
      - --attention_config.use_fp4_indexer_cache=True
      - --moe-backend=deep_gemm_mega_moe
      - --tokenizer-mode=deepseek_v4
      - --tool-call-parser=deepseek_v4
      - --enable-auto-tool-choice
      - --reasoning-parser=deepseek_v4
      - --speculative_config={"method":"mtp","num_speculative_tokens":2}
      - --host=0.0.0.0
      - --port=8000
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ipc: host
    security_opt:
      - seccomp:unconfined
    tty: true
    stdin_open: true
    volumes:
      # Both mounts are read-only: the model checkpoint and the patch scripts.
      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro
      - /root/nvidia-meeting/deepseek-v4-quant/patches:/patches:ro
    network_mode: host

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""
Patch vllm's DeepSeek V4 weight mapper to handle modelopt's NVFP4 export naming.
modelopt exports weights with `self_attn` prefix and other naming differences
that vllm's _make_deepseek_v4_weights_mapper doesn't account for.
This patch adds the missing substring mappings so modelopt-exported NVFP4
checkpoints load correctly.
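Run inside the container as:
python3 /patches/patch_vllm_weights.py
Or add to docker-compose.yml command before vllm serve.
NOTE(review): patch() replaces a module attribute in memory, so it only
affects the Python process that calls it. Running this script as a separate
command before `vllm serve` does not modify the server process; confirm how
the patch is meant to be applied to the process that loads the weights.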
"""
import sys
import importlib
def patch() -> None:
    """Wrap vllm's DeepSeek V4 weight-mapper factory to add modelopt renames.

    Replaces ``deepseek_v4._make_deepseek_v4_weights_mapper`` with a wrapper
    that calls the original factory and then adds substring renames for the
    ``self_attn``-prefixed names to the returned mapper's
    ``orig_to_new_substr`` mapping.

    The replacement is an in-memory attribute assignment, so it only affects
    the Python process in which ``patch()`` is called.

    Raises:
        ImportError: if ``vllm.model_executor.models.deepseek_v4`` cannot be
            imported.
        AttributeError: if that module has no
            ``_make_deepseek_v4_weights_mapper`` attribute.
    """
    from vllm.model_executor.models import deepseek_v4

    original_make_mapper = deepseek_v4._make_deepseek_v4_weights_mapper

    # Order matters. Substring renames are assumed to be tried in insertion
    # order (as vllm's WeightsMapper does -- confirm for this build), and a
    # rename rewrites the name in place. The indexer keys all contain
    # ".self_attn.compressor.", so they must be registered BEFORE that generic
    # key; otherwise the generic rename fires first and the indexer-specific
    # renames can never match.
    substr_renames = (
        # modelopt names the indexer's sub-projections differently.
        (".self_attn.compressor.indexer.q_b_proj.", ".attn.mla_attn.indexer.wq_b."),
        (".self_attn.compressor.indexer.kv_proj.", ".attn.mla_attn.indexer.wkv."),
        (".self_attn.compressor.indexer.gate_proj.", ".attn.mla_attn.indexer.gate."),
        # NOTE(review): weights_proj -> wo_a is kept from the original patch;
        # confirm this target name is intended.
        (".self_attn.compressor.indexer.weights_proj.", ".attn.mla_attn.indexer.wo_a."),
        (".self_attn.compressor.indexer.kv_norm.", ".attn.mla_attn.indexer.kv_norm."),
        (".self_attn.compressor.indexer.position_bias", ".attn.mla_attn.indexer.position_bias"),
        # modelopt uses "self_attn" where vllm expects "attn" (which vllm then
        # maps to "attn.mla_attn"); map straight to the final name.
        (".self_attn.compressor.", ".attn.mla_attn.compressor."),
        (".self_attn.kv_norm.", ".attn.mla_attn.kv_norm."),
        (".self_attn.kv_proj.", ".attn.mla_attn.kv_proj."),
        (".self_attn.o_a_proj.", ".attn.mla_attn.wo_a."),
        (".self_attn.o_b_proj.", ".attn.mla_attn.wo_b."),
        (".self_attn.q_a_proj.", ".attn.mla_attn.wq_a."),
        (".self_attn.q_a_norm.", ".attn.mla_attn.q_norm."),
        (".self_attn.q_b_proj.", ".attn.mla_attn.wq_b."),
        (".self_attn.sinks", ".attn.mla_attn.attn_sink"),
    )

    def patched_make_mapper(expert_dtype: str):
        """Build the stock mapper, then add the modelopt substring renames."""
        mapper = original_make_mapper(expert_dtype)
        for old_substr, new_substr in substr_renames:
            mapper.orig_to_new_substr[old_substr] = new_substr
        # No renames are added for shared experts or hc_head. Per the original
        # author's notes (unverified here): the checkpoint already uses
        # model.layers.N.mlp.shared_experts.* names, and hc_head is expected
        # to be picked up by the general weight loading if not filtered out.
        return mapper

    deepseek_v4._make_deepseek_v4_weights_mapper = patched_make_mapper
    print("✓ Patched _make_deepseek_v4_weights_mapper for modelopt NVFP4 naming")
# Script entry point: apply the monkey-patch when this file is executed
# directly. Importing the module does not patch anything.
if __name__ == "__main__":
    patch()