Add vllm weight mapper patch and docker-compose
This commit is contained in:
46
docker-compose.yml
Normal file
46
docker-compose.yml
Normal file
@@ -0,0 +1,46 @@
|
||||
services:
  # Single-node vLLM server for the locally mounted DeepSeek-V4 NVFP4 checkpoint.
  vllm:
    image: atl.vultrcr.com/vllm/vllm-with-lmcache:dream-build
    pull_policy: always
    # bash -c '<script>' -- <command...>: the literal "--" becomes $0, so the
    # `command:` list below arrives in the script as "$@". `$$` is Compose's
    # escape for a literal `$`.
    #
    # NOTE(review): patch_vllm_weights.py only monkey-patches a module attribute
    # inside its own interpreter. That process exits before `exec vllm serve`
    # starts a fresh one, so the server is NOT patched by this step. The patch
    # has to be applied inside the serving process (and any worker processes it
    # starts), e.g. through a vLLM plugin or a sitecustomize hook - confirm and
    # rework before relying on it.
    entrypoint:
      - bash
      - -c
      - |
        python3 /patches/patch_vllm_weights.py
        exec vllm serve "$$@"
      - '--'
    environment:
      # Passed through from the host environment (or an untracked .env file).
      # Never commit the token value to version control.
      - HF_TOKEN
    command:
      - /model
      - --trust-remote-code
      - --kv-cache-dtype=fp8
      - --block-size=256
      - --enable-expert-parallel
      - --tensor-parallel-size=8
      # JSON-valued flags are quoted so YAML always treats them as plain strings.
      - '--compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
      - --attention_config.use_fp4_indexer_cache=True
      - --moe-backend=deep_gemm_mega_moe
      - --tokenizer-mode=deepseek_v4
      - --tool-call-parser=deepseek_v4
      - --enable-auto-tool-choice
      - --reasoning-parser=deepseek_v4
      - '--speculative_config={"method":"mtp","num_speculative_tokens":2}'
      - --host=0.0.0.0
      - --port=8000
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ipc: host
    security_opt:
      - seccomp:unconfined
    tty: true
    stdin_open: true
    volumes:
      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro
      - /root/nvidia-meeting/deepseek-v4-quant/patches:/patches:ro
    network_mode: host
|
||||
63
patches/patch_vllm_weights.py
Normal file
63
patches/patch_vllm_weights.py
Normal file
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Patch vllm's DeepSeek V4 weight mapper to handle modelopt's NVFP4 export naming.
|
||||
|
||||
modelopt exports weights with `self_attn` prefix and other naming differences
|
||||
that vllm's _make_deepseek_v4_weights_mapper doesn't account for.
|
||||
|
||||
This patch adds the missing substring mappings so modelopt-exported NVFP4
|
||||
checkpoints load correctly.
|
||||
|
||||
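Usage: this is an in-memory monkey-patch, so patch() must be called inside the
same Python process that loads the model (for example from a vLLM plugin or a
sitecustomize hook). Running

    python3 /patches/patch_vllm_weights.py

as a separate step before `vllm serve` (e.g. from a docker-compose.yml
entrypoint) only patches that short-lived interpreter and does not affect the
server process.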
|
||||
"""
|
||||
|
||||
import sys
|
||||
import importlib
|
||||
|
||||
def patch():
    """Wrap vLLM's DeepSeek V4 weights-mapper factory to accept modelopt names.

    Replaces ``deepseek_v4._make_deepseek_v4_weights_mapper`` with a wrapper
    that calls the original factory and then adds substring renames from
    modelopt's ``self_attn.*`` checkpoint names to the ``attn.mla_attn.*``
    names, skipping the intermediate ``attn`` step.

    The change is an in-memory monkey-patch: it only affects the Python
    process in which this function is called.

    Returns:
        None. Prints a confirmation line once the factory has been replaced.

    Raises:
        ImportError: if ``vllm.model_executor.models.deepseek_v4`` cannot be
            imported.
        AttributeError: if that module has no
            ``_make_deepseek_v4_weights_mapper``.
    """
    from vllm.model_executor.models import deepseek_v4

    original_make_mapper = deepseek_v4._make_deepseek_v4_weights_mapper

    # Order matters: substring rules are tried one after another in dict
    # insertion order (see WeightsMapper in vllm.model_executor.models.utils),
    # and each hit rewrites the key before the next rule is tried. The
    # indexer rules share the ".self_attn.compressor." prefix with the generic
    # compressor rule, so they must come first; otherwise the generic rule
    # rewrites the key and the indexer rules can never match.
    modelopt_renames = (
        # modelopt names the indexer's sub-projections differently.
        (".self_attn.compressor.indexer.q_b_proj.", ".attn.mla_attn.indexer.wq_b."),
        (".self_attn.compressor.indexer.kv_proj.", ".attn.mla_attn.indexer.wkv."),
        (".self_attn.compressor.indexer.gate_proj.", ".attn.mla_attn.indexer.gate."),
        (".self_attn.compressor.indexer.weights_proj.", ".attn.mla_attn.indexer.wo_a."),
        (".self_attn.compressor.indexer.kv_norm.", ".attn.mla_attn.indexer.kv_norm."),
        (".self_attn.compressor.indexer.position_bias", ".attn.mla_attn.indexer.position_bias"),
        # modelopt uses "self_attn" where vllm expects "attn" (which its own
        # substring rules then turn into "attn.mla_attn").
        (".self_attn.compressor.", ".attn.mla_attn.compressor."),
        (".self_attn.kv_norm.", ".attn.mla_attn.kv_norm."),
        (".self_attn.kv_proj.", ".attn.mla_attn.kv_proj."),
        (".self_attn.o_a_proj.", ".attn.mla_attn.wo_a."),
        (".self_attn.o_b_proj.", ".attn.mla_attn.wo_b."),
        (".self_attn.q_a_proj.", ".attn.mla_attn.wq_a."),
        (".self_attn.q_a_norm.", ".attn.mla_attn.q_norm."),
        (".self_attn.q_b_proj.", ".attn.mla_attn.wq_b."),
        (".self_attn.sinks", ".attn.mla_attn.attn_sink"),
    )

    def patched_make_mapper(expert_dtype: str):
        """Build the stock mapper, then append the modelopt substring renames."""
        mapper = original_make_mapper(expert_dtype)
        substr_rules = mapper.orig_to_new_substr

        for old_substr, new_substr in modelopt_renames:
            # Drop any existing entry first so the rule is re-inserted at the
            # end; plain assignment would keep the old dict position and could
            # break the specific-before-generic order established above.
            substr_rules.pop(old_substr, None)
            substr_rules[old_substr] = new_substr

        # No rules are added for mlp.shared_experts: the original author notes
        # the checkpoint already uses model.layers.N.mlp.shared_experts.* names.
        # No rule is added for modelopt's separate hc_head module either; the
        # original author expects general weight loading to handle it as long
        # as it is not filtered out - NOTE(review): confirm against a real load.

        return mapper

    deepseek_v4._make_deepseek_v4_weights_mapper = patched_make_mapper
    print("✓ Patched _make_deepseek_v4_weights_mapper for modelopt NVFP4 naming")


if __name__ == "__main__":
    # Only patches this interpreter; see the docstring of patch().
    patch()
|
||||
Reference in New Issue
Block a user