diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..aee3085
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,50 @@
+services:
+  vllm:
+    image: atl.vultrcr.com/vllm/vllm-with-lmcache:dream-build
+    pull_policy: always
+    entrypoint:
+      - bash
+      - -c
+      - |
+        # Stop here if the patch step fails instead of serving unpatched.
+        set -euo pipefail
+        python3 /patches/patch_vllm_weights.py
+        exec vllm serve "$$@"
+      - --
+    environment:
+      # Value comes from the host environment or an untracked .env file.
+      # Never commit a token; one that was committed must be revoked.
+      - HF_TOKEN=${HF_TOKEN:-}
+    command:
+      - /model
+      - --trust-remote-code
+      - --kv-cache-dtype=fp8
+      - --block-size=256
+      - --enable-expert-parallel
+      - --tensor-parallel-size=8
+      - '--compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
+      - --attention_config.use_fp4_indexer_cache=True
+      - --moe-backend=deep_gemm_mega_moe
+      - --tokenizer-mode=deepseek_v4
+      - --tool-call-parser=deepseek_v4
+      - --enable-auto-tool-choice
+      - --reasoning-parser=deepseek_v4
+      - '--speculative_config={"method":"mtp","num_speculative_tokens":2}'
+      - --host=0.0.0.0
+      - --port=8000
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    ipc: host
+    security_opt:
+      - seccomp:unconfined
+    tty: true
+    stdin_open: true
+    volumes:
+      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro
+      - /root/nvidia-meeting/deepseek-v4-quant/patches:/patches:ro
+    network_mode: host
diff --git a/patches/patch_vllm_weights.py b/patches/patch_vllm_weights.py
new file mode 100644
index 0000000..049e411
--- /dev/null
+++ b/patches/patch_vllm_weights.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""
+Patch vllm's DeepSeek V4 weight mapper to handle modelopt's NVFP4 export naming.
+
+modelopt exports weights with `self_attn` prefix and other naming differences
+that vllm's _make_deepseek_v4_weights_mapper doesn't account for.
+
+This patch adds the missing substring mappings so modelopt-exported NVFP4
+checkpoints load correctly.
+
+Run once inside the container, before `vllm serve`:
+    python3 /patches/patch_vllm_weights.py
+
+Running the script appends an import-time hook to the installed deepseek_v4
+module file, so the patch is applied in every process that imports the module.
+An in-memory monkeypatch alone (see `patch()`) ends with the interpreter that
+made it, so it cannot affect a `vllm serve` process started afterwards.
+"""
+
+import importlib
+import importlib.util
+import sys
+from pathlib import Path
+
+_TARGET_MODULE = "vllm.model_executor.models.deepseek_v4"
+_FACTORY_NAME = "_make_deepseek_v4_weights_mapper"
+_HOOK_BEGIN = "# >>> modelopt NVFP4 weight-name patch >>>"
+_HOOK_END = "# <<< modelopt NVFP4 weight-name patch <<<"
+
+# Text appended to the installed module by install(). It loads this script by
+# path and re-binds the mapper factory to the wrapped version.
+_HOOK_TEMPLATE = """
+{begin}
+import importlib.util as _nvfp4_ilu
+_nvfp4_spec = _nvfp4_ilu.spec_from_file_location("_modelopt_nvfp4_patch", {script!r})
+_nvfp4_mod = _nvfp4_ilu.module_from_spec(_nvfp4_spec)
+_nvfp4_spec.loader.exec_module(_nvfp4_mod)
+{factory} = _nvfp4_mod.wrap_mapper_factory({factory})
+{end}
+"""
+
+# (modelopt substring, vllm substring) pairs, in insertion order.
+# NOTE(review): assumes vllm applies orig_to_new_substr entries in insertion
+# order, rewriting the name as it goes -- confirm against the installed
+# WeightsMapper. Under that assumption the longer indexer keys must precede
+# ".self_attn.compressor.", which would otherwise rewrite the name first and
+# leave the indexer keys with nothing to match.
+_SUBSTR_MAPPINGS = (
+    # modelopt names the indexer's sub-projections differently
+    (".self_attn.compressor.indexer.q_b_proj.", ".attn.mla_attn.indexer.wq_b."),
+    (".self_attn.compressor.indexer.kv_proj.", ".attn.mla_attn.indexer.wkv."),
+    (".self_attn.compressor.indexer.gate_proj.", ".attn.mla_attn.indexer.gate."),
+    (".self_attn.compressor.indexer.weights_proj.", ".attn.mla_attn.indexer.wo_a."),
+    (".self_attn.compressor.indexer.kv_norm.", ".attn.mla_attn.indexer.kv_norm."),
+    (".self_attn.compressor.indexer.position_bias", ".attn.mla_attn.indexer.position_bias"),
+    # modelopt uses "self_attn"; map straight to vllm's "attn.mla_attn" names
+    (".self_attn.compressor.", ".attn.mla_attn.compressor."),
+    (".self_attn.kv_norm.", ".attn.mla_attn.kv_norm."),
+    (".self_attn.kv_proj.", ".attn.mla_attn.kv_proj."),
+    (".self_attn.o_a_proj.", ".attn.mla_attn.wo_a."),
+    (".self_attn.o_b_proj.", ".attn.mla_attn.wo_b."),
+    (".self_attn.q_a_proj.", ".attn.mla_attn.wq_a."),
+    (".self_attn.q_a_norm.", ".attn.mla_attn.q_norm."),
+    (".self_attn.q_b_proj.", ".attn.mla_attn.wq_b."),
+    (".self_attn.sinks", ".attn.mla_attn.attn_sink"),
+)
+# Not remapped here: mlp.shared_experts.* and hc_head.* names.
+# NOTE(review): presumably these already match what vllm expects -- confirm.
+
+
+def wrap_mapper_factory(original_make_mapper):
+    """Return a mapper factory that also registers the modelopt name mappings.
+
+    Args:
+        original_make_mapper: callable taking ``expert_dtype`` and returning an
+            object with a mutable ``orig_to_new_substr`` mapping.
+
+    Returns:
+        A wrapper with the same call signature. An already-wrapped factory is
+        returned unchanged, so wrapping twice is harmless.
+    """
+    if getattr(original_make_mapper, "_modelopt_patched", False):
+        return original_make_mapper
+
+    def patched_make_mapper(expert_dtype: str):
+        mapper = original_make_mapper(expert_dtype)
+        for modelopt_name, vllm_name in _SUBSTR_MAPPINGS:
+            mapper.orig_to_new_substr[modelopt_name] = vllm_name
+        return mapper
+
+    patched_make_mapper._modelopt_patched = True
+    return patched_make_mapper
+
+
+def patch():
+    """Monkeypatch the mapper factory in the current process only."""
+    deepseek_v4 = importlib.import_module(_TARGET_MODULE)
+    original_make_mapper = getattr(deepseek_v4, _FACTORY_NAME)
+    setattr(deepseek_v4, _FACTORY_NAME, wrap_mapper_factory(original_make_mapper))
+    print("✓ Patched _make_deepseek_v4_weights_mapper for modelopt NVFP4 naming")
+
+
+def install():
+    """Append the import-time hook to the installed deepseek_v4 source file.
+
+    Idempotent: does nothing when the hook marker is already present.
+
+    Raises:
+        RuntimeError: if the module has no ``.py`` source file or the file
+            does not mention the mapper factory.
+        OSError: if the source file cannot be read or written.
+    """
+    spec = importlib.util.find_spec(_TARGET_MODULE)
+    origin = getattr(spec, "origin", None)
+    if not origin or not origin.endswith(".py"):
+        raise RuntimeError(f"no .py source file found for {_TARGET_MODULE}")
+
+    module_path = Path(origin)
+    source = module_path.read_text(encoding="utf-8")
+    if _HOOK_BEGIN in source:
+        print(f"✓ modelopt NVFP4 hook already present in {module_path}")
+        return
+    if _FACTORY_NAME not in source:
+        raise RuntimeError(f"{_FACTORY_NAME} not found in {module_path}")
+
+    hook = _HOOK_TEMPLATE.format(
+        begin=_HOOK_BEGIN,
+        end=_HOOK_END,
+        script=str(Path(__file__).resolve()),
+        factory=_FACTORY_NAME,
+    )
+    with module_path.open("a", encoding="utf-8") as module_file:
+        # Start the hook on its own line even if the file lacks a final newline.
+        module_file.write(("" if source.endswith("\n") else "\n") + hook)
+    print(f"✓ Installed modelopt NVFP4 hook into {module_path}")
+
+
+if __name__ == "__main__":
+    try:
+        install()
+    except (ImportError, OSError, RuntimeError) as exc:
+        sys.exit(f"✗ Could not install modelopt NVFP4 patch: {exc}")