- Add patches/deepseek_v4.py: patched vllm source file with modelopt NVFP4 weight name mappings (expert gate_proj→w1, mlp→ffn, self_attn→attn.mla_attn, compressor.kv_proj→wkv, etc.), E2M1 FP4→BF16 unpacking for stacked params, skip patterns for NVFP4 scale tensors on MergedColumnParallelLinear, and resilient loading for unknown params. - Update docker-compose.yml: copy patched deepseek_v4.py over original at container startup, remove --moe-backend=deep_gemm_mega_moe (no NVFP4 kernel). - Update patches/patch_vllm_weights.py: legacy runtime monkey-patch approach (doesn't work with worker processes), kept for reference. - Update README.md: added vLLM serving run history table (S1-S10), documented all open issues (MergedColumnParallelLinear+NVFP4, no mega_moe kernel, resilient loading), added vLLM-specific bug list and key notes. - Update scripts/serve_vllm.py: add WARN comment on mega_moe flag.
47 lines
1.4 KiB
YAML
47 lines
1.4 KiB
YAML
# Compose definition for serving the NVFP4-quantized DeepSeek-V4 checkpoint
# with vLLM. A patched copy of vLLM's deepseek_v4.py model file is copied over
# the stock one at container start, then `vllm serve` is exec'd with the
# arguments listed under `command`.
services:
  vllm:
    image: atl.vultrcr.com/vllm/vllm-with-lmcache:dream-build
    pull_policy: always
    # `bash -c <script> -- <args...>`: the literal `--` becomes $0 and the
    # `command` list below becomes "$@". `$$` is Compose's escape for a
    # literal `$`, so the script receives "$@".
    entrypoint:
      - bash
      - -c
      - |
        set -e
        cp /patches/deepseek_v4.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v4.py
        exec vllm serve "$$@"
      - --
    environment:
      # Passed through from the host environment (or an untracked .env file).
      # SECURITY: never commit the token value here. The token that was
      # previously hard-coded on this line must be revoked and rotated.
      - HF_TOKEN
    command:
      - /model
      - --trust-remote-code
      - --kv-cache-dtype=fp8
      - --block-size=256
      - --enable-expert-parallel
      - --tensor-parallel-size=8
      # JSON-valued flags are single-quoted so YAML treats them as one opaque
      # string regardless of the braces, brackets, quotes and colons inside.
      - '--compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
      - --attention_config.use_fp4_indexer_cache=True
      - --tokenizer-mode=deepseek_v4
      - --tool-call-parser=deepseek_v4
      - --enable-auto-tool-choice
      - --reasoning-parser=deepseek_v4
      - '--speculative_config={"method":"mtp","num_speculative_tokens":2}'
      - --host=0.0.0.0
      - --port=8000
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ipc: host
    security_opt:
      - seccomp:unconfined
    tty: true
    stdin_open: true
    volumes:
      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro
      # The whole patches directory is mounted; it already contains
      # deepseek_v4.py, so a separate single-file mount onto
      # /patches/deepseek_v4.py is not needed.
      - /root/nvidia-meeting/deepseek-v4-quant/patches:/patches:ro
    network_mode: host