- Add patches/deepseek_v4.py: patched vllm source file with modelopt NVFP4 weight name mappings (expert gate_proj→w1, mlp→ffn, self_attn→attn.mla_attn, compressor.kv_proj→wkv, etc.), E2M1 FP4→BF16 unpacking for stacked params, skip patterns for NVFP4 scale tensors on MergedColumnParallelLinear, and resilient loading for unknown params. - Update docker-compose.yml: copy patched deepseek_v4.py over original at container startup, remove --moe-backend=deep_gemm_mega_moe (no NVFP4 kernel). - Update patches/patch_vllm_weights.py: legacy runtime monkey-patch approach (doesn't work with worker processes), kept for reference. - Update README.md: added vLLM serving run history table (S1-S10), documented all open issues (MergedColumnParallelLinear+NVFP4, no mega_moe kernel, resilient loading), added vLLM-specific bug list and key notes. - Update scripts/serve_vllm.py: add WARN comment on mega_moe flag.
101 lines
4.4 KiB
Python
101 lines
4.4 KiB
Python
#!/usr/bin/env python3
|
||
"""
DeepSeek V4 Pro NVFP4 — vLLM OpenAI-compatible server.

Run from the venv on the B200 node:
    source /root/nvidia-meeting/venv/bin/activate
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py

Or in the background:
    nohup python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py \
        > /root/nvidia-meeting/vllm_serve.log 2>&1 &
"""
|
||
|
||
import shlex
import subprocess
import sys
|
||
|
||
# ── Patch: Add compress_ratios to DeepseekV4Config ──────────────────────────
# transformers 5.8.0 renamed compress_ratios → compress_rates (dict format).
# vllm 0.20.2 still expects compress_ratios as a list indexed by layer_id.
# We patch the Config class so that every instance also carries
# compress_ratios, converted from the new dict format back to the list
# format vllm expects.
|
||
import transformers
|
||
def compress_ratios_from_rates(rates, n_layers):
    """Expand a ``compress_rates`` dict into a per-layer ``compress_ratios`` list.

    Layout produced (index = layer id), following the original author's
    note "V4 pattern: layers 0-1=128, then alternating 4/128, last=0":

    * layers 0 and 1 get the heavy rate;
    * the final layer (``n_layers - 1``) gets 0, unless it is layer 0 or 1,
      in which case the first rule wins;
    * every other layer gets the compressed-sparse rate when its index is
      even and the heavy rate when it is odd.

    Args:
        rates: Mapping that may contain ``'compressed_sparse_attention'``
            (default 4) and ``'heavily_compressed_attention'`` (default 128).
        n_layers: Number of entries to generate.

    Returns:
        A new list with ``n_layers`` entries (empty when ``n_layers`` is 0).
    """
    light = rates.get('compressed_sparse_attention', 4)
    heavy = rates.get('heavily_compressed_attention', 128)
    last = n_layers - 1

    def ratio_for(layer_id):
        if layer_id < 2:
            return heavy
        if layer_id == last:
            return 0
        return light if layer_id % 2 == 0 else heavy

    return [ratio_for(layer_id) for layer_id in range(n_layers)]


def ensure_compress_ratios(config):
    """Give *config* a list-valued ``compress_ratios`` attribute if possible.

    * If ``config.compress_ratios`` is already a list, nothing is changed.
    * If ``config.compress_rates`` is a dict, it is expanded with
      :func:`compress_ratios_from_rates`, using ``config.num_hidden_layers``
      (falling back to 61 when that attribute is absent).
    * If ``config.compress_rates`` is a list, that same list object is
      exposed as ``compress_ratios``.
    * Otherwise *config* is left untouched.

    Args:
        config: Any object with settable attributes.
    """
    if isinstance(getattr(config, 'compress_ratios', None), list):
        return

    rates = getattr(config, 'compress_rates', None)
    if isinstance(rates, dict):
        n_layers = getattr(config, 'num_hidden_layers', 61)
        config.compress_ratios = compress_ratios_from_rates(rates, n_layers)
    elif isinstance(rates, list):
        config.compress_ratios = rates


# NOTE(review): this patch changes DeepseekV4Config only inside the current
# interpreter. The server itself is started further down as a separate Python
# process (sys.executable -m ... via subprocess), and a monkey-patch made here
# is not inherited by a freshly started interpreter — confirm the patch
# actually reaches the process that builds the config.
try:
    # Only the import can raise ImportError, so it is the only guarded line.
    from transformers import DeepseekV4Config
except ImportError:
    print("⚠ DeepseekV4Config not found, skipping compress_ratios patch")
else:
    _orig_init = DeepseekV4Config.__init__

    def _patched_init(self, *args, **kwargs):
        """Run the original ``__init__``, then backfill ``compress_ratios``."""
        _orig_init(self, *args, **kwargs)
        ensure_compress_ratios(self)

    DeepseekV4Config.__init__ = _patched_init
    print("✓ Patched DeepseekV4Config.__init__ to add compress_ratios")
|
||
|
||
# Filesystem path passed to the vLLM server through its --model flag.
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
|
||
|
||
# These flags are critical for V4 — do not change without understanding why:
#   --trust-remote-code        V4 needs custom modeling code
#   --kv-cache-dtype fp8       Match our kv_cache_qformat=fp8_cast quantization
#   --block-size 256           V4 recommended block size
#   --enable-expert-parallel   Distribute expert computation across GPUs (critical for 256-expert MoE)
#   --tensor-parallel-size 8   8× B200
#   --compilation-config       CUDA graphs for throughput — FULL_AND_PIECEWISE + all custom ops
#   --attention_config         FP4 indexer cache for V4 MLA attention
#   --moe-backend              deep_gemm_mega_moe — optimized MoE kernel for Blackwell
#                              (WARN: there is no NVFP4 mega_moe kernel; see the note on this flag in cmd)
#   --tokenizer-mode           deepseek_v4 — V4-specific tokenizer
#   --tool-call-parser         deepseek_v4 — native tool calling
#   --enable-auto-tool-choice  Auto tool choice for function calling
#   --reasoning-parser         deepseek_v4 — reasoning/thinking output parsing
#   --speculative_config       MTP speculative decoding (2 speculative tokens)
|
||
|
||
# Argument vector for the vLLM OpenAI-compatible API server, run with the same
# Python interpreter that is executing this script.
cmd = [
    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
    "--model", MODEL,
    "--trust-remote-code",
    "--kv-cache-dtype", "fp8",
    "--block-size", "256",
    "--enable-expert-parallel",
    "--tensor-parallel-size", "8",
    "--compilation-config", '{"cudagraph_mode":"FULL_AND_PIECEWISE", "custom_ops":["all"]}',
    "--attention_config.use_fp4_indexer_cache=True",
    "--moe-backend", "deep_gemm_mega_moe",  # WARN: No NVFP4 mega_moe kernel. Use docker-compose (omits this flag) instead.
    "--tokenizer-mode", "deepseek_v4",
    "--tool-call-parser", "deepseek_v4",
    "--enable-auto-tool-choice",
    "--reasoning-parser", "deepseek_v4",
    "--speculative_config", '{"method":"mtp","num_speculative_tokens":2}',
    "--host", "0.0.0.0",
    "--port", "8000",
]

print(f"Starting vLLM server for {MODEL}")
# shlex.join quotes the arguments that contain spaces or double quotes (the
# JSON config values), so the echoed line can be pasted into a shell as-is.
print(f"Command: {shlex.join(cmd)}")
# This script does not redirect output itself; the path below is only where
# output lands when the script is started with the redirect shown in the
# module docstring's background recipe.
print("Log: /root/nvidia-meeting/vllm_serve.log")
print()

# Block until the server process ends and exit with its return code.
sys.exit(subprocess.call(cmd))
|