Files
deepseek-v4-quant/scripts/serve_vllm.py

60 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
DeepSeek V4 Pro NVFP4 — vLLM OpenAI-compatible server.
Run from the venv on the B200 node:
source /root/nvidia-meeting/venv/bin/activate
python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py
Or in the background:
nohup python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py \
> /root/nvidia-meeting/vllm_serve.log 2>&1 &
"""
import shlex
import subprocess
import sys
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"

# These flags are critical for V4 — do not change without understanding why:
#   --trust-remote-code        V4 needs custom modeling code
#   --kv-cache-dtype fp8       Match our kv_cache_qformat=fp8_cast quantization
#   --block-size 256           V4 recommended block size
#   --enable-expert-parallel   Distribute expert computation across GPUs
#                              (critical for 256-expert MoE)
#   --tensor-parallel-size 8   8× B200
#   --compilation-config       CUDA graphs for throughput — FULL_AND_PIECEWISE
#                              + all custom ops
#   --attention_config         FP4 indexer cache for V4 MLA attention
#   --moe-backend              deep_gemm_mega_moe — optimized MoE kernel for Blackwell
#   --tokenizer-mode           deepseek_v4 — V4-specific tokenizer
#   --tool-call-parser         deepseek_v4 — native tool calling
#   --enable-auto-tool-choice  Auto tool choice for function calling
#   --reasoning-parser         deepseek_v4 — reasoning/thinking output parsing
#   --speculative_config       MTP speculative decoding (2 speculative tokens)
cmd = [
    # Use the interpreter running this script so the server comes from the
    # same environment.
    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
    "--model", MODEL,
    "--trust-remote-code",
    "--kv-cache-dtype", "fp8",
    "--block-size", "256",
    "--enable-expert-parallel",
    "--tensor-parallel-size", "8",
    "--compilation-config", '{"cudagraph_mode":"FULL_AND_PIECEWISE", "custom_ops":["all"]}',
    "--attention_config.use_fp4_indexer_cache=True",
    "--moe-backend", "deep_gemm_mega_moe",
    "--tokenizer-mode", "deepseek_v4",
    "--tool-call-parser", "deepseek_v4",
    "--enable-auto-tool-choice",
    "--reasoning-parser", "deepseek_v4",
    "--speculative_config", '{"method":"mtp","num_speculative_tokens":2}',
    "--host", "0.0.0.0",
    "--port", "8000",
]


def render_command(argv) -> str:
    """Return *argv* as one shell-safe string, for display only.

    Every argument is passed through ``shlex.quote`` so that values containing
    spaces or quotes (the JSON config arguments) survive a copy-paste into a
    shell unchanged. A plain ``" ".join(argv)`` would let the shell strip the
    JSON's double quotes and split it on the embedded space.

    Args:
        argv: Iterable of argument strings.

    Returns:
        The arguments quoted and joined by single spaces; ``""`` if *argv* is
        empty.
    """
    return " ".join(shlex.quote(arg) for arg in argv)


def main() -> int:
    """Print the launch banner, run the vLLM server, and return its exit code.

    Returns:
        The return code of the server process, as reported by
        ``subprocess.call``.
    """
    print(f"Starting vLLM server for {MODEL}")
    print(f"Command: {render_command(cmd)}")
    # This script does not write the log itself; the path below is where output
    # ends up when stdout/stderr are redirected there by the caller.
    print("Log: /root/nvidia-meeting/vllm_serve.log")
    # Flush before starting the child: when stdout is redirected to a file the
    # banner is block-buffered and would otherwise land after the server output.
    print(flush=True)
    return subprocess.call(cmd)


if __name__ == "__main__":
    sys.exit(main())