# scripts/serve_vllm.py

#!/usr/bin/env python3
"""
DeepSeek V4 Pro NVFP4 — vLLM OpenAI-compatible server.

Run from the venv on the B200 node:
    source /root/nvidia-meeting/venv/bin/activate
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py

Or in the background:
    nohup python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py \
        > /root/nvidia-meeting/vllm_serve.log 2>&1 &
"""

import functools
import runpy
import shlex
import subprocess
import sys

# ── Patch: Add compress_ratios to DeepseekV4Config ──────────────────────────
# transformers 5.8.0 renamed compress_ratios → compress_rates (dict format).
# vllm 0.20.2 still expects compress_ratios as a list indexed by layer_id.
# We wrap the Config class's __init__ so that every instance also gets a
# compress_ratios attribute, converting the new dict format back to the
# per-layer list format vllm expects.
import transformers
def build_compress_ratios(rates, n_layers):
    """Expand a ``compress_rates`` dict into a per-layer ratio list.

    Layout produced (V4 pattern):
      * layers 0 and 1 use the ``heavily_compressed_attention`` rate,
      * the final layer gets ``0``,
      * every other layer alternates: even index -> the
        ``compressed_sparse_attention`` rate, odd index -> the
        ``heavily_compressed_attention`` rate.

    Args:
        rates: Mapping that may contain the keys
            ``'compressed_sparse_attention'`` (default 4) and
            ``'heavily_compressed_attention'`` (default 128).
        n_layers: Number of entries to generate.

    Returns:
        A list of ``n_layers`` ratios, indexed by layer id.
    """
    sparse_rate = rates.get('compressed_sparse_attention', 4)
    heavy_rate = rates.get('heavily_compressed_attention', 128)
    last_layer = n_layers - 1

    ratios = []
    for layer_id in range(n_layers):
        if layer_id < 2:
            # The "first two layers" rule wins over the "last layer" rule.
            ratios.append(heavy_rate)
        elif layer_id == last_layer:
            ratios.append(0)
        else:
            ratios.append(sparse_rate if layer_id % 2 == 0 else heavy_rate)
    return ratios


def patch_compress_ratios(config_cls):
    """Wrap ``config_cls.__init__`` so instances also carry ``compress_ratios``.

    After the original ``__init__`` runs, the wrapper:
      * leaves an existing list-valued ``compress_ratios`` untouched,
      * converts a dict-valued ``compress_rates`` via
        :func:`build_compress_ratios`, falling back to 61 layers when the
        instance has no ``num_hidden_layers``,
      * aliases a list-valued ``compress_rates`` as ``compress_ratios``,
      * otherwise sets nothing.

    Args:
        config_cls: The class whose ``__init__`` is replaced in place.
    """
    original_init = config_cls.__init__

    # functools.wraps keeps the original name/docstring and sets __wrapped__,
    # so signature introspection still sees the real __init__ parameters
    # instead of a bare (*args, **kwargs).
    @functools.wraps(original_init)
    def _patched_init(self, *args, **kwargs):
        original_init(self, *args, **kwargs)

        # If compress_ratios already exists as a list, leave it alone.
        if isinstance(getattr(self, 'compress_ratios', None), list):
            return

        rates = getattr(self, 'compress_rates', None)
        if isinstance(rates, dict):
            n_layers = getattr(self, 'num_hidden_layers', 61)
            self.compress_ratios = build_compress_ratios(rates, n_layers)
        elif isinstance(rates, list):
            # Already in list form; expose it under the old attribute name.
            self.compress_ratios = rates

    config_cls.__init__ = _patched_init


# Only the import can legitimately fail here, so keep the try body minimal.
try:
    from transformers import DeepseekV4Config
except ImportError:
    print("⚠ DeepseekV4Config not found, skipping compress_ratios patch")
else:
    patch_compress_ratios(DeepseekV4Config)
    print("✓ Patched DeepseekV4Config.__init__ to add compress_ratios")

MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"

# These flags are critical for V4 — do not change without understanding why:
# --trust-remote-code         V4 needs custom modeling code
# --kv-cache-dtype fp8        Match our kv_cache_qformat=fp8_cast quantization
# --block-size 256            V4 recommended block size
# --enable-expert-parallel    Distribute expert computation across GPUs (critical for 256-expert MoE)
# --tensor-parallel-size 8    8× B200
# --compilation-config        CUDA graphs for throughput — FULL_AND_PIECEWISE + all custom ops
# --attention_config          FP4 indexer cache for V4 MLA attention
# --moe-backend               deep_gemm_mega_moe — optimized MoE kernel for Blackwell
# --tokenizer-mode            deepseek_v4 — V4-specific tokenizer
# --tool-call-parser          deepseek_v4 — native tool calling
# --enable-auto-tool-choice   Auto tool choice for function calling
# --reasoning-parser          deepseek_v4 — reasoning/thinking output parsing
# --speculative_config        MTP speculative decoding (2 speculative tokens)

cmd = [
    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
    "--model", MODEL,
    "--trust-remote-code",
    "--kv-cache-dtype", "fp8",
    "--block-size", "256",
    "--enable-expert-parallel",
    "--tensor-parallel-size", "8",
    "--compilation-config", '{"cudagraph_mode":"FULL_AND_PIECEWISE", "custom_ops":["all"]}',
    "--attention_config.use_fp4_indexer_cache=True",
    "--moe-backend", "deep_gemm_mega_moe",  # WARN: No NVFP4 mega_moe kernel. Use docker-compose (omits this flag) instead.
    "--tokenizer-mode", "deepseek_v4",
    "--tool-call-parser", "deepseek_v4",
    "--enable-auto-tool-choice",
    "--reasoning-parser", "deepseek_v4",
    "--speculative_config", '{"method":"mtp","num_speculative_tokens":2}',
    "--host", "0.0.0.0",
    "--port", "8000",
]

print(f"Starting vLLM server for {MODEL}")
print(f"Command: {' '.join(cmd)}")
print(f"Log: /root/nvidia-meeting/vllm_serve.log")
print()

sys.exit(subprocess.call(cmd))