#!/usr/bin/env python3
"""DeepSeek V4 Pro NVFP4 — vLLM OpenAI-compatible server.

Run from the venv on the B200 node:

    source /root/nvidia-meeting/venv/bin/activate
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py

Or in the background:

    nohup python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py \
        > /root/nvidia-meeting/vllm_serve.log 2>&1 &

The script first monkey-patches ``transformers.DeepseekV4Config`` (see
``patch_deepseek_v4_config``) and then runs the vLLM API server *in this same
interpreter*, so that the patch is actually visible to vLLM.
"""

import functools
import runpy
import shlex
import sys

MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
LOG_PATH = "/root/nvidia-meeting/vllm_serve.log"
SERVER_MODULE = "vllm.entrypoints.openai.api_server"

# Layer count used when the config object does not carry num_hidden_layers.
DEFAULT_NUM_LAYERS = 61


# ── Patch: add compress_ratios to DeepseekV4Config ──────────────────────────
# transformers 5.8.0 renamed compress_ratios → compress_rates (dict format).
# vllm 0.20.2 still expects compress_ratios as a list indexed by layer_id.
# We wrap the config's __init__ so every instance also gets a compress_ratios
# attribute, converted from the new dict format to the list format.


def build_compress_ratios(rates, n_layers=DEFAULT_NUM_LAYERS):
    """Expand a ``compress_rates`` dict into a per-layer ``compress_ratios`` list.

    V4 pattern: layers 0-1 use the heavy ratio, the last layer uses 0, and the
    layers in between alternate (even index → sparse ratio, odd index → heavy
    ratio). The "first two layers" rule takes precedence over the "last layer"
    rule, which only matters for ``n_layers <= 2``.

    Args:
        rates: Mapping that may contain ``compressed_sparse_attention``
            (default 4) and ``heavily_compressed_attention`` (default 128).
        n_layers: Number of layers to generate entries for.

    Returns:
        A list of ``n_layers`` ratios, one per layer index.
    """
    sparse = rates.get("compressed_sparse_attention", 4)
    heavy = rates.get("heavily_compressed_attention", 128)
    last = n_layers - 1

    def ratio_for(layer):
        if layer < 2:
            return heavy
        if layer == last:
            return 0
        return sparse if layer % 2 == 0 else heavy

    return [ratio_for(layer) for layer in range(n_layers)]


def ensure_compress_ratios(config):
    """Make sure ``config.compress_ratios`` is a list, deriving it if needed.

    * An existing list-valued ``compress_ratios`` is left untouched.
    * A dict-valued ``compress_rates`` is expanded via ``build_compress_ratios``.
    * A list-valued ``compress_rates`` is reused as-is (same list object).
    * Anything else leaves the config unchanged.
    """
    if isinstance(getattr(config, "compress_ratios", None), list):
        return
    rates = getattr(config, "compress_rates", None)
    if isinstance(rates, dict):
        n_layers = getattr(config, "num_hidden_layers", DEFAULT_NUM_LAYERS)
        config.compress_ratios = build_compress_ratios(rates, n_layers)
    elif isinstance(rates, list):
        config.compress_ratios = rates


def patch_deepseek_v4_config():
    """Wrap ``DeepseekV4Config.__init__`` so instances expose ``compress_ratios``.

    Returns:
        True if the patch is in place, False if ``transformers`` (or its
        ``DeepseekV4Config``) could not be imported and the patch was skipped.
    """
    try:
        # Imported lazily so a missing/old transformers is reported, not fatal.
        from transformers import DeepseekV4Config
    except ImportError:
        print("⚠ DeepseekV4Config not found, skipping compress_ratios patch")
        return False

    original_init = DeepseekV4Config.__init__
    if getattr(original_init, "_adds_compress_ratios", False):
        return True  # already patched; do not stack wrappers

    # wraps() keeps the original signature visible to code that introspects it.
    @functools.wraps(original_init)
    def patched_init(self, *args, **kwargs):
        original_init(self, *args, **kwargs)
        ensure_compress_ratios(self)

    patched_init._adds_compress_ratios = True
    DeepseekV4Config.__init__ = patched_init
    print("✓ Patched DeepseekV4Config.__init__ to add compress_ratios")
    return True


def build_command(model=MODEL):
    """Return the vLLM API-server command line as an argv list.

    These flags are critical for V4 — do not change without understanding why:
      --trust-remote-code        V4 needs custom modeling code
      --kv-cache-dtype fp8       Match our kv_cache_qformat=fp8_cast quantization
      --block-size 256           V4 recommended block size
      --enable-expert-parallel   Distribute expert computation across GPUs
                                 (critical for 256-expert MoE)
      --tensor-parallel-size 8   8× B200
      --compilation-config       CUDA graphs for throughput —
                                 FULL_AND_PIECEWISE + all custom ops
      --attention_config         FP4 indexer cache for V4 MLA attention
      --moe-backend              deep_gemm_mega_moe — optimized MoE kernel for Blackwell
      --tokenizer-mode           deepseek_v4 — V4-specific tokenizer
      --tool-call-parser         deepseek_v4 — native tool calling
      --enable-auto-tool-choice  Auto tool choice for function calling
      --reasoning-parser         deepseek_v4 — reasoning/thinking output parsing
      --speculative_config       MTP speculative decoding (2 speculative tokens)
    """
    return [
        sys.executable, "-m", SERVER_MODULE,
        "--model", model,
        "--trust-remote-code",
        "--kv-cache-dtype", "fp8",
        "--block-size", "256",
        "--enable-expert-parallel",
        "--tensor-parallel-size", "8",
        "--compilation-config",
        '{"cudagraph_mode":"FULL_AND_PIECEWISE", "custom_ops":["all"]}',
        "--attention_config.use_fp4_indexer_cache=True",
        # WARN: No NVFP4 mega_moe kernel. Use docker-compose (omits this flag) instead.
        "--moe-backend", "deep_gemm_mega_moe",
        "--tokenizer-mode", "deepseek_v4",
        "--tool-call-parser", "deepseek_v4",
        "--enable-auto-tool-choice",
        "--reasoning-parser", "deepseek_v4",
        "--speculative_config", '{"method":"mtp","num_speculative_tokens":2}',
        "--host", "0.0.0.0",
        "--port", "8000",
    ]


def main():
    """Apply the config patch, print the launch banner, and run the server.

    The server module is executed in-process (as ``__main__``) rather than via
    ``subprocess``: a child interpreter would start from a clean slate and
    never see the ``DeepseekV4Config`` patch applied here.

    NOTE(review): this relies on vLLM constructing the HF config in the
    API-server process (instances carry ``compress_ratios`` as plain instance
    state from then on) — confirm against the installed vLLM version.

    Returns:
        0 if the server module returns normally; a ``SystemExit`` raised by
        the server propagates unchanged.
    """
    patch_deepseek_v4_config()

    cmd = build_command()
    print(f"Starting vLLM server for {MODEL}")
    # shlex.join quotes the JSON arguments so the line can be pasted in a shell.
    print(f"Command: {shlex.join(cmd)}")
    print(f"Log: {LOG_PATH}")
    print()
    # stdout is block-buffered when redirected to a file; flush so the banner
    # lands in the log before any server output.
    sys.stdout.flush()

    # Hand the flags (everything after "<python> -m <module>") to the server.
    sys.argv = [SERVER_MODULE, *cmd[3:]]
    runpy.run_module(SERVER_MODULE, run_name="__main__", alter_sys=True)
    return 0


if __name__ == "__main__":
    sys.exit(main())