From 4d4cfa6b28d284002cfe9b0084237cecf398cd88 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Sat, 16 May 2026 05:21:11 +0000
Subject: [PATCH] fix: tqdm over MoE layer warmup, compile every layer, no
 print spam
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The outer loop tqdm now covers the full finalize_weights + warmup for
each MoE layer. CuTeDSL caches by (M,N,K) so every layer shape gets
compiled during warmup — no RPC timeouts during inference.

  (JIT compile)NVFP4 MoE layers:  50%|██████████░░░░░░░░░░| 31/61
---
 vllm/patches/deepseek_v4.py | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py
index 0ad96432..9054a18c 100644
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -221,7 +221,7 @@ class DeepseekV4MegaMoEExperts(nn.Module):
     It handles NVFP4 natively with full Blackwell pipeline overlap (TMA → MMA → Epilogue).
     This replaces the broken C++ CUTLASS kernel (see README.md for the full story).
     """
-    _cutedsl_compiled: bool = False
+    _cutedsl_runner: 'CuTeDSLMoERunner | None' = None
 
     # NVFP4 E2M1 lookup table (positive values, sign from bit 3)
     E2M1_LUT = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
@@ -330,7 +330,6 @@ class DeepseekV4MegaMoEExperts(nn.Module):
         set_weight_attrs(self.w2_input_scale, weight_attrs)
 
         self._cutedsl_runner = None
-        self._cutedsl_compiled = False
 
         # Register in the static forward context so the custom-op wrapper
         # can look up this module by name from within a torch.compile graph.
@@ -506,18 +505,15 @@ class DeepseekV4MegaMoEExperts(nn.Module):
         self.w2_input_scale = None
 
         # Warm up the CuTeDSL kernel (JIT compiles MLIR→PTX on first call).
-        # CuTeDSL caches compiled kernels by (M, N, K) shape, so different
-        # layer shapes may trigger additional compiles. We only print the
-        # compile message once to avoid spam (61 layers × 8 ranks).
+        # CuTeDSL caches by (M, N, K) shape — different shapes trigger new
+        # compiles. Running warmup on every layer ensures all kernels are
+        # compiled before inference, preventing vLLM RPC timeouts.
         # The MMA tiler needs >= 128 tokens; using 128 for the warmup.
-        if not DeepseekV4MegaMoEExperts._cutedsl_compiled:
-            DeepseekV4MegaMoEExperts._cutedsl_compiled = True
-            print("  Compiling CuTeDSL NVFP4 MegaMoE kernels (one-time JIT, ~1-2 min)...", flush=True)
+        device = self._cutedsl_runner.l1_fp4[0].device
+        dummy_hidden = torch.randn(128, self.hidden_size, dtype=torch.bfloat16, device=device)
+        dummy_ids = torch.zeros(128, 1, dtype=torch.int32, device=device)
+        dummy_weights = torch.ones(128, 1, dtype=torch.float32, device=device)
         try:
-            device = self._cutedsl_runner.l1_fp4[0].device
-            dummy_hidden = torch.randn(128, self.hidden_size, dtype=torch.bfloat16, device=device)
-            dummy_ids = torch.zeros(128, 1, dtype=torch.int32, device=device)
-            dummy_weights = torch.ones(128, 1, dtype=torch.float32, device=device)
             self._cutedsl_runner.run(dummy_hidden, dummy_weights, dummy_ids, expert_indices=[0])
         except Exception as exc:
             print(f"  CuTeDSL warmup FAILED: {exc}", flush=True)
@@ -1607,7 +1603,7 @@ class DeepseekV4Model(nn.Module):
     def finalize_mega_moe_weights(self) -> None:
         from tqdm import tqdm
         layers = list(islice(self.layers, self.start_layer, self.end_layer))
-        for layer in tqdm(layers, desc="  (view-cast)uint8→NVFP4 experts", unit="layer"):
+        for layer in tqdm(layers, desc="  (JIT compile)NVFP4 MoE layers", unit="layer"):
             layer.ffn.finalize_mega_moe_weights()
 
     def _convert_nvfp4_post_load(self):