From 581d87f9a6e35eeed303b260478c4b6305bbfc91 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Tue, 19 May 2026 01:18:54 +0000
Subject: [PATCH] Remove warmup forward from process_weights_after_loading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The warmup call to torch.ops.cutedsl.nvfp4_linear in
process_weights_after_loading hit cudaErrorIllegalAddress because our
custom op GEMM implementation doesn't match the runner's call convention.
Skip the linear-kernel warmup for now — the MoE kernel warmup handles
CuTeDSL JIT cleanup.
---
 vllm/kernels/linear/nvfp4/cutedsl.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/vllm/kernels/linear/nvfp4/cutedsl.py b/vllm/kernels/linear/nvfp4/cutedsl.py
index c8368918..45aa119f 100644
--- a/vllm/kernels/linear/nvfp4/cutedsl.py
+++ b/vllm/kernels/linear/nvfp4/cutedsl.py
@@ -189,21 +189,6 @@ class CuTeDSLNvFp4LinearKernel(NvFp4LinearKernel):
         layer._cutedsl_global_scale_b = runner._gsb
         layer._cutedsl_activation_global_scale = activation_global_scale
 
-        # Warmup: CuTeDSL cute.compile corrupts GPU memory during JIT.
-        # Run a warmup forward to trigger compilation, then synchronize
-        # and verify GPU health. Matches cutedsl/runner.py MoE pattern.
-        with torch.no_grad():
-            warmup_x = torch.randn(1, in_features, dtype=torch.bfloat16,
-                                    device=device)
-            _ = torch.ops.cutedsl.nvfp4_linear(
-                warmup_x, runner._mat_b, runner._scale_b, runner._gsb,
-                activation_global_scale,
-            )
-            torch.cuda.synchronize()
-            # Verify GPU is still healthy after CuTeDSL JIT
-            test = torch.ones(1, device=device) + torch.ones(1, device=device)
-            assert test.item() == 2.0, "GPU corruption after CuTeDSL JIT"
-
         # Replace weight with dummy BF16 (vLLM module introspection may need it)
         layer.weight = torch.nn.Parameter(
             torch.zeros(out_features, in_features, dtype=torch.bfloat16,