From 581d87f9a6e35eeed303b260478c4b6305bbfc91 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 19 May 2026 01:18:54 +0000 Subject: [PATCH] Remove warmup forward from process_weights_after_loading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The warmup custom op call hit cudaErrorIllegalAddress because our custom op GEMM implementation doesn't match the runner's call convention. Skip warmup for now — MoE kernel warmup handles CuTeDSL JIT cleanup. --- vllm/kernels/linear/nvfp4/cutedsl.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/vllm/kernels/linear/nvfp4/cutedsl.py b/vllm/kernels/linear/nvfp4/cutedsl.py index c8368918..45aa119f 100644 --- a/vllm/kernels/linear/nvfp4/cutedsl.py +++ b/vllm/kernels/linear/nvfp4/cutedsl.py @@ -189,21 +189,6 @@ class CuTeDSLNvFp4LinearKernel(NvFp4LinearKernel): layer._cutedsl_global_scale_b = runner._gsb layer._cutedsl_activation_global_scale = activation_global_scale - # Warmup: CuTeDSL cute.compile corrupts GPU memory during JIT. - # Run a warmup forward to trigger compilation, then synchronize - # and verify GPU health. Matches cutedsl/runner.py MoE pattern. - with torch.no_grad(): - warmup_x = torch.randn(1, in_features, dtype=torch.bfloat16, - device=device) - _ = torch.ops.cutedsl.nvfp4_linear( - warmup_x, runner._mat_b, runner._scale_b, runner._gsb, - activation_global_scale, - ) - torch.cuda.synchronize() - # Verify GPU is still healthy after CuTeDSL JIT - test = torch.ones(1, device=device) + torch.ones(1, device=device) - assert test.item() == 2.0, "GPU corruption after CuTeDSL JIT" - # Replace weight with dummy BF16 (vLLM module introspection may need it) layer.weight = torch.nn.Parameter( torch.zeros(out_features, in_features, dtype=torch.bfloat16,