From cebc586014f30f7feadf66e1690bd72c09286622 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Tue, 19 May 2026 07:28:57 +0000
Subject: [PATCH] Fix OOM: use 1-token warmup sample + free immediately
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

8 tokens * 7168 hidden * 2 bytes (BF16) = ~112 KiB per layer; across ~40
NVFP4 layers that is only ~4.4 MiB for the warmup samples.
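But the dummy weight param (out_features * in_features * 2 bytes BF16) was
the real killer — each layer allocated a BF16 dummy of its full weight shape.
Note: this patch only shrinks and frees the warmup sample; the dummy weight
allocation itself is unchanged here.
With 1 token the warmup still gets a valid activation global scale (gs), and
`del sample` followed by torch.cuda.empty_cache() releases the sample
tensor's memory before KV cache allocation.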
---
 vllm/cutedsl_quant_method.py         | 5 ++++-
 vllm/kernels/linear/nvfp4/cutedsl.py | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/cutedsl_quant_method.py b/vllm/cutedsl_quant_method.py
index 6e032a0e..d1c43cd7 100644
--- a/vllm/cutedsl_quant_method.py
+++ b/vllm/cutedsl_quant_method.py
@@ -103,10 +103,13 @@ class CuTeDSLNvfp4Method(LinearMethodBase):
         # match what quantize_activation_nvfp4 expects at runtime. Using it
         # produces garbage output (empty EOS tokens). The correct approach is
         # a warmup forward pass that measures the actual activation distribution.
+        # Use only 1 token to minimize GPU memory overhead during weight loading.
         with torch.no_grad():
-            sample = torch.randn(min(8, 256), in_features,
+            sample = torch.randn(1, in_features,
                                  dtype=torch.bfloat16, device=device) * 2.0
             runner.compute_activation_global_scale(sample)
+            del sample
+            torch.cuda.empty_cache()
 
         # Replace weight with dummy BF16 (needed by vLLM module introspection)
         layer.weight = torch.nn.Parameter(
diff --git a/vllm/kernels/linear/nvfp4/cutedsl.py b/vllm/kernels/linear/nvfp4/cutedsl.py
index 060a6f99..7e3718ee 100644
--- a/vllm/kernels/linear/nvfp4/cutedsl.py
+++ b/vllm/kernels/linear/nvfp4/cutedsl.py
@@ -85,12 +85,15 @@ class CuTeDSLNvFp4LinearKernel(NvFp4LinearKernel):
         # match what quantize_activation_nvfp4 expects at runtime. Using it
         # produces garbage output (empty EOS tokens). The correct approach is
         # a warmup forward pass that measures the actual activation distribution.
+        # Use only 1 token to minimize GPU memory overhead during weight loading.
         with torch.no_grad():
             sample = torch.randn(
-                min(8, 256), in_features,
+                1, in_features,
                 dtype=torch.bfloat16, device=str(device),
             ) * 2.0
             runner.compute_activation_global_scale(sample)
+            del sample
+            torch.cuda.empty_cache()
 
         # Register the runner and store the ID (not the runner itself)
         layer._cutedsl_runner_id = register_runner(runner)