diff --git a/vllm/cutedsl_quant_method.py b/vllm/cutedsl_quant_method.py index 6e032a0e..d1c43cd7 100644 --- a/vllm/cutedsl_quant_method.py +++ b/vllm/cutedsl_quant_method.py @@ -103,10 +103,13 @@ class CuTeDSLNvfp4Method(LinearMethodBase): # match what quantize_activation_nvfp4 expects at runtime. Using it # produces garbage output (empty EOS tokens). The correct approach is # a warmup forward pass that measures the actual activation distribution. + # Use only 1 token to minimize GPU memory overhead during weight loading. with torch.no_grad(): - sample = torch.randn(min(8, 256), in_features, + sample = torch.randn(1, in_features, dtype=torch.bfloat16, device=device) * 2.0 runner.compute_activation_global_scale(sample) + del sample + torch.cuda.empty_cache() # Replace weight with dummy BF16 (needed by vLLM module introspection) layer.weight = torch.nn.Parameter( diff --git a/vllm/kernels/linear/nvfp4/cutedsl.py b/vllm/kernels/linear/nvfp4/cutedsl.py index 060a6f99..7e3718ee 100644 --- a/vllm/kernels/linear/nvfp4/cutedsl.py +++ b/vllm/kernels/linear/nvfp4/cutedsl.py @@ -85,12 +85,15 @@ class CuTeDSLNvFp4LinearKernel(NvFp4LinearKernel): # match what quantize_activation_nvfp4 expects at runtime. Using it # produces garbage output (empty EOS tokens). The correct approach is # a warmup forward pass that measures the actual activation distribution. + # Use only 1 token to minimize GPU memory overhead during weight loading. with torch.no_grad(): sample = torch.randn( - min(8, 256), in_features, + 1, in_features, dtype=torch.bfloat16, device=str(device), ) * 2.0 runner.compute_activation_global_scale(sample) + del sample + torch.cuda.empty_cache() # Register the runner and store the ID (not the runner itself) layer._cutedsl_runner_id = register_runner(runner)