[Kernel][Performance] Enable smaller Scaling Factor tiling for NVFP4 small-batch decoding (#30885)
Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
committed by GitHub
parent 2a60ac91d0
commit 8ef50d9a6b
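Before the diff, a hedged sketch (not part of the commit) of the call pattern it introduces: scaled_fp4_quant now also receives the layer's matmul backend, so the quantization kernel can pick a smaller scaling-factor tile for small decode batches. The import path for the op, the helper name, and the backend string values below are illustrative assumptions, not confirmed vLLM API.

import torch

from vllm import _custom_ops as ops
from vllm.utils.flashinfer import has_flashinfer


def pick_fp4_backend() -> str:
    # Illustrative assumption: prefer the FlashInfer FP4 GEMM when it is
    # available, otherwise fall back to a CUTLASS-style path.
    return "flashinfer" if has_flashinfer() else "cutlass"


def quantize_activation(x: torch.Tensor, input_global_scale: torch.Tensor):
    # After this commit the quantization op also takes the backend, letting the
    # interleaved block-scale (scaling-factor) layout use a smaller tile when
    # the batch dimension (M) is small, as in decoding.
    backend = pick_fp4_backend()
    return ops.scaled_fp4_quant(x, input_global_scale, backend)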
@@ -23,7 +23,10 @@ from vllm.model_executor.parameter import (
     ModelWeightParameter,
     PerTensorScaleParameter,
 )
-from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm, has_flashinfer
+from vllm.utils.flashinfer import (
+    flashinfer_scaled_fp4_mm,
+    has_flashinfer,
+)

 logger = init_logger(__name__)

@@ -187,7 +190,9 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
         output_shape = [*x.shape[:-1], layer.weight_packed.shape[0]]

         # quantize BF16 or FP16 to (FP4 and interleaved block scale)
-        x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale)
+        x_fp4, x_blockscale = scaled_fp4_quant(
+            x, layer.input_global_scale, self.backend
+        )

         mm_args = (
             x_fp4,
@@ -1291,7 +1291,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
         output_shape = [x.shape[0], layer.weight.shape[0]]

         # quantize BF16 or FP16 to (FP4 and interleaved block scale)
-        x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_scale_inv)
+        x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_scale_inv, self.backend)

         # validate dtypes of quantized input, input block scale,
         # weight and weight_blockscale
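A minimal usage sketch of the changed call, assuming a CUDA build of vLLM with NVFP4 support; the tensor shapes, the global-scale value, and the backend value in the last call are illustrative assumptions (the diff passes the layer's self.backend):

import torch

from vllm import _custom_ops as ops

# A small decode batch (M = 4) in BF16 plus a per-tensor global scale,
# mirroring the inputs the changed apply() paths hand to the op.
x = torch.randn(4, 4096, dtype=torch.bfloat16, device="cuda")
input_global_scale = torch.tensor(448.0, dtype=torch.float32, device="cuda")  # illustrative value

# Post-commit form: the extra backend argument lets the kernel choose the
# smaller scaling-factor tiling for small M instead of the default layout.
# "flashinfer" here is an assumed value, not a confirmed accepted string.
x_fp4, x_blockscale = ops.scaled_fp4_quant(x, input_global_scale, "flashinfer")
print(x_fp4.shape, x_blockscale.shape)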