[Kernel][Performance] Enable smaller Scaling Factor tiling for NVFP4 small-batch decoding (#30885)

Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
Author: Roberto L. Castro
Date: 2026-01-14 00:22:53 +01:00
Committed by: GitHub
Parent: 2a60ac91d0
Commit: 8ef50d9a6b
9 changed files with 177 additions and 32 deletions
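
Background on the title, as a hedged aside: NVFP4 stores one FP8 scale factor per small block of FP4 values, and those scale factors live in fixed-size tiles (commonly 128 rows by 4 scale columns in the swizzled layout). The back-of-envelope sketch below, which is not code from this commit, illustrates why a smaller tile helps at decode-time batch sizes; the 128-row figure and the padded_sf_rows helper are assumptions for illustration only.

# Illustrative arithmetic only (not from this commit): estimate how many
# rows of scale-factor storage a batch of M rows touches once padded up
# to the SF tile height. The 128-row tile height is an assumption based
# on the common NVFP4 swizzled scale layout.
def padded_sf_rows(m: int, tile_rows: int) -> int:
    # Round M up to the next multiple of the tile height.
    return ((m + tile_rows - 1) // tile_rows) * tile_rows

# A 4-token decode step pads to 128 scale rows with the large tile but
# only 8 with a smaller one, so far less scale memory is written/read.
assert padded_sf_rows(4, 128) == 128
assert padded_sf_rows(4, 8) == 8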


@@ -23,7 +23,10 @@ from vllm.model_executor.parameter import (
     ModelWeightParameter,
     PerTensorScaleParameter,
 )
-from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm, has_flashinfer
+from vllm.utils.flashinfer import (
+    flashinfer_scaled_fp4_mm,
+    has_flashinfer,
+)
 
 logger = init_logger(__name__)
@@ -187,7 +190,9 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
         output_shape = [*x.shape[:-1], layer.weight_packed.shape[0]]
         # quantize BF16 or FP16 to (FP4 and interleaved block scale)
-        x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale)
+        x_fp4, x_blockscale = scaled_fp4_quant(
+            x, layer.input_global_scale, self.backend
+        )
         mm_args = (
             x_fp4,
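
As a rough illustration of the call pattern this hunk introduces (a third positional backend argument to scaled_fp4_quant), here is a self-contained sketch. Only has_flashinfer and the three-argument call shape come from this diff; resolve_fp4_backend and the backend string values are hypothetical.

from vllm.utils.flashinfer import has_flashinfer

def resolve_fp4_backend() -> str:
    # Hypothetical helper: prefer FlashInfer when it is available and
    # fall back to a CUTLASS path otherwise. The string names are
    # illustrative, not vLLM's actual backend identifiers.
    return "flashinfer" if has_flashinfer() else "cutlass"

# The scheme would then forward the choice at quantization time, as in
# the hunk above:
#     x_fp4, x_blockscale = scaled_fp4_quant(
#         x, layer.input_global_scale, self.backend
#     )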


@@ -1291,7 +1291,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
         output_shape = [x.shape[0], layer.weight.shape[0]]
         # quantize BF16 or FP16 to (FP4 and interleaved block scale)
-        x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_scale_inv)
+        x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_scale_inv, self.backend)
         # validate dtypes of quantized input, input block scale,
         # weight and weight_blockscale
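
The trailing comment refers to a dtype check that follows in the file. Below is a minimal sketch of such a check, assuming the usual NVFP4 representation (FP4 values packed two per byte as uint8, block scales in FP8 E4M3); neither the helper name nor the exact assertions are taken from this diff.

import torch

def validate_nvfp4_dtypes(x_fp4, x_blockscale, weight, weight_scale) -> None:
    # Assumed representation: two FP4 values packed per uint8 byte, with
    # one float8_e4m3fn scale per block. These dtypes are an assumption
    # based on common NVFP4 layouts, not read from this commit.
    assert x_fp4.dtype == torch.uint8, "quantized input should be packed FP4"
    assert x_blockscale.dtype == torch.float8_e4m3fn, "input scale should be FP8"
    assert weight.dtype == torch.uint8, "weight should be packed FP4"
    assert weight_scale.dtype == torch.float8_e4m3fn, "weight scale should be FP8"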