[Kernel][Performance] Enable smaller Scaling Factor tiling for NVFP4 small-batch decoding (#30885)
Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
committed by GitHub
parent 2a60ac91d0
commit 8ef50d9a6b
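Before the diff, a hedged sketch (not part of the commit) of the call pattern it introduces: scaled_fp4_quant now also receives the layer's matmul backend, so the quantization kernel can pick a smaller scaling-factor tile for small decode batches. The import path for the op, the helper name, and the backend string values below are illustrative assumptions, not confirmed vLLM API.

import torch

from vllm import _custom_ops as ops
from vllm.utils.flashinfer import has_flashinfer


def pick_fp4_backend() -> str:
    # Illustrative assumption: prefer the FlashInfer FP4 GEMM when it is
    # available, otherwise fall back to a CUTLASS-style path.
    return "flashinfer" if has_flashinfer() else "cutlass"


def quantize_activation(x: torch.Tensor, input_global_scale: torch.Tensor):
    # After this commit the quantization op also takes the backend, letting the
    # interleaved block-scale (scaling-factor) layout use a smaller tile when
    # the batch dimension (M) is small, as in decoding.
    backend = pick_fp4_backend()
    return ops.scaled_fp4_quant(x, input_global_scale, backend)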
@@ -23,7 +23,10 @@ from vllm.model_executor.parameter import (
     ModelWeightParameter,
     PerTensorScaleParameter,
 )
-from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm, has_flashinfer
+from vllm.utils.flashinfer import (
+    flashinfer_scaled_fp4_mm,
+    has_flashinfer,
+)

 logger = init_logger(__name__)

@@ -187,7 +190,9 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
         output_shape = [*x.shape[:-1], layer.weight_packed.shape[0]]

         # quantize BF16 or FP16 to (FP4 and interleaved block scale)
-        x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale)
+        x_fp4, x_blockscale = scaled_fp4_quant(
+            x, layer.input_global_scale, self.backend
+        )

         mm_args = (
             x_fp4,
@@ -1291,7 +1291,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
         output_shape = [x.shape[0], layer.weight.shape[0]]

         # quantize BF16 or FP16 to (FP4 and interleaved block scale)
-        x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_scale_inv)
+        x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_scale_inv, self.backend)

         # validate dtypes of quantized input, input block scale,
         # weight and weight_blockscale
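A minimal usage sketch of the changed call, assuming a CUDA build of vLLM with NVFP4 support; the tensor shapes, the global-scale value, and the backend value in the last call are illustrative assumptions (the diff passes the layer's self.backend):

import torch

from vllm import _custom_ops as ops

# A small decode batch (M = 4) in BF16 plus a per-tensor global scale,
# mirroring the inputs the changed apply() paths hand to the op.
x = torch.randn(4, 4096, dtype=torch.bfloat16, device="cuda")
input_global_scale = torch.tensor(448.0, dtype=torch.float32, device="cuda")  # illustrative value

# Post-commit form: the extra backend argument lets the kernel choose the
# smaller scaling-factor tiling for small M instead of the default layout.
# "flashinfer" here is an assumed value, not a confirmed accepted string.
x_fp4, x_blockscale = ops.scaled_fp4_quant(x, input_global_scale, "flashinfer")
print(x_fp4.shape, x_blockscale.shape)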