[NVIDIA] Support SiluMul + NVFP4 quant fusion (#23671)

Signed-off-by: jindih <jindih@nvidia.com>
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: jindih <jindih@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Luka Govedic <lgovedic@redhat.com>
This commit is contained in:
elvischenv
2025-08-29 03:36:50 +08:00
committed by GitHub
parent 57d4ede520
commit 16a45b3a28
11 changed files with 746 additions and 64 deletions

View File

@@ -885,6 +885,10 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
layer.alpha = Parameter(layer.input_scale * layer.weight_scale_2,
requires_grad=False)
+ # Calculate `1 / input_scale` so that we don't need to do so at runtime
+ layer.input_scale_inv = Parameter(
+     (1 / layer.input_scale).to(torch.float32), requires_grad=False)
# Swizzle the weight blockscale.
# contracting dimension is input dimension
# block_size = 16;
@@ -941,8 +945,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
output_shape = [x.shape[0], layer.weight.shape[0]]
# quantize BF16 or FP16 to (FP4 and interleaved block scale)
- s_quant = 1 / layer.input_scale
- x_fp4, x_blockscale = scaled_fp4_quant(x, s_quant)
+ x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_scale_inv)
# validate dtypes of quantized input, input block scale,
# weight and weight_blockscale