[NVIDIA] Fix Llama4 Scout FP4 functionality issues (#21499)

Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
This commit is contained in:
Po-Han Huang (NVIDIA)
2025-07-30 22:33:40 +08:00
committed by GitHub
parent 8f4a1c9a04
commit ff08e51940
3 changed files with 218 additions and 69 deletions

View File

@@ -778,8 +778,6 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
# Swizzle the weight blockscale.
# The contracting dimension is the input dimension.
# block_size = 16
assert (layer.weight_scale.shape[1] % 16 == 0), (
"Expected weight_scale.dim(1) to be divisible by 16")
assert (layer.weight_scale.dtype == torch.float8_e4m3fn), (
"Weight Block scale must be represented as FP8-E4M3")
swizzled_weight_scale = swizzle_blockscale(layer.weight_scale)