[NVIDIA] Fix Llama4 Scout FP4 functionality issues (#21499)

Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-07-30 22:33:40 +08:00
parent 8f4a1c9a04
commit ff08e51940
3 changed files with 218 additions and 69 deletions
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -778,8 +778,6 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
        # Swizzle the weight blockscale.
        # contracting dimension is input dimension
        # block_size = 16;
-        assert (layer.weight_scale.shape[1] % 16 == 0), (
-            "Expected weight_scale.dim(1) to be divisible by 16")
        assert (layer.weight_scale.dtype == torch.float8_e4m3fn), (
            "Weight Block scale must be represented as FP8-E4M3")
        swizzled_weight_scale = swizzle_blockscale(layer.weight_scale)