[Kernels] Support blocked fp8 quantization for compressed tensors MoE (#25219)
Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
@@ -1014,3 +1014,9 @@ def apply_fp8_block_linear(layer: torch.nn.Module, input: torch.Tensor,
         cutlass_block_fp8_supported=cutlass_block_fp8_supported,
         use_aiter_and_is_supported=use_aiter_and_is_supported,
     )
+
+
+def expert_weight_is_col_major(x: torch.Tensor) -> bool:
+    assert x.dim() == 3
+    b, m, n = x.shape
+    return x.stride(0) == m * n and x.stride(1) == 1 and x.stride(2) == m
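
For context, the new `expert_weight_is_col_major` helper returns True exactly when each expert's (m, n) weight matrix is stored column-major in memory, i.e. per-expert strides of (1, m) with experts contiguous along dim 0. A minimal standalone sketch of that stride pattern (the tensor shapes here are illustrative, not taken from the PR):

import torch

def expert_weight_is_col_major(x: torch.Tensor) -> bool:
    assert x.dim() == 3
    b, m, n = x.shape
    return x.stride(0) == m * n and x.stride(1) == 1 and x.stride(2) == m

# Transposing the last two dims of a contiguous (experts, n, m) tensor
# yields strides (m * n, 1, m): column-major within each expert matrix.
w = torch.randn(8, 64, 128).transpose(1, 2)  # shape (8, 128, 64)
print(expert_weight_is_col_major(w))               # True
print(expert_weight_is_col_major(w.contiguous()))  # False: row-major copy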