Support W8A8 INT8 MoE for compressed-tensors (#16745)

Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-02 08:03:32 -06:00
parent 99404f53c7
commit 868c546da4
2 changed files with 136 additions and 1 deletion
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -111,7 +111,7 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
        # * dynamic, i_s is None and x_s computed from x.
        # * static, i_s is scalar and x_s is i_s.
        symmetric = azp_adj is None
-        x_q, x_s, x_zp = ops.scaled_int8_quant(x,
+        x_q, x_s, x_zp = ops.scaled_int8_quant(x.contiguous(),
                                               i_s,
                                               i_zp,
                                               symmetric=symmetric)