[Kernels] Support blocked fp8 quantization for compressed tensors MoE (#25219)

Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
bnellnm
2025-09-23 12:11:34 -04:00
committed by GitHub
parent 875d6def90
commit f11e3c516b
4 changed files with 112 additions and 29 deletions

View File

@@ -53,9 +53,9 @@ def _extract_data_from_fused_moe_module(
 """
 assert isinstance(m, FusedMoE)
 w13 = m.w13_weight
-w13_s = m.w13_weight_scale_inv
+w13_s = getattr(m, "w13_weight_scale_inv", m.w13_weight_scale)
 w2 = m.w2_weight
-w2_s = m.w2_weight_scale_inv
+w2_s = getattr(m, "w2_weight_scale_inv", m.w2_weight_scale)
 num_topk = m.top_k
 assert isinstance(w13, torch.Tensor)