[Kernels] Support blocked fp8 quantization for compressed tensors MoE (#25219)

Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
bnellnm
2025-09-23 12:11:34 -04:00
committed by GitHub
parent 875d6def90
commit f11e3c516b
4 changed files with 112 additions and 29 deletions

View File

@@ -53,9 +53,9 @@ def _extract_data_from_fused_moe_module(
 """
 assert isinstance(m, FusedMoE)
 w13 = m.w13_weight
-w13_s = m.w13_weight_scale_inv
+w13_s = getattr(m, "w13_weight_scale_inv", m.w13_weight_scale)
 w2 = m.w2_weight
-w2_s = m.w2_weight_scale_inv
+w2_s = getattr(m, "w2_weight_scale_inv", m.w2_weight_scale)
 num_topk = m.top_k
 assert isinstance(w13, torch.Tensor)