[Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537)

Signed-off-by: Bill Nell <bnell@redhat.com>
2025-09-17 19:43:31 -04:00
parent e6585ddb45
commit 5963b98b46
68 changed files with 2698 additions and 2526 deletions
--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@@ -81,9 +81,14 @@ def _fp8_linear_may_use_deep_gemm(module: torch.nn.Module) -> bool:


 def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
-    if not (isinstance(module, FusedMoE)
-            and module.moe_config.quant_dtype == torch.float8_e4m3fn
-            and module.moe_config.block_shape == deep_gemm_block_shape()):
+    if not isinstance(module, FusedMoE):
+        return False
+
+    moe_quant_config = module.quant_method.get_fused_moe_quant_config(module)
+
+    if (moe_quant_config is None
+            or moe_quant_config.quant_dtype != torch.float8_e4m3fn
+            or moe_quant_config.block_shape != deep_gemm_block_shape()):
        return False

    if not isinstance(module.quant_method.fused_experts,