[Bugfix] Fix Broken ModelOpt NVFP4 MoE (#31742)

Signed-off-by: Robert Shaw <robshaw@redhat.com> Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-01-05 18:18:38 -05:00
parent 776ca1e187
commit f6c0009afa
4 changed files with 32 additions and 15 deletions
--- a/vllm/model_executor/layers/fused_moe/all2all_utils.py
+++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py
@@ -15,9 +15,6 @@ from vllm.model_executor.layers.fused_moe.config import (
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEPrepareAndFinalize,
 )
-from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    build_flashinfer_fp8_cutlass_moe_prepare_finalize,
-)
 from vllm.platforms import current_platform
 from vllm.utils.import_utils import has_deep_ep, has_pplx

@@ -80,17 +77,12 @@ def maybe_make_prepare_finalize(

    prepare_finalize: FusedMoEPrepareAndFinalize | None = None

-    if moe.use_flashinfer_cutlass_kernels:
-        assert quant_config is not None
-        use_deepseek_fp8_block_scale = (
-            quant_config is not None and quant_config.is_block_quantized
-        )
-        prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize(
-            moe=moe,
-            use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
-        )
+    # TODO(rob): update this as part of the MoE refactor.
+    assert not moe.use_flashinfer_cutlass_kernels, (
+        "Must be created in modelopt.py or fp8.py"
+    )

-    elif moe.use_pplx_kernels:
+    if moe.use_pplx_kernels:
        assert quant_config is not None

        hidden_dim_bytes, hidden_scale_bytes = pplx_hidden_dim_scale_bytes(
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
@@ -241,7 +241,9 @@ def flashinfer_cutlass_moe_fp4(
    apply_router_weight_on_input: bool = False,
 ) -> torch.Tensor:
    fused_experts = mk.FusedMoEModularKernel(
-        create_flashinfer_prepare_finalize(use_dp=False),
+        create_flashinfer_prepare_finalize(
+            use_dp=False, use_nvfp4=True, enable_alltoallv=False
+        ),
        FlashInferExperts(
            out_dtype=hidden_states.dtype,
            quant_config=quant_config,
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -48,6 +48,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
    FlashinferMoeBackend,
    apply_flashinfer_per_tensor_scale_fp8,
+    build_flashinfer_fp8_cutlass_moe_prepare_finalize,
    get_flashinfer_moe_backend,
    register_moe_scaling_factors,
    rotate_flashinfer_fp8_moe_weights,
@@ -149,7 +150,7 @@ def get_fp8_moe_backend(
            if block_quant and current_platform.is_device_capability_family(100):
                raise ValueError(
                    "FlashInfer FP8 MoE throughput backend does not "
-                    "support block quantization. Please use "
+                    "support block quantization on SM100. Please use "
                    "VLLM_FLASHINFER_MOE_BACKEND=latency "
                    "instead."
                )
@@ -1102,6 +1103,13 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
        ):
            return None
+        elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
+            prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize(
+                self.moe,
+                use_deepseek_fp8_block_scale=self.block_quant,
+            )
+            logger.debug_once("%s", prepare_finalize.__class__.__name__)
+            return prepare_finalize
        return super().maybe_make_prepare_finalize(routing_tables)

    def select_gemm_impl(
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -46,6 +46,7 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
    FlashinferMoeBackend,
    apply_flashinfer_per_tensor_scale_fp8,
+    build_flashinfer_fp8_cutlass_moe_prepare_finalize,
    flashinfer_cutlass_moe_fp8,
    get_flashinfer_moe_backend,
    is_flashinfer_supporting_global_sf,
@@ -750,6 +751,17 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
        # TRT LLM not supported with all2all yet.
        if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
            return None
+        elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
+            # TP case: avoid convert to ModularKernelMethod - to be refactored.
+            if self.moe.dp_size == 1:
+                return None
+
+            prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize(
+                self.moe,
+                use_deepseek_fp8_block_scale=False,
+            )
+            logger.debug_once("%s", prepare_finalize.__class__.__name__)
+            return prepare_finalize
        return super().maybe_make_prepare_finalize(routing_tables)

    def select_gemm_impl(
@@ -1444,6 +1456,9 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
            self.allow_flashinfer
            and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
        ):
+            # TP case: avoid convert to ModularKernelMethod - to be refactored.
+            if self.moe.dp_size == 1:
+                return None
            # For now, fp4 moe only works with the flashinfer dispatcher.
            prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(
                self.moe