[BugFix] Fix TRT-LLM NVFP4 DP/EP (#32349)

Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
This commit is contained in:
jiahanc
2026-01-19 11:32:24 -08:00
committed by GitHub
parent 9d1e611f0e
commit 7350331718
4 changed files with 21 additions and 11 deletions

View File

@@ -0,0 +1,8 @@
# Accuracy-test configuration for the NVFP4-quantized Qwen3 MoE model
# running with data parallelism + expert parallelism (DP=2, EP enabled).
model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
# Minimum acceptable accuracy score for the run (presumably GSM8K given
# num_questions=1319 / num_fewshot=5 — confirm against the test harness).
accuracy_threshold: 0.88
num_questions: 1319
num_fewshot: 5
# Server launch flags: eager mode, 8K context, 2-way data parallel with
# expert parallelism — the configuration this commit's bug fix targets.
server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
env:
  # Enable the FlashInfer NVFP4 MoE path and select its latency backend
  # (TRTLLM kernels — NOTE(review): verify mapping against vLLM env docs).
  VLLM_USE_FLASHINFER_MOE_FP4: "1"
  VLLM_FLASHINFER_MOE_BACKEND: "latency"

View File

@@ -1,6 +1,7 @@
Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml
Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml
Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml
Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml
Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml

View File

@@ -53,7 +53,6 @@ from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig,
)
from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe
from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import (
aux_stream,
@@ -1761,17 +1760,11 @@ class FusedMoE(CustomOp):
with sp_ctx:
extra_tensors = None
if do_naive_dispatch_combine:
# Avoid circular import
from vllm.model_executor.layers.quantization.modelopt import (
ModelOptNvFp4FusedMoE,
)
post_quant_allgather = (
self.quant_method is not None
and self.dp_size > 1
and self.use_ep
and isinstance(self.quant_method, ModelOptNvFp4FusedMoE)
and has_flashinfer_trtllm_fused_moe()
and getattr(self.quant_method, "do_post_quant_allgather", False)
)
if post_quant_allgather:
hidden_states_to_dispatch, extra_tensors = (

View File

@@ -1564,6 +1564,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
moe_config=self.moe,
)
@property
def do_post_quant_allgather(self):
    """Whether DP allgather should run on already-quantized activations.

    Returns True only when this layer is using the FlashInfer TRTLLM
    NVFP4 MoE backend; callers (the FusedMoE dispatch path) use this to
    decide whether to quantize before the DP allgather/EP dispatch.
    """
    backend = self.nvfp4_backend
    return NvFp4MoeBackend.FLASHINFER_TRTLLM == backend
def prepare_dp_allgather_tensor(
self,
layer: FusedMoE,
@@ -1571,13 +1575,17 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
router_logits: torch.Tensor,
) -> tuple[torch.Tensor, list[torch.Tensor]]:
"""Optionally prepare extra tensors to carry through DP allgather/EP."""
if self.nvfp4_backend != NvFp4MoeBackend.FLASHINFER_TRTLLM:
raise RuntimeError(
"prepare_dp_allgather_tensor is only supported for "
"FlashInfer TRTLLM NVFP4 MoE backend."
)
import flashinfer
assert self.moe_quant_config is not None
a1_gscale = self.moe_quant_config.a1_gscale
hidden_states_fp4, hidden_states_sf = flashinfer.fp4_quantize(
hidden_states,
a1_gscale,
layer.a1_gscale,
is_sf_swizzled_layout=False,
)
extra_tensors: list[torch.Tensor] = [hidden_states_sf]