diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml
new file mode 100644
index 000000000..91a220c4f
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml
@@ -0,0 +1,8 @@
+model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"  # DP=2 + expert parallel
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "latency"  # NOTE(review): presumably the TRTLLM path named in this file's name - confirm
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
index dfa67c76e..7fb1f4968 100644
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
@@ -1,6 +1,7 @@
 Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml
 Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml
 Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
+Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml
 Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
 Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml
 Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index e24d60150..4f6604530 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -53,7 +53,6 @@ from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig,
 )
 from vllm.platforms import current_platform
-from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import (
     aux_stream,
@@ -1761,17 +1760,11 @@ class FusedMoE(CustomOp):
         with sp_ctx:
             extra_tensors = None
             if do_naive_dispatch_combine:
-                # Avoid circular import
-                from vllm.model_executor.layers.quantization.modelopt import (
-                    ModelOptNvFp4FusedMoE,
-                )
-
                 post_quant_allgather = (
                     self.quant_method is not None
                     and self.dp_size > 1
                     and self.use_ep
-                    and isinstance(self.quant_method, ModelOptNvFp4FusedMoE)
-                    and has_flashinfer_trtllm_fused_moe()
+                    and getattr(self.quant_method, "do_post_quant_allgather", False)
                 )
                 if post_quant_allgather:
                     hidden_states_to_dispatch, extra_tensors = (
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 4c9fac39c..ed3f000d9 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1564,6 +1564,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 moe_config=self.moe,
             )
 
+    @property
+    def do_post_quant_allgather(self) -> bool:  # read by FusedMoE via getattr
+        return self.nvfp4_backend == NvFp4MoeBackend.FLASHINFER_TRTLLM
+
     def prepare_dp_allgather_tensor(
         self,
         layer: FusedMoE,
@@ -1571,13 +1575,17 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         router_logits: torch.Tensor,
     ) -> tuple[torch.Tensor, list[torch.Tensor]]:
         """Optionally prepare extra tensors to carry through DP allgather/EP."""
+        if self.nvfp4_backend != NvFp4MoeBackend.FLASHINFER_TRTLLM:
+            raise RuntimeError(
+                "prepare_dp_allgather_tensor is only supported for "
+                "FlashInfer TRTLLM NVFP4 MoE backend."
+            )
+
         import flashinfer
 
-        assert self.moe_quant_config is not None
-        a1_gscale = self.moe_quant_config.a1_gscale
         hidden_states_fp4, hidden_states_sf = flashinfer.fp4_quantize(
             hidden_states,
-            a1_gscale,
+            layer.a1_gscale,
             is_sf_swizzled_layout=False,
         )
         extra_tensors: list[torch.Tensor] = [hidden_states_sf]