diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml new file mode 100644 index 000000000..91a220c4f --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml @@ -0,0 +1,8 @@ +model_name: "nvidia/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel" +env: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_FLASHINFER_MOE_BACKEND: "latency" diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt index dfa67c76e..7fb1f4968 100644 --- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt +++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt @@ -1,6 +1,7 @@ Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml +Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index e24d60150..4f6604530 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -53,7 +53,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, ) from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe from vllm.utils.math_utils import cdiv, round_up from vllm.utils.torch_utils import ( aux_stream, @@ -1761,17 +1760,11 @@ class FusedMoE(CustomOp): with sp_ctx: extra_tensors = None if do_naive_dispatch_combine: - # Avoid circular import - from vllm.model_executor.layers.quantization.modelopt import ( - ModelOptNvFp4FusedMoE, - ) - post_quant_allgather = ( self.quant_method is not None and self.dp_size > 1 and self.use_ep - and isinstance(self.quant_method, ModelOptNvFp4FusedMoE) - and has_flashinfer_trtllm_fused_moe() + and getattr(self.quant_method, "do_post_quant_allgather", False) ) if post_quant_allgather: hidden_states_to_dispatch, extra_tensors = ( diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 4c9fac39c..ed3f000d9 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1564,6 +1564,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): moe_config=self.moe, ) + @property + def do_post_quant_allgather(self): + return self.nvfp4_backend == NvFp4MoeBackend.FLASHINFER_TRTLLM + def prepare_dp_allgather_tensor( self, layer: FusedMoE, @@ -1571,13 +1575,17 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): router_logits: torch.Tensor, ) -> tuple[torch.Tensor, list[torch.Tensor]]: """Optionally prepare extra tensors to carry through DP allgather/EP.""" + if self.nvfp4_backend != NvFp4MoeBackend.FLASHINFER_TRTLLM: + raise RuntimeError( + "prepare_dp_allgather_tensor is only supported for " + "FlashInfer TRTLLM NVFP4 MoE backend." + ) + import flashinfer - assert self.moe_quant_config is not None - a1_gscale = self.moe_quant_config.a1_gscale hidden_states_fp4, hidden_states_sf = flashinfer.fp4_quantize( hidden_states, - a1_gscale, + layer.a1_gscale, is_sf_swizzled_layout=False, ) extra_tensors: list[torch.Tensor] = [hidden_states_sf]