[BugFix] Fix TRT-LLM NVFP4 DP/EP (#32349)

Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
This commit is contained in:
jiahanc
2026-01-19 11:32:24 -08:00
committed by GitHub
parent 9d1e611f0e
commit 7350331718
4 changed files with 21 additions and 11 deletions

View File

@@ -0,0 +1,8 @@
# Accuracy-test configuration for the NVFP4-quantized Qwen3 MoE model
# running with data parallelism + expert parallelism (DP=2, EP enabled).
model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
# Minimum acceptable accuracy score for the run (presumably GSM8K given
# num_questions=1319 / num_fewshot=5 — confirm against the test harness).
accuracy_threshold: 0.88
num_questions: 1319
num_fewshot: 5
# Server launch flags: eager mode, 8K context, 2-way data parallel with
# expert parallelism — the configuration this commit's bug fix targets.
server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
env:
  # Enable the FlashInfer NVFP4 MoE path and select its latency backend
  # (TRTLLM kernels — NOTE(review): verify mapping against vLLM env docs).
  VLLM_USE_FLASHINFER_MOE_FP4: "1"
  VLLM_FLASHINFER_MOE_BACKEND: "latency"

View File

@@ -1,6 +1,7 @@
Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml
Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml
Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml
Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml
Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml

View File

@@ -53,7 +53,6 @@ from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig,
)
from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe
from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import (
aux_stream,
@@ -1761,17 +1760,11 @@ class FusedMoE(CustomOp):
with sp_ctx:
extra_tensors = None
if do_naive_dispatch_combine:
# Avoid circular import
from vllm.model_executor.layers.quantization.modelopt import (
ModelOptNvFp4FusedMoE,
)
post_quant_allgather = (
self.quant_method is not None
and self.dp_size > 1
and self.use_ep
and isinstance(self.quant_method, ModelOptNvFp4FusedMoE)
and has_flashinfer_trtllm_fused_moe()
and getattr(self.quant_method, "do_post_quant_allgather", False)
)
if post_quant_allgather:
hidden_states_to_dispatch, extra_tensors = (

View File

@@ -1564,6 +1564,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
moe_config=self.moe,
)
@property
def do_post_quant_allgather(self):
    """Whether DP allgather should run on already-quantized activations.

    Returns True only when this layer is using the FlashInfer TRTLLM
    NVFP4 MoE backend; callers (the FusedMoE dispatch path) use this to
    decide whether to quantize before the DP allgather/EP dispatch.
    """
    backend = self.nvfp4_backend
    return NvFp4MoeBackend.FLASHINFER_TRTLLM == backend
def prepare_dp_allgather_tensor(
self,
layer: FusedMoE,
@@ -1571,13 +1575,17 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
router_logits: torch.Tensor,
) -> tuple[torch.Tensor, list[torch.Tensor]]:
"""Optionally prepare extra tensors to carry through DP allgather/EP."""
if self.nvfp4_backend != NvFp4MoeBackend.FLASHINFER_TRTLLM:
raise RuntimeError(
"prepare_dp_allgather_tensor is only supported for "
"FlashInfer TRTLLM NVFP4 MoE backend."
)
import flashinfer
assert self.moe_quant_config is not None
a1_gscale = self.moe_quant_config.a1_gscale
hidden_states_fp4, hidden_states_sf = flashinfer.fp4_quantize(
hidden_states,
a1_gscale,
layer.a1_gscale,
is_sf_swizzled_layout=False,
)
extra_tensors: list[torch.Tensor] = [hidden_states_sf]