[Model][gpt-oss] Support DP+EP for GPT-OSS with FlashInfer trtllm-gen MoE (#23819)

commit 95089607fa
parent 1f096f9b95
Author: Po-Han Huang (NVIDIA) <pohanh@nvidia.com>
Date: 2025-08-28 21:56:20 +08:00
Committed by: GitHub

Signed-off-by: Po-Han Huang <pohanh@nvidia.com>

3 changed files with 14 additions and 15 deletions

@@ -623,8 +623,6 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         if should_use_flashinfer_mxfp4():
             from flashinfer import mxfp8_quantize, trtllm_fp4_block_scale_moe
-            assert not self.moe.use_ep, (
-                "EP is not supported for flashinfer mxfp4 moe backend yet.")
             if _should_use_flashinfer_mxfp4_bf16():
                 assert x.dtype == torch.bfloat16
                 x_quant = x
@@ -650,12 +648,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 None, # output1_scale_scalar
                 None, # output1_scale_gate_scalar
                 None, # output2_scale_scalar
-                self.num_experts,
+                global_num_experts,
                 top_k,
                 None, # n_group
                 None, # topk_group
                 self.intermediate_size, # padded to multiple of 256
-                0, # local_expert_offset
+                layer.ep_rank * layer.local_num_experts, # local_expert_offset
                 self.num_experts, # local num experts
                 None,
                 self._get_tile_tokens_dim(x, top_k),
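
For context on the second hunk: under expert parallelism (EP), the router still produces global expert IDs in [0, global_num_experts), while each EP rank only holds a contiguous slice of the expert weights. Passing layer.ep_rank * layer.local_num_experts as local_expert_offset tells the kernel where this rank's slice begins in the global ID space. Below is a minimal, runnable sketch of that indexing, assuming even contiguous sharding of experts across ranks; the function names are illustrative, not vLLM's actual API.

# Illustrative sketch of expert-parallel (EP) expert sharding; assumes even,
# contiguous sharding of experts across EP ranks. Not vLLM's actual code.

def local_expert_offset(ep_rank: int, global_num_experts: int, ep_size: int) -> int:
    """Global ID of the first expert owned by this EP rank."""
    assert global_num_experts % ep_size == 0, "sketch assumes even sharding"
    return ep_rank * (global_num_experts // ep_size)

def owns_expert(global_expert_id: int, ep_rank: int,
                global_num_experts: int, ep_size: int) -> bool:
    """Whether a routed (global) expert ID falls in this rank's local slice."""
    local_num_experts = global_num_experts // ep_size
    offset = local_expert_offset(ep_rank, global_num_experts, ep_size)
    return offset <= global_expert_id < offset + local_num_experts

# Example: 128 experts over 4 EP ranks -> 32 experts per rank. Rank 2 owns
# global experts [64, 96), so its offset is 64 -- the value the diff now
# passes as local_expert_offset instead of the hardcoded 0.
if __name__ == "__main__":
    assert local_expert_offset(2, 128, 4) == 64
    assert owns_expert(70, 2, 128, 4)
    assert not owns_expert(70, 1, 128, 4)

With the hardcoded 0 replaced by a per-rank offset, each EP rank interprets the kernel's expert indices relative to its own shard, which is what allows the EP assertion in the first hunk to be removed for the trtllm-gen MoE path.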