From a5a623d9616337ed53e8fa0e1376e2d17b505e38 Mon Sep 17 00:00:00 2001
From: yzong-rh
Date: Fri, 3 Apr 2026 13:48:17 -0400
Subject: [PATCH] [Bugfix] Re-enable Renormalize routing for TRT-LLM MoE experts (#38859)

Signed-off-by: Yifan Zong
---
 .../layers/fused_moe/experts/trtllm_bf16_moe.py |  7 ++-----
 .../layers/fused_moe/experts/trtllm_fp8_moe.py  | 12 +++++-------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py
index c4d2b96ec..ee6df1af1 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py
@@ -79,11 +79,8 @@ class TrtLlmBf16Experts(mk.FusedMoEExpertsMonolithic):
         RoutingMethodType.Default,
         RoutingMethodType.DeepSeekV3,
         RoutingMethodType.Llama4,
-        # NOTE: TRTLLM Kernel has issue with Qwen3.5 router.
-        # Re-enable once the issue is resolved.
-        # https://github.com/vllm-project/vllm/issues/37591
-        # RoutingMethodType.Renormalize,
-        # RoutingMethodType.RenormalizeNaive
+        RoutingMethodType.Renormalize,
+        RoutingMethodType.RenormalizeNaive,
     ]
 
     @staticmethod
diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
index aa78e10b6..cf46eefa3 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
@@ -277,13 +277,7 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
         weight_key: QuantKey | None,
         activation_key: QuantKey | None,
     ) -> bool:
-        """Monolithic kernels need to express router support.
-        Renormalize/RenormalizeNaive are excluded: the monolithic kernel's
-        internal routing for these methods produces output uncorrelated
-        with the modular kernel's output and with Triton kernel's output
-        for Qwen3.5-35B-A3B-FP8.
-        See: https://github.com/vllm-project/vllm/issues/37591
-        """
+        """Monolithic kernels need to express router support."""
 
         # NOTE(dbari): TopK routing could also be enabled, but need to validate models
         # NOTE(dbari): Default is not implemented and should not be enabled until it is
@@ -295,6 +289,8 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
             return routing_method in [
                 RoutingMethodType.DeepSeekV3,
                 RoutingMethodType.Simulated,
+                RoutingMethodType.Renormalize,
+                RoutingMethodType.RenormalizeNaive,
             ]
         elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
             # NOTE(dbari): as above, potentially allow others here.
@@ -302,6 +298,8 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
                 RoutingMethodType.DeepSeekV3,
                 RoutingMethodType.Llama4,
                 RoutingMethodType.Simulated,
+                RoutingMethodType.Renormalize,
+                RoutingMethodType.RenormalizeNaive,
             ]
         else:
             raise ValueError("Unsupported quantization scheme.")