[Bugfix] Re-enable Renormalize routing for TRT-LLM MoE experts (#38859)

Signed-off-by: Yifan Zong <yzong@redhat.com>
This commit is contained in:
yzong-rh
2026-04-03 13:48:17 -04:00
committed by GitHub
parent f8c3af2d85
commit a5a623d961
2 changed files with 7 additions and 12 deletions

View File

@@ -79,11 +79,8 @@ class TrtLlmBf16Experts(mk.FusedMoEExpertsMonolithic):
             RoutingMethodType.Default,
             RoutingMethodType.DeepSeekV3,
             RoutingMethodType.Llama4,
-            # NOTE: TRTLLM Kernel has issue with Qwen3.5 router.
-            # Re-enable once the issue is resolved.
-            # https://github.com/vllm-project/vllm/issues/37591
-            # RoutingMethodType.Renormalize,
-            # RoutingMethodType.RenormalizeNaive
+            RoutingMethodType.Renormalize,
+            RoutingMethodType.RenormalizeNaive,
         ]

     @staticmethod

View File

@@ -277,13 +277,7 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
         weight_key: QuantKey | None,
         activation_key: QuantKey | None,
     ) -> bool:
-        """Monolithic kernels need to express router support.
-        Renormalize/RenormalizeNaive are excluded: the monolithic kernel's
-        internal routing for these methods produces output uncorrelated
-        with the modular kernel's output and with Triton kernel's output
-        for Qwen3.5-35B-A3B-FP8.
-        See: https://github.com/vllm-project/vllm/issues/37591
-        """
+        """Monolithic kernels need to express router support."""
         # NOTE(dbari): TopK routing could also be enabled, but need to validate models
         # NOTE(dbari): Default is not implemented and should not be enabled until it is
@@ -295,6 +289,8 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
             return routing_method in [
                 RoutingMethodType.DeepSeekV3,
                 RoutingMethodType.Simulated,
+                RoutingMethodType.Renormalize,
+                RoutingMethodType.RenormalizeNaive,
             ]
         elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
             # NOTE(dbari): as above, potentially allow others here.
@@ -302,6 +298,8 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
                 RoutingMethodType.DeepSeekV3,
                 RoutingMethodType.Llama4,
                 RoutingMethodType.Simulated,
+                RoutingMethodType.Renormalize,
+                RoutingMethodType.RenormalizeNaive,
             ]
         else:
             raise ValueError("Unsupported quantization scheme.")