From daa2784bb9333067f3822fb05fc6e5ce459932b4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 3 Feb 2026 05:37:15 -0500 Subject: [PATCH] [Bugfix] Disable RoutingMethodType.[Renormalize,RenormalizeNaive] TRTLLM per-tensor FP8 MoE (#33620) Signed-off-by: mgoin (cherry picked from commit e346e2d056a66bb84287e4fea049bde9a37bd72b) Signed-off-by: Robert Shaw --- .../layers/fused_moe/flashinfer_trtllm_moe.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py index 04336e9d3..0d7473aaf 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py @@ -69,9 +69,14 @@ def _supports_routing_method( RoutingMethodType.RenormalizeNaive, ] elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym): - # NOTE(rob): kernel requires Llama4. - return routing_method == RoutingMethodType.Llama4 - + # NOTE(dbari): as above, potentially allow others here. + return routing_method in [ + RoutingMethodType.Llama4, + # NOTE(mgoin): Disabled to investigate accuracy issues. + # See https://github.com/vllm-project/vllm/issues/33532 + # RoutingMethodType.Renormalize, + # RoutingMethodType.RenormalizeNaive, + ] else: raise ValueError("Unsupported quantization scheme.")