[Bugfix] Disable RoutingMethodType.[Renormalize,RenormalizeNaive] for TRTLLM per-tensor FP8 MoE (#33620)

Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit e346e2d056)
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2026-02-03 05:37:15 -05:00
parent e4bf6ed90d
commit daa2784bb9
1 changed files with 8 additions and 3 deletions
--- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
@@ -69,9 +69,14 @@ def _supports_routing_method(
            RoutingMethodType.RenormalizeNaive,
        ]
    elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
-        # NOTE(rob): kernel requires Llama4.
-        return routing_method == RoutingMethodType.Llama4
-
+        # NOTE(dbari): as above, potentially allow others here.
+        return routing_method in [
+            RoutingMethodType.Llama4,
+            # NOTE(mgoin): Disabled to investigate accuracy issues.
+            # See https://github.com/vllm-project/vllm/issues/33532
+            # RoutingMethodType.Renormalize,
+            # RoutingMethodType.RenormalizeNaive,
+        ]
    else:
        raise ValueError("Unsupported quantization scheme.")