[cherry-pick][Bugfix] Disable monolithic TRTLLM MoE for Renormalize routing (#37591) #37605

Signed-off-by: khluu <khluu000@gmail.com>
khluu committed 2026-03-19 15:06:38 -07:00
parent 89138b21cc
commit bcf2be9612
5 changed files with 42 additions and 5 deletions


@@ -253,23 +253,25 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
         weight_key: QuantKey | None,
         activation_key: QuantKey | None,
     ) -> bool:
-        """Monolithic kernels need to express router support."""
+        """Monolithic kernels need to express router support.
+        Renormalize/RenormalizeNaive are excluded: the monolithic kernel's
+        internal routing for these methods produces output uncorrelated
+        with the modular kernel's output and with Triton kernel's output
+        for Qwen3.5-35B-A3B-FP8.
+        See: https://github.com/vllm-project/vllm/issues/37591
+        """
         # NOTE(dbari): TopK routing could also be enabled, but need to validate models
         # NOTE(dbari): Default is not implemented and should not be enabled until it is
         if (weight_key, activation_key) == (kFp8Static128BlockSym, kFp8Dynamic128Sym):
             # NOTE(rob): potentially allow others here. This is a conservative list.
             return routing_method in [
                 RoutingMethodType.DeepSeekV3,
-                RoutingMethodType.Renormalize,
-                RoutingMethodType.RenormalizeNaive,
             ]
         elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
             # NOTE(dbari): as above, potentially allow others here.
             return routing_method in [
                 RoutingMethodType.DeepSeekV3,
                 RoutingMethodType.Llama4,
-                RoutingMethodType.Renormalize,
-                RoutingMethodType.RenormalizeNaive,
             ]
         else:
             raise ValueError("Unsupported quantization scheme.")
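
For context, the failure mode the new docstring describes, monolithic and modular kernel outputs that are uncorrelated under Renormalize routing, is the kind of regression a cross-backend numerical check catches. The following is a minimal, hypothetical sketch (not vLLM code): run_monolithic_moe and run_modular_moe are placeholder callables standing in for the monolithic TRTLLM path and the modular/Triton path, and the correlation threshold is an assumption.

# Minimal sketch of a cross-kernel agreement check, assuming two placeholder
# callables that each map hidden_states -> MoE output for the same layer.
import torch


def kernels_agree(
    run_monolithic_moe,   # placeholder: (hidden_states: Tensor) -> Tensor
    run_modular_moe,      # placeholder: (hidden_states: Tensor) -> Tensor
    num_tokens: int = 64,
    hidden_size: int = 2048,
    min_cosine: float = 0.99,
) -> bool:
    """Return True if both kernel paths produce correlated outputs."""
    x = torch.randn(num_tokens, hidden_size)
    out_a = run_monolithic_moe(x).float()
    out_b = run_modular_moe(x).float()
    # Per-token cosine similarity; the failure reported in issue #37591 for
    # Renormalize routing would show up as values near zero rather than near one.
    cos = torch.nn.functional.cosine_similarity(out_a, out_b, dim=-1)
    return bool(cos.min().item() >= min_cosine)

With a check like this in a test, Renormalize/RenormalizeNaive could be re-enabled for the monolithic path once the kernel outputs agree again.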