[Bugfix] Re-enable Renormalize routing for TRT-LLM MoE experts (#38859)
Signed-off-by: Yifan Zong <yzong@redhat.com>
This commit is contained in:
@@ -79,11 +79,8 @@ class TrtLlmBf16Experts(mk.FusedMoEExpertsMonolithic):
 RoutingMethodType.Default,
 RoutingMethodType.DeepSeekV3,
 RoutingMethodType.Llama4,
-# NOTE: TRTLLM Kernel has issue with Qwen3.5 router.
-# Re-enable once the issue is resolved.
-# https://github.com/vllm-project/vllm/issues/37591
-# RoutingMethodType.Renormalize,
-# RoutingMethodType.RenormalizeNaive
+RoutingMethodType.Renormalize,
+RoutingMethodType.RenormalizeNaive,
 ]
 
 @staticmethod
@@ -277,13 +277,7 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
 weight_key: QuantKey | None,
 activation_key: QuantKey | None,
 ) -> bool:
-"""Monolithic kernels need to express router support.
-Renormalize/RenormalizeNaive are excluded: the monolithic kernel's
-internal routing for these methods produces output uncorrelated
-with the modular kernel's output and with Triton kernel's output
-for Qwen3.5-35B-A3B-FP8.
-See: https://github.com/vllm-project/vllm/issues/37591
-"""
+"""Monolithic kernels need to express router support."""
 # NOTE(dbari): TopK routing could also be enabled, but need to validate models
 # NOTE(dbari): Default is not implemented and should not be enabled until it is
 
@@ -295,6 +289,8 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
 return routing_method in [
 RoutingMethodType.DeepSeekV3,
 RoutingMethodType.Simulated,
+RoutingMethodType.Renormalize,
+RoutingMethodType.RenormalizeNaive,
 ]
 elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
 # NOTE(dbari): as above, potentially allow others here.
@@ -302,6 +298,8 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
 RoutingMethodType.DeepSeekV3,
 RoutingMethodType.Llama4,
 RoutingMethodType.Simulated,
+RoutingMethodType.Renormalize,
+RoutingMethodType.RenormalizeNaive,
 ]
 else:
 raise ValueError("Unsupported quantization scheme.")
Reference in New Issue
Block a user