Compare commits

..

1 Commits

Author SHA1 Message Date
khluu
bcf2be9612 [cherry-pick][Bugfix] Disable monolithic TRTLLM MoE for Renormalize routing (#37591) (#37605)
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-19 15:06:38 -07:00
5 changed files with 42 additions and 5 deletions

View File

@@ -45,6 +45,22 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
- label: LM Eval Qwen3.5 Models (B200)
timeout_in_minutes: 120
device: b200
optional: true
num_devices: 2
source_file_dependencies:
- vllm/model_executor/models/qwen3_5.py
- vllm/model_executor/models/qwen3_5_mtp.py
- vllm/transformers_utils/configs/qwen3_5.py
- vllm/transformers_utils/configs/qwen3_5_moe.py
- vllm/model_executor/models/qwen3_next.py
- vllm/model_executor/models/qwen3_next_mtp.py
- vllm/model_executor/layers/fla/ops/
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
- label: LM Eval Large Models (H200)
timeout_in_minutes: 60
device: h200

View File

@@ -0,0 +1,8 @@
model_name: "Qwen/Qwen3.5-35B-A3B"
accuracy_threshold: 0.86
num_questions: 1319
num_fewshot: 5
server_args: >-
--max-model-len 4096
--data-parallel-size 2
--enable-expert-parallel

View File

@@ -0,0 +1,9 @@
model_name: "Qwen/Qwen3.5-35B-A3B-FP8"
accuracy_threshold: 0.86
num_questions: 1319
num_fewshot: 5
server_args: >-
--max-model-len 4096
--data-parallel-size 2
--enable-expert-parallel
--kv-cache-dtype fp8

View File

@@ -0,0 +1,2 @@
Qwen3.5-35B-A3B-DEP2.yaml
Qwen3.5-35B-A3B-FP8-DEP2.yaml

View File

@@ -253,23 +253,25 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
weight_key: QuantKey | None,
activation_key: QuantKey | None,
) -> bool:
"""Monolithic kernels need to express router support."""
"""Monolithic kernels need to express router support.
Renormalize/RenormalizeNaive are excluded: the monolithic kernel's
internal routing for these methods produces output uncorrelated
with the modular kernel's output and with Triton kernel's output
for Qwen3.5-35B-A3B-FP8.
See: https://github.com/vllm-project/vllm/issues/37591
"""
# NOTE(dbari): TopK routing could also be enabled, but need to validate models
# NOTE(dbari): Default is not implemented and should not be enabled until it is
if (weight_key, activation_key) == (kFp8Static128BlockSym, kFp8Dynamic128Sym):
# NOTE(rob): potentially allow others here. This is a conservative list.
return routing_method in [
RoutingMethodType.DeepSeekV3,
RoutingMethodType.Renormalize,
RoutingMethodType.RenormalizeNaive,
]
elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
# NOTE(dbari): as above, potentially allow others here.
return routing_method in [
RoutingMethodType.DeepSeekV3,
RoutingMethodType.Llama4,
RoutingMethodType.Renormalize,
RoutingMethodType.RenormalizeNaive,
]
else:
raise ValueError("Unsupported quantization scheme.")