From bcf2be96120005e9aea171927f85055a6a5c0cf6 Mon Sep 17 00:00:00 2001 From: khluu Date: Thu, 19 Mar 2026 15:06:38 -0700 Subject: [PATCH] [cherry-pick][Bugfix] Disable monolithic TRTLLM MoE for Renormalize routing (#37591)#37605 Signed-off-by: khluu --- .buildkite/test_areas/lm_eval.yaml | 16 ++++++++++++++++ .../gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml | 8 ++++++++ .../gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml | 9 +++++++++ .../gsm8k/configs/models-qwen35-blackwell.txt | 2 ++ .../layers/fused_moe/experts/trtllm_fp8_moe.py | 12 +++++++----- 5 files changed, 42 insertions(+), 5 deletions(-) create mode 100644 tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml create mode 100644 tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml create mode 100644 tests/evals/gsm8k/configs/models-qwen35-blackwell.txt diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index 3e2610e70..183dd9d12 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -45,6 +45,22 @@ steps: commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt +- label: LM Eval Qwen3.5 Models (B200) + timeout_in_minutes: 120 + device: b200 + optional: true + num_devices: 2 + source_file_dependencies: + - vllm/model_executor/models/qwen3_5.py + - vllm/model_executor/models/qwen3_5_mtp.py + - vllm/transformers_utils/configs/qwen3_5.py + - vllm/transformers_utils/configs/qwen3_5_moe.py + - vllm/model_executor/models/qwen3_next.py + - vllm/model_executor/models/qwen3_next_mtp.py + - vllm/model_executor/layers/fla/ops/ + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt + - label: LM Eval Large Models (H200) timeout_in_minutes: 60 device: h200 diff --git a/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml b/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml new file mode 100644 index 000000000..62be504e2 --- /dev/null +++ 
b/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml @@ -0,0 +1,8 @@ +model_name: "Qwen/Qwen3.5-35B-A3B" +accuracy_threshold: 0.86 +num_questions: 1319 +num_fewshot: 5 +server_args: >- + --max-model-len 4096 + --data-parallel-size 2 + --enable-expert-parallel diff --git a/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml b/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml new file mode 100644 index 000000000..9380e0b25 --- /dev/null +++ b/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml @@ -0,0 +1,9 @@ +model_name: "Qwen/Qwen3.5-35B-A3B-FP8" +accuracy_threshold: 0.86 +num_questions: 1319 +num_fewshot: 5 +server_args: >- + --max-model-len 4096 + --data-parallel-size 2 + --enable-expert-parallel + --kv-cache-dtype fp8 diff --git a/tests/evals/gsm8k/configs/models-qwen35-blackwell.txt b/tests/evals/gsm8k/configs/models-qwen35-blackwell.txt new file mode 100644 index 000000000..774ae8eb7 --- /dev/null +++ b/tests/evals/gsm8k/configs/models-qwen35-blackwell.txt @@ -0,0 +1,2 @@ +Qwen3.5-35B-A3B-DEP2.yaml +Qwen3.5-35B-A3B-FP8-DEP2.yaml diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py index 1c86702e9..0f40d0be1 100644 --- a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py @@ -253,23 +253,25 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit weight_key: QuantKey | None, activation_key: QuantKey | None, ) -> bool: - """Monolithic kernels need to express router support.""" + """Monolithic kernels need to express router support. + Renormalize/RenormalizeNaive are excluded: the monolithic kernel's + internal routing for these methods produces output uncorrelated + with the modular kernel's output and with the Triton kernel's output + for Qwen3.5-35B-A3B-FP8. 
+ See: https://github.com/vllm-project/vllm/issues/37591 + """ # NOTE(dbari): TopK routing could also be enabled, but need to validate models # NOTE(dbari): Default is not implemented and should not be enabled until it is if (weight_key, activation_key) == (kFp8Static128BlockSym, kFp8Dynamic128Sym): # NOTE(rob): potentially allow others here. This is a conservative list. return routing_method in [ RoutingMethodType.DeepSeekV3, - RoutingMethodType.Renormalize, - RoutingMethodType.RenormalizeNaive, ] elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym): # NOTE(dbari): as above, potentially allow others here. return routing_method in [ RoutingMethodType.DeepSeekV3, RoutingMethodType.Llama4, - RoutingMethodType.Renormalize, - RoutingMethodType.RenormalizeNaive, ] else: raise ValueError("Unsupported quantization scheme.")