Compare commits

..

1 Commits

Author SHA1 Message Date
khluu
bcf2be9612 [cherry-pick][Bugfix] Disable monolithic TRTLLM MoE for Renormalize routing (#37591) (#37605)
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-19 15:06:38 -07:00
5 changed files with 42 additions and 5 deletions

View File

@@ -45,6 +45,22 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
- label: LM Eval Qwen3.5 Models (B200)
timeout_in_minutes: 120
device: b200
optional: true
num_devices: 2
source_file_dependencies:
- vllm/model_executor/models/qwen3_5.py
- vllm/model_executor/models/qwen3_5_mtp.py
- vllm/transformers_utils/configs/qwen3_5.py
- vllm/transformers_utils/configs/qwen3_5_moe.py
- vllm/model_executor/models/qwen3_next.py
- vllm/model_executor/models/qwen3_next_mtp.py
- vllm/model_executor/layers/fla/ops/
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
- label: LM Eval Large Models (H200)
timeout_in_minutes: 60
device: h200

View File

@@ -0,0 +1,8 @@
model_name: "Qwen/Qwen3.5-35B-A3B"
accuracy_threshold: 0.86
num_questions: 1319
num_fewshot: 5
server_args: >-
--max-model-len 4096
--data-parallel-size 2
--enable-expert-parallel

View File

@@ -0,0 +1,9 @@
model_name: "Qwen/Qwen3.5-35B-A3B-FP8"
accuracy_threshold: 0.86
num_questions: 1319
num_fewshot: 5
server_args: >-
--max-model-len 4096
--data-parallel-size 2
--enable-expert-parallel
--kv-cache-dtype fp8

View File

@@ -0,0 +1,2 @@
Qwen3.5-35B-A3B-DEP2.yaml
Qwen3.5-35B-A3B-FP8-DEP2.yaml

View File

@@ -253,23 +253,25 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
weight_key: QuantKey | None,
activation_key: QuantKey | None,
) -> bool:
"""Monolithic kernels need to express router support."""
"""Monolithic kernels need to express router support.
Renormalize/RenormalizeNaive are excluded: the monolithic kernel's
internal routing for these methods produces output uncorrelated
with the modular kernel's output and with Triton kernel's output
for Qwen3.5-35B-A3B-FP8.
See: https://github.com/vllm-project/vllm/issues/37591
"""
# NOTE(dbari): TopK routing could also be enabled, but need to validate models
# NOTE(dbari): Default is not implemented and should not be enabled until it is
if (weight_key, activation_key) == (kFp8Static128BlockSym, kFp8Dynamic128Sym):
# NOTE(rob): potentially allow others here. This is a conservative list.
return routing_method in [
RoutingMethodType.DeepSeekV3,
RoutingMethodType.Renormalize,
RoutingMethodType.RenormalizeNaive,
]
elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
# NOTE(dbari): as above, potentially allow others here.
return routing_method in [
RoutingMethodType.DeepSeekV3,
RoutingMethodType.Llama4,
RoutingMethodType.Renormalize,
RoutingMethodType.RenormalizeNaive,
]
else:
raise ValueError("Unsupported quantization scheme.")