[MoE] Add routing simulation override for MXFP4 quantized MoE (#33595)

Signed-off-by: Jaewon Lee <jaewon@meta.com>
2026-03-12 17:30:44 -07:00
parent 87985077a4
commit aaa3092f51
1 changed file with 6 additions and 0 deletions
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -1109,6 +1109,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
            layer.eplb_state.logical_replica_count,
        ), "MXFP4 are not supported with this configuration."

+        # Apply routing simulation strategy if specified.
+        # This applies to all monolithic backends (SM100_FI and TRITON).
+        routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY
+        if routing_strategy == "uniform_random":
+            router_logits = torch.rand_like(router_logits)
+
        if (
            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
            or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16