Add support for Mistral Large 3 inference with Flashinfer MoE (#33174)

Signed-off-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
Co-authored-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Dimitrios Bariamis
2026-01-31 07:48:27 +01:00
committed by GitHub
parent 73419abfae
commit f0bca83ee4
16 changed files with 1104 additions and 31 deletions
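The substance of the change is visible in the hunks below: the FlashInfer TRT-LLM FP8 per-tensor MoE path previously hard-coded `RoutingMethodType.Llama4` when invoking the fused kernel, and now forwards the layer's own `routing_method_type`, which is what lets a non-Llama4 MoE model such as Mistral Large 3 reach this kernel. A minimal sketch of that dispatch change, runnable without a GPU (the stub enum stands in for FlashInfer's `RoutingMethodType`; `Renormalize` is an assumed member name, only `Llama4` is confirmed by the diff):

```python
from enum import Enum, auto

class RoutingMethodType(Enum):  # stand-in for flashinfer's enum
    Llama4 = auto()
    Renormalize = auto()  # assumed member name

class Layer:  # minimal stand-in for vLLM's FusedMoE layer
    def __init__(self, routing_method_type: RoutingMethodType):
        self.routing_method_type = routing_method_type

def kernel_routing_arg(layer: Layer) -> RoutingMethodType:
    # Before this commit: always RoutingMethodType.Llama4.
    # After: whatever routing method the model configured on the layer.
    return layer.routing_method_type

print(kernel_routing_arg(Layer(RoutingMethodType.Renormalize)).name)
```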


@@ -113,15 +113,19 @@ def apply_fi_trtllm_fp8_per_tensor_moe(
     # Added to the layer by: register_scales_for_trtllm_fp8_per_tensor_moe
     assert (
         hasattr(layer, "output1_scales_scalar")
         and hasattr(layer, "output1_scales_gate_scalar")
         and hasattr(layer, "output2_scales_scalar")
     )
-    is_llama4 = layer.custom_routing_function == Llama4MoE.custom_routing_function
-    assert is_llama4, "FusedMoE flashinfer kernels are only supported for Llama4"
+    if layer.routing_method_type == RoutingMethodType.Llama4:
+        assert (
+            not layer.renormalize
+            and layer.custom_routing_function == Llama4MoE.custom_routing_function
+        ), (
+            "FusedMoE flashinfer kernels with Llama4 routing method are only "
+            "supported for Llama4"
+        )
+    else:
+        assert layer.custom_routing_function is None, (
+            "Custom routing function is only supported for Llama4"
+        )
     return torch.ops.vllm.fi_trtllm_fp8_per_tensor_moe(
         routing_logits=router_logits,
         routing_bias=routing_bias,
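The guard added above splits validation by routing method: a Llama4-routed layer must use `Llama4MoE.custom_routing_function` and must not renormalize, while every other routing method must leave `custom_routing_function` unset. A self-contained restatement of that guard, exercising both branches (sketch only; `llama4_router` and the enum stub are stand-ins, not vLLM or FlashInfer symbols):

```python
from enum import Enum, auto
from types import SimpleNamespace

class RoutingMethodType(Enum):  # stand-in for flashinfer's enum
    Llama4 = auto()
    Renormalize = auto()  # assumed member name

def llama4_router():  # stub for Llama4MoE.custom_routing_function
    raise NotImplementedError

def validate(layer) -> None:
    # Restates the check added in the hunk above (sketch, not vLLM source).
    if layer.routing_method_type is RoutingMethodType.Llama4:
        assert not layer.renormalize
        assert layer.custom_routing_function is llama4_router
    else:
        assert layer.custom_routing_function is None

validate(SimpleNamespace(routing_method_type=RoutingMethodType.Llama4,
                         renormalize=False, custom_routing_function=llama4_router))
validate(SimpleNamespace(routing_method_type=RoutingMethodType.Renormalize,
                         renormalize=True, custom_routing_function=None))
print("both routing configurations pass the guard")
```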
@@ -140,7 +144,7 @@ def apply_fi_trtllm_fp8_per_tensor_moe(
         local_expert_offset=layer.ep_rank * layer.local_num_experts,
         local_num_experts=layer.local_num_experts,
         use_routing_scales_on_input=apply_router_weight_on_input,
-        routing_method_type=RoutingMethodType.Llama4,
+        routing_method_type=layer.routing_method_type,
     )
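With the kernel call now parameterized on `layer.routing_method_type`, the remaining step is enabling the FlashInfer MoE path when serving. A hypothetical launch sketch follows; `VLLM_USE_FLASHINFER_MOE_FP8` is an assumed gating variable (check `vllm/envs.py` for the authoritative name), and the model id is deliberately left elided:

```python
# Hypothetical: opt in to the FlashInfer FP8 MoE path before building
# the engine. The env var name is an assumption, not taken from this diff.
import os

os.environ["VLLM_USE_FLASHINFER_MOE_FP8"] = "1"  # assumed flag name

# from vllm import LLM
# llm = LLM(model="mistralai/...")  # Mistral Large 3 checkpoint id elided
```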