Support Llama 4 for fused_marlin_moe (#20457)
Signed-off-by: mgoin <mgoin64@gmail.com>
@@ -645,10 +645,6 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
                 "EPLB not supported for `GPTQMarlinMoEMethod` yet.")
 
         assert activation == "silu", "Only SiLU activation is supported."
-        if apply_router_weight_on_input:
-            raise NotImplementedError(
-                "Apply router weight on input is not supported for "
-                "fused Marlin MoE method.")
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -672,6 +668,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             topk_weights,
             topk_ids,
             quant_type_id=self.quant_type.id,
+            apply_router_weight_on_input=apply_router_weight_on_input,
             global_num_experts=global_num_experts,
             expert_map=expert_map,
             g_idx1=layer.w13_g_idx,
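For context, a minimal sketch of what the apply_router_weight_on_input flag that this diff threads through to the fused Marlin MoE path controls: whether each token is scaled by its router weight before the expert MLPs run (as Llama 4's top-1 routing expects) rather than after. The helper name maybe_scale_input and the tensor shapes below are illustrative assumptions, not vLLM's internals.

import torch

def maybe_scale_input(hidden_states: torch.Tensor,
                      topk_weights: torch.Tensor,
                      apply_router_weight_on_input: bool) -> torch.Tensor:
    # Illustrative only: when the flag is set, each token is multiplied
    # by its router weight before the expert MLPs run, and the expert
    # outputs are later combined unweighted. Input-side scaling only
    # makes sense for top-1 routing, where a token has a single weight.
    if apply_router_weight_on_input:
        assert topk_weights.size(-1) == 1, "expects top-1 routing"
        return hidden_states * topk_weights.to(hidden_states.dtype)
    # Otherwise the weights are applied to the expert outputs downstream.
    return hidden_states

# Usage: x is [num_tokens, hidden_dim], w is [num_tokens, 1] top-1 weights.
x = torch.randn(4, 8)
w = torch.rand(4, 1)
y = maybe_scale_input(x, w, apply_router_weight_on_input=True)

With the kwarg now forwarded into the kernel call, the method no longer needs to reject this mode up front, which is why the NotImplementedError block is removed in the first hunk.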