Support Llama 4 for fused_marlin_moe (#20457)

Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-04 16:55:10 +09:00
parent 1caca5a589
commit 0e3fe896e2
6 changed files with 11 additions and 17 deletions
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -493,11 +493,6 @@ class AWQMoEMethod(FusedMoEMethodBase):

        assert activation == "silu", "Only SiLU activation is supported."

-        if apply_router_weight_on_input:
-            raise NotImplementedError(
-                "Apply router weight on input is not supported for"
-                "fused Marlin MoE method.")
-
        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
@@ -520,6 +515,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
            topk_weights,
            topk_ids,
            quant_type_id=self.quant_type.id,
+            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            w1_zeros=layer.w13_qzeros,