[Feature][Kernel] FusedMoE LoRA (#21229)

Signed-off-by: wuchen <cntryroa@gmail.com>
Signed-off-by: banjuede <lmklhc@163.com>
Signed-off-by: Chen Wu <cntryroa@gmail.com>
Signed-off-by: Danielle Robinson <dmmaddix@amazon.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: bk-201 <joy25810@foxmail.com>
Co-authored-by: wuchen <wuchen@zetyun.com>
Co-authored-by: Nathan Van Gheem <vangheem@gmail.com>
Co-authored-by: banjuede <lmklhc@163.com>
Co-authored-by: Danielle Robinson <dmmaddix@amazon.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: bk-201 <joy25810@foxmail.com>
Author: Chen Wu
Date: 2025-10-21 11:01:37 +08:00
Committed by: GitHub
Parent: 3ada34f9cb
Commit: 5f6cbf60d6
28 changed files with 2084 additions and 55 deletions

```diff
@@ -2135,13 +2135,18 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             B_bias=self.w2_bias,
         )
-        ops.moe_sum(intermediate_cache3, output)
+        # separate function is required for MoE + LoRA
+        self.moe_sum(intermediate_cache3, output)
+
+    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
+        ops.moe_sum(input, output)
+
 
 def modular_triton_fused_moe(
-    quant_config: FusedMoEQuantConfig,
+    quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None
 ) -> mk.FusedMoEModularKernel:
     return mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         TritonExperts(quant_config),
+        shared_experts,
     )
```
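For context, the hunk above turns the final `ops.moe_sum` call into an overridable `TritonExperts.moe_sum` method, so the MoE + LoRA path can customize only the reduction step while reusing the rest of the Triton expert kernel, and `modular_triton_fused_moe` gains an optional `shared_experts` module that is forwarded to `FusedMoEModularKernel`. Below is a minimal sketch of the override pattern; the `_SketchExperts` and `_LoRASketchExperts` classes and the `lora_delta` attribute are hypothetical illustrations, not code from this PR or from vLLM's LoRA layers.

```python
import torch


class _SketchExperts:
    """Stand-in for TritonExperts; only the moe_sum hook matters here."""

    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        # Baseline: reduce the top-k expert outputs [tokens, topk, hidden]
        # into [tokens, hidden], mirroring what ops.moe_sum does.
        output.copy_(input.sum(dim=1))


class _LoRASketchExperts(_SketchExperts):
    """Hypothetical LoRA-aware subclass that overrides only the reduction."""

    def __init__(self, lora_delta: torch.Tensor) -> None:
        # Precomputed per-token LoRA contribution, shape [tokens, hidden]
        # (illustrative only).
        self.lora_delta = lora_delta

    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        super().moe_sum(input, output)
        # Fold the LoRA correction into the fused-MoE output.
        output += self.lora_delta


tokens, topk, hidden = 4, 2, 8
intermediate = torch.randn(tokens, topk, hidden)
out = torch.empty(tokens, hidden)
_LoRASketchExperts(torch.zeros(tokens, hidden)).moe_sum(intermediate, out)
```

The new `shared_experts` argument follows the same composition idea: a caller that needs shared-expert outputs alongside the routed experts can now hand that module to `modular_triton_fused_moe`, which simply passes it through to `FusedMoEModularKernel`.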