[Feature][Kernel] FusedMoE LoRA (#21229)

Signed-off-by: wuchen <cntryroa@gmail.com>
Signed-off-by: banjuede <lmklhc@163.com>
Signed-off-by: Chen Wu <cntryroa@gmail.com>
Signed-off-by: Danielle Robinson <dmmaddix@amazon.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: bk-201 <joy25810@foxmail.com>
Co-authored-by: wuchen <wuchen@zetyun.com>
Co-authored-by: Nathan Van Gheem <vangheem@gmail.com>
Co-authored-by: banjuede <lmklhc@163.com>
Co-authored-by: Danielle Robinson <dmmaddix@amazon.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: bk-201 <joy25810@foxmail.com>
Author: Chen Wu
Date: 2025-10-21 11:01:37 +08:00
Committed by: GitHub
Parent: 3ada34f9cb
Commit: 5f6cbf60d6
28 changed files with 2084 additions and 55 deletions

```diff
@@ -2135,13 +2135,18 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             B_bias=self.w2_bias,
         )
-        ops.moe_sum(intermediate_cache3, output)
+        # separate function is required for MoE + LoRA
+        self.moe_sum(intermediate_cache3, output)
+
+    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
+        ops.moe_sum(input, output)
+
 
 def modular_triton_fused_moe(
-    quant_config: FusedMoEQuantConfig,
+    quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None
 ) -> mk.FusedMoEModularKernel:
     return mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         TritonExperts(quant_config),
+        shared_experts,
     )
```
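For context, the hunk above turns the final `ops.moe_sum` call into an overridable `TritonExperts.moe_sum` method, so the MoE + LoRA path can customize only the reduction step while reusing the rest of the Triton expert kernel, and `modular_triton_fused_moe` gains an optional `shared_experts` module that is forwarded to `FusedMoEModularKernel`. Below is a minimal sketch of the override pattern; the `_SketchExperts` and `_LoRASketchExperts` classes and the `lora_delta` attribute are hypothetical illustrations, not code from this PR or from vLLM's LoRA layers.

```python
import torch


class _SketchExperts:
    """Stand-in for TritonExperts; only the moe_sum hook matters here."""

    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        # Baseline: reduce the top-k expert outputs [tokens, topk, hidden]
        # into [tokens, hidden], mirroring what ops.moe_sum does.
        output.copy_(input.sum(dim=1))


class _LoRASketchExperts(_SketchExperts):
    """Hypothetical LoRA-aware subclass that overrides only the reduction."""

    def __init__(self, lora_delta: torch.Tensor) -> None:
        # Precomputed per-token LoRA contribution, shape [tokens, hidden]
        # (illustrative only).
        self.lora_delta = lora_delta

    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        super().moe_sum(input, output)
        # Fold the LoRA correction into the fused-MoE output.
        output += self.lora_delta


tokens, topk, hidden = 4, 2, 8
intermediate = torch.randn(tokens, topk, hidden)
out = torch.empty(tokens, hidden)
_LoRASketchExperts(torch.zeros(tokens, hidden)).moe_sum(intermediate, out)
```

The new `shared_experts` argument follows the same composition idea: a caller that needs shared-expert outputs alongside the routed experts can now hand that module to `modular_triton_fused_moe`, which simply passes it through to `FusedMoEModularKernel`.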