[Feature][Kernel]FusedMoE LoRA (#21229)
Signed-off-by: wuchen <cntryroa@gmail.com>
Signed-off-by: banjuede <lmklhc@163.com>
Signed-off-by: Chen Wu <cntryroa@gmail.com>
Signed-off-by: Danielle Robinson <dmmaddix@amazon.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: bk-201 <joy25810@foxmail.com>
Co-authored-by: wuchen <wuchen@zetyun.com>
Co-authored-by: Nathan Van Gheem <vangheem@gmail.com>
Co-authored-by: banjuede <lmklhc@163.com>
Co-authored-by: Danielle Robinson <dmmaddix@amazon.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: bk-201 <joy25810@foxmail.com>
@@ -2135,13 +2135,18 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             B_bias=self.w2_bias,
         )
 
-        ops.moe_sum(intermediate_cache3, output)
+        # separate function is required for MoE + LoRA
+        self.moe_sum(intermediate_cache3, output)
+
+    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
+        ops.moe_sum(input, output)
 
 
 def modular_triton_fused_moe(
-    quant_config: FusedMoEQuantConfig,
+    quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None
 ) -> mk.FusedMoEModularKernel:
     return mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         TritonExperts(quant_config),
+        shared_experts,
     )
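
The hunk above does two things: it moves the final top-k reduction behind a moe_sum method so that an MoE + LoRA path can override just that step, and it lets modular_triton_fused_moe accept an optional shared_experts module that is forwarded into FusedMoEModularKernel. Below is a minimal, self-contained sketch of why the separate method matters; BaseExperts, LoRAExperts, and lora_delta are hypothetical names used only for illustration and are not vLLM APIs, and a plain torch.sum stands in for the real ops.moe_sum kernel.

import torch


class BaseExperts:
    """Hypothetical stand-in for TritonExperts: reduces per-expert outputs."""

    def apply(self, intermediate: torch.Tensor, output: torch.Tensor) -> None:
        # intermediate: [num_tokens, topk, hidden]; output: [num_tokens, hidden]
        # The reduction is a separate method so subclasses can hook it.
        self.moe_sum(intermediate, output)

    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        # Plain reduction over the top-k experts (ops.moe_sum in vLLM).
        torch.sum(input, dim=1, out=output)


class LoRAExperts(BaseExperts):
    """Hypothetical MoE + LoRA variant that overrides only the reduction."""

    def __init__(self, lora_delta: torch.Tensor) -> None:
        # Assumed precomputed LoRA contribution, one row per token.
        self.lora_delta = lora_delta

    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        super().moe_sum(input, output)
        # Fold the LoRA term into the fused-expert result in place.
        output.add_(self.lora_delta)


if __name__ == "__main__":
    tokens, topk, hidden = 4, 2, 8
    cache = torch.randn(tokens, topk, hidden)
    out = torch.empty(tokens, hidden)
    LoRAExperts(lora_delta=torch.zeros(tokens, hidden)).apply(cache, out)
    assert torch.allclose(out, cache.sum(dim=1))

With the new signature shown in the diff, a caller that previously built the kernel with modular_triton_fused_moe(quant_config) can also pass a shared-experts module, e.g. modular_triton_fused_moe(quant_config, shared_experts=shared_module), and that module is handed through to FusedMoEModularKernel.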