[Performance][Kernel] Fused_moe Performance Improvement (#9384)

Signed-off-by: charlifu <charlifu@amd.com>
2024-10-24 17:37:52 -05:00
parent e26d37a185
commit 59449095ab
9 changed files with 118 additions and 38 deletions
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -589,9 +589,8 @@ def fused_experts(hidden_states: torch.Tensor,
                                use_fp8_w8a8=use_fp8_w8a8,
                                use_int8_w8a16=use_int8_w8a16)

-        torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
-                  dim=1,
-                  out=out_hidden_states[begin_chunk_idx:end_chunk_idx])
+        ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.shape),
+                    out_hidden_states[begin_chunk_idx:end_chunk_idx])
    return out_hidden_states