[Performance][Kernel] Fused_moe Performance Improvement (#9384)
Signed-off-by: charlifu <charlifu@amd.com>
This commit is contained in:
@@ -589,9 +589,8 @@ def fused_experts(hidden_states: torch.Tensor,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
use_int8_w8a16=use_int8_w8a16)
|
||||
|
||||
torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
|
||||
dim=1,
|
||||
out=out_hidden_states[begin_chunk_idx:end_chunk_idx])
|
||||
ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.shape),
|
||||
out_hidden_states[begin_chunk_idx:end_chunk_idx])
|
||||
return out_hidden_states
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user