[Kernel][MoE] optimize moe_align_block_size (#29642)

Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Commit: 879ddb09c3 (parent: 1b0482b9d1)
Author: Jinzhen Lin
Committed: 2025-12-07 17:58:47 +08:00 (by GitHub)
10 changed files with 195 additions and 63 deletions

@@ -1887,7 +1887,11 @@ def fused_experts_impl(
         )
         sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
-            curr_topk_ids, config["BLOCK_SIZE_M"], global_num_experts, expert_map
+            curr_topk_ids,
+            config["BLOCK_SIZE_M"],
+            global_num_experts,
+            expert_map,
+            ignore_invalid_experts=True,
         )
         invoke_fused_moe_kernel(
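
The call site above now passes ignore_invalid_experts=True. In expert-parallel runs, expert_map remaps global expert ids to local ones and marks off-rank experts as -1; a plausible reading of the new flag is that token slots routed to such invalid experts are dropped from the sorted, block-padded layout instead of being emitted as -1-tagged blocks the fused kernel must then skip. The pure-Python sketch below illustrates that layout; align_sketch and its internals are illustrative assumptions, not the actual CUDA implementation.

    import torch

    def align_sketch(topk_ids, block_size, num_experts, expert_map=None,
                     ignore_invalid_experts=False):
        # Group (token, expert) assignments into expert-homogeneous blocks of
        # block_size slots, padding each expert's slot list with a sentinel
        # token index (== total number of assignments, i.e. out of range).
        ids = topk_ids.flatten()
        if expert_map is not None:
            ids = expert_map[ids]  # off-rank experts become -1
        buckets = (range(num_experts) if ignore_invalid_experts
                   else [-1, *range(num_experts)])
        sorted_token_ids, expert_ids = [], []
        for e in buckets:  # with the flag set, -1 assignments are never emitted
            tok = (ids == e).nonzero(as_tuple=True)[0].tolist()
            pad = (-len(tok)) % block_size
            tok += [ids.numel()] * pad
            sorted_token_ids += tok
            expert_ids += [e] * (len(tok) // block_size)
        num_tokens_post_padded = torch.tensor([len(sorted_token_ids)])
        return (torch.tensor(sorted_token_ids), torch.tensor(expert_ids),
                num_tokens_post_padded)

    # Example: expert 1 lives on another rank, so its two assignments
    # vanish from the layout when ignore_invalid_experts=True.
    topk = torch.tensor([[0, 1], [2, 1]])
    emap = torch.tensor([0, -1, 1])  # global -> local, -1 = not on this rank
    print(align_sketch(topk, 4, num_experts=2, expert_map=emap,
                       ignore_invalid_experts=True))
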
@@ -1946,6 +1950,9 @@ def fused_experts_impl(
             block_shape=block_shape,
         )
+        if expert_map is not None:
+            intermediate_cache3.zero_()
         invoke_fused_moe_kernel(
             qintermediate_cache2,
             w2,
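
The second hunk zeroes intermediate_cache3 whenever an expert_map is in play. The likely reason: with invalid-expert slots now skipped entirely, the down-projection kernel never writes the output rows belonging to off-rank assignments, so a reused scratch buffer would feed stale values into the final per-token reduction over the top-k experts. A minimal illustration of that hazard follows; the buffer shape, valid_rows, and the partial_write helper are hypothetical stand-ins, not vLLM APIs.

    import torch

    num_slots, hidden = 4, 3
    cache = torch.full((num_slots, hidden), 7.0)  # stale data from a previous chunk

    def partial_write(buf, rows, val):
        buf[rows] = val  # stands in for a kernel that writes only the rows it owns

    valid_rows = torch.tensor([0, 2])  # slots for on-rank experts only
    cache.zero_()                      # the guard added by this commit
    partial_write(cache, valid_rows, 1.0)
    print(cache.sum(dim=0))            # tensor([2., 2., 2.]) -- no stale 7s leak in

Without the zero_() call, the skipped rows would still hold 7.0 and the reduction would silently accumulate them, which is exactly the corruption the added guard prevents.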