[Perf] Optimize moe_align_block_size CUDA kernel (#19572)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Wentao Ye
2025-06-17 14:49:26 -04:00
committed by GitHub
parent ca94d7fa00
commit ffb2cd6b54
7 changed files with 386 additions and 365 deletions

View File

@@ -22,15 +22,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// temporarily adapted from
// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
m.def(
"sgl_moe_align_block_size(Tensor topk_ids, int num_experts,"
" int block_size, Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size);
#ifndef USE_ROCM
m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "