[Model] Add LongCat-Flash (#23991)

Signed-off-by: yangxurui <yangxurui@meituan.com>
Co-authored-by: yangxurui <yangxurui@meituan.com>
This commit is contained in:
XuruiYang
2025-09-25 12:53:40 +08:00
committed by GitHub
parent 90b139cfff
commit 845adb3ec6
31 changed files with 1357 additions and 66 deletions

View File

@@ -44,6 +44,9 @@ __global__ void moe_align_block_size_kernel(
for (size_t i = tid; i < numel; i += stride) {
int expert_id = topk_ids[i];
if (expert_id >= num_experts) {
continue;
}
int warp_idx = expert_id / experts_per_warp;
int expert_offset = expert_id % experts_per_warp;
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
@@ -95,12 +98,15 @@ template <typename scalar_t>
__global__ void count_and_sort_expert_tokens_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
size_t numel) {
size_t numel, int32_t num_experts) {
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (size_t i = tid; i < numel; i += stride) {
int32_t expert_id = topk_ids[i];
if (expert_id >= num_experts) {
continue;
}
int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
sorted_token_ids[rank_post_pad] = i;
}
@@ -269,7 +275,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
}
});
}