Fused MOE for Mixtral (#2542)

Co-authored-by: chen shen <scv119@gmail.com>
This commit is contained in:
Philipp Moritz
2024-01-29 22:43:37 -08:00
committed by GitHub
parent 5d60def02c
commit ab40644669
4 changed files with 114 additions and 108 deletions

View File

@@ -95,7 +95,7 @@ void moe_align_block_size(
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
assert(num_experts <= NUM_MAX_EXPERTS);
VLLM_DISPATCH_INTEGRAL_TYPES(
-topk_ids.scalar_type(), "moe_alig_block_size_kernel", [&] {
+topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
vllm::moe_align_block_size_kernel<scalar_t><<<1, num_experts, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),