DeepseekMoE support with Fused MoE kernel (#2453)

Co-authored-by: roy <jasonailu87@gmail.com>
This commit is contained in:
wangding zeng
2024-01-30 13:19:48 +08:00
committed by GitHub
parent ea8489fce2
commit 5d60def02c
9 changed files with 924 additions and 0 deletions

View File

@@ -56,6 +56,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
// MoE ops
// Registers the C++ function moe_align_block_size on `ops` under the
// Python-visible name "moe_align_block_size"; the final string argument is
// the docstring attached to the binding.
ops.def(
"moe_align_block_size",
&moe_align_block_size,
"Aligning the number of tokens to be processed by each expert such that it is divisible by the block size.");
// Cache ops
pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");