DeepseekMoE support with Fused MoE kernel (#2453)

Co-authored-by: roy <jasonailu87@gmail.com>
2024-01-30 13:19:48 +08:00
parent ea8489fce2
commit 5d60def02c
9 changed files with 924 additions and 0 deletions
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@@ -56,6 +56,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
  ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
+  ops.def(
+      "moe_align_block_size",
+      &moe_align_block_size,
+      "Aligning the number of tokens to be processed by each expert such that it is divisible by the block size.");

  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");