[Kernel] port sgl moe_align_block_size kernels (#12574)

sgl_moe_align_block_size is based on: ded9fcd09a
moe_align_block_size is based on: ba5112ff69

Signed-off-by: Yang Chen <yangche@fb.com>
2025-02-02 21:09:50 -08:00
parent 326fcc8b9f
commit 95460fc513
6 changed files with 284 additions and 3 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -82,6 +82,7 @@ if TYPE_CHECKING:
    VLLM_MLA_DISABLE: bool = False
    VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
    VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
+    VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False


 def get_default_cache_root():
@@ -531,7 +532,13 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    # matrices to match the activation type. This can lead to higher memory and
    # compute usage but better preserves the accuracy of the original model.
    "VLLM_MLA_DISABLE_REQUANTIZATION":
-    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0")))
+    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))),
+
+    # If set to a non-zero integer, vLLM uses the Triton implementation of
+    # moe_align_block_size, i.e. moe_align_block_size_triton in fused_moe.py.
+    "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
+                 ),
 }

 # end-env-vars-definition