[Kernel] port sgl moe_align_block_size kernels (#12574)

sgl_moe_align_block_size is based on upstream commit ded9fcd09a.

moe_align_block_size is based on upstream commit ba5112ff69.

Signed-off-by: Yang Chen <yangche@fb.com>
This commit is contained in:
Yang Chen
2025-02-02 21:09:50 -08:00
committed by GitHub
parent 326fcc8b9f
commit 95460fc513
6 changed files with 284 additions and 3 deletions

View File

@@ -82,6 +82,7 @@ if TYPE_CHECKING:
VLLM_MLA_DISABLE: bool = False
VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
def get_default_cache_root():
@@ -531,7 +532,13 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# matrices to match the activation type. This can lead to higher memory and
# compute usage but better preserves the accuracy of the original model.
"VLLM_MLA_DISABLE_REQUANTIZATION":
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0")))
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))),
# If set to a nonzero integer (e.g. 1), vLLM will use the Triton
# implementation of moe_align_block_size, i.e. moe_align_block_size_triton
# in fused_moe.py. Unset or "0" leaves it disabled.
"VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
),
}
# end-env-vars-definition