[Kernel] port sgl moe_align_block_size kernels (#12574)
sgl_moe_align_block_size is based on: ded9fcd09a; moe_align_block_size is based on: ba5112ff69. Signed-off-by: Yang Chen <yangche@fb.com>
This commit is contained in:
@@ -82,6 +82,7 @@ if TYPE_CHECKING:
|
||||
VLLM_MLA_DISABLE: bool = False
|
||||
VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
|
||||
VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
|
||||
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
|
||||
|
||||
|
||||
def get_default_cache_root():
|
||||
@@ -531,7 +532,13 @@ environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
# matrices to match the activation type. This can lead to higher memory and
|
||||
# compute usage but better preserves the accuracy of the original model.
|
||||
"VLLM_MLA_DISABLE_REQUANTIZATION":
|
||||
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0")))
|
||||
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))),
|
||||
|
||||
# If set, vLLM will use the Triton implementation of moe_align_block_size,
|
||||
# i.e. moe_align_block_size_triton in fused_moe.py.
|
||||
"VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
|
||||
lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
|
||||
),
|
||||
}
|
||||
|
||||
# end-env-vars-definition
|
||||
|
||||
Reference in New Issue
Block a user