Optimize moe_align_block_size for deepseek_v3 (#12850)

Signed-off-by: mgoin <mgoin64@gmail.com>
2025-02-13 18:43:37 -05:00
parent bffddd9a05
commit 2344192a55
2 changed files with 38 additions and 15 deletions
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -596,7 +596,7 @@ def moe_align_block_size(
                                      dtype=torch.int32,
                                      device=topk_ids.device)
    if num_experts >= 224:
-        if envs.VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON:
+        if envs.VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON or num_experts != 256:
            moe_align_block_size_triton(
                topk_ids,
                num_experts,
@@ -606,6 +606,7 @@ def moe_align_block_size(
                num_tokens_post_pad,
            )
        else:
+            # sgl_moe_align_block_size currently requires num_experts == 256
            ops.sgl_moe_align_block_size(
                topk_ids,
                num_experts,