Using max_loras + 1 to construct grid in fused_moe_lora (#32277)

Signed-off-by: Yu Gong <yu3.gong@gmail.com>
This commit is contained in:
yugong333
2026-01-24 09:39:30 -08:00
committed by GitHub
parent 203d0bc0c2
commit d4dbb7af63

View File

@@ -104,7 +104,10 @@ def _fused_moe_lora_kernel(
if moe_enabled == 0:
    # Early exit for the no moe lora case.
    return
-max_loras = tl.num_programs(axis=2)
+# The grid size on axis 2 is (max_loras + 1) to handle the no-lora case
+# (lora_id == -1), but sorted_token_ids and expert_ids are allocated with
+# shape (max_loras, ...). Use (num_programs - 1) for correct bounds checking.
+max_loras = tl.num_programs(axis=2) - 1
grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)
# calculate pid_m,pid_n
@@ -255,7 +258,8 @@ def _fused_moe_lora_shrink(
* triton.cdiv(EM, META["BLOCK_SIZE_M"])
* triton.cdiv(N, META["BLOCK_SIZE_N"]),
len(lora_a_stacked),
-lora_a_stacked[0].shape[0],
+## max_loras + 1 to handle the no-lora case (lora_id == -1)
+lora_a_stacked[0].shape[0] + 1,
)
_fused_moe_lora_kernel[grid](
qcurr_hidden_states,
@@ -355,7 +359,8 @@ def _fused_moe_lora_expand(
grid = lambda META: (
triton.cdiv(EM, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
len(lora_b_stacked),
-lora_b_stacked[0].shape[0],
+## max_loras + 1 to handle the no-lora case (lora_id == -1)
+lora_b_stacked[0].shape[0] + 1,
)
_fused_moe_lora_kernel[grid](
a_intermediate_cache1,