From d4dbb7af632e16e611f496bbbafbc0c276e48ac8 Mon Sep 17 00:00:00 2001 From: yugong333 Date: Sat, 24 Jan 2026 09:39:30 -0800 Subject: [PATCH] Using max_loras + 1 to construct grid in fused_moe_lora (#32277) Signed-off-by: Yu Gong --- vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py index 9376b4e6d..35939b979 100644 --- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py +++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py @@ -104,7 +104,10 @@ def _fused_moe_lora_kernel( if moe_enabled == 0: # Early exit for the no moe lora case. return - max_loras = tl.num_programs(axis=2) + # The grid size on axis 2 is (max_loras + 1) to handle the no-lora case + # (lora_id == -1), but sorted_token_ids and expert_ids are allocated with + # shape (max_loras, ...). Use (num_programs - 1) for correct bounds checking. + max_loras = tl.num_programs(axis=2) - 1 grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K) # calculate pid_m,pid_n @@ -255,7 +258,8 @@ def _fused_moe_lora_shrink( * triton.cdiv(EM, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), len(lora_a_stacked), - lora_a_stacked[0].shape[0], + ## max_loras + 1 to handle the no-lora case (lora_id == -1) + lora_a_stacked[0].shape[0] + 1, ) _fused_moe_lora_kernel[grid]( qcurr_hidden_states, @@ -355,7 +359,8 @@ def _fused_moe_lora_expand( grid = lambda META: ( triton.cdiv(EM, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), len(lora_b_stacked), - lora_b_stacked[0].shape[0], + ## max_loras + 1 to handle the no-lora case (lora_id == -1) + lora_b_stacked[0].shape[0] + 1, ) _fused_moe_lora_kernel[grid]( a_intermediate_cache1,