diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py index 9376b4e6d..35939b979 100644 --- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py +++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py @@ -104,7 +104,10 @@ def _fused_moe_lora_kernel( if moe_enabled == 0: # Early exit for the no moe lora case. return - max_loras = tl.num_programs(axis=2) + # The grid size on axis 2 is (max_loras + 1) to handle the no-lora case + # (lora_id == -1), but sorted_token_ids and expert_ids are allocated with + # shape (max_loras, ...). Use (num_programs - 1) for correct bounds checking. + max_loras = tl.num_programs(axis=2) - 1 grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K) # calculate pid_m,pid_n @@ -255,7 +258,8 @@ def _fused_moe_lora_shrink( * triton.cdiv(EM, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), len(lora_a_stacked), - lora_a_stacked[0].shape[0], + # max_loras + 1 to handle the no-lora case (lora_id == -1) + lora_a_stacked[0].shape[0] + 1, ) _fused_moe_lora_kernel[grid]( qcurr_hidden_states, @@ -355,7 +359,8 @@ def _fused_moe_lora_expand( grid = lambda META: ( triton.cdiv(EM, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), len(lora_b_stacked), - lora_b_stacked[0].shape[0], + # max_loras + 1 to handle the no-lora case (lora_id == -1) + lora_b_stacked[0].shape[0] + 1, ) _fused_moe_lora_kernel[grid]( a_intermediate_cache1,