From 6d80ae83e1455cb0e47196cea557398fde0f03d1 Mon Sep 17 00:00:00 2001
From: Burkhard Ringlein <ngl@zurich.ibm.com>
Date: Wed, 3 Sep 2025 17:01:09 +0200
Subject: [PATCH] [Bugfix] Fixing division by zero in triton_attn if
 query_heads/kv_heads > 16  (#23424)

Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com>
---
 vllm/attention/ops/triton_unified_attention.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py
index 56ebed0f5..250e9b389 100644
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -674,7 +674,8 @@ def unified_attention(
     num_queries_per_kv = num_query_heads // num_kv_heads
     head_size = q.shape[2]
 
-    BLOCK_M = 16
+    BLOCK_M = 16 if num_queries_per_kv <= 16 else triton.next_power_of_2(
+        num_queries_per_kv)
     BLOCK_Q = BLOCK_M // num_queries_per_kv
 
     # Ideally we would launch with kernel with: