From 2b76b691cbaad7fae6ab7daa4e655ba1b2da4d8f Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Sun, 24 May 2026 23:29:59 +0000
Subject: [PATCH] fix: block_idx() returns an (x, y, z) tuple; unpack it to get y

---
 dsv4/kernels/attention/fmha.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py
index bd07340f..930bca32 100644
--- a/dsv4/kernels/attention/fmha.py
+++ b/dsv4/kernels/attention/fmha.py
@@ -177,7 +177,7 @@ class FmhaKernel:
         sP = smem.allocate_tensor(element_type=self.q_dtype,layout=_p_layout,byte_alignment=128,swizzle=_p_swizzle)
 
         # D2: Multi-CTA grid. Use block_idx_y to select Q and O for this CTA's head.
-        head_cta_idx = cute.arch.block_idx(dim=1)  # block_idx_y
+        _bidx, head_cta_idx, _bidz = cute.arch.block_idx()  # grid=(1, num_ctas, 1)
 
         # Q: if num_ctas > 1, mQ has a head dimension. local_tile indexes into it.
         # K/V: shared (MQA), always coordinate 0.