From f3a7dc15981ad6b169cccab245ecfb2db2060d0e Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Thu, 21 May 2026 07:13:14 +0000
Subject: [PATCH] FOOTGUN #0: num_tma_load_bytes MUST include V bytes. Fix v27,
 v29, comment all. Update README.

---
 README.md                 | 37 +++++++++++++++++++++++++++++++------
 tests/test_pv_diag.py     |  1 +
 tests/test_stage_b_v27.py |  8 +++++++-
 tests/test_stage_b_v29.py |  8 +++++++-
 4 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index f6157b4a..62608747 100644
--- a/README.md
+++ b/README.md
@@ -151,15 +151,15 @@ The tmem allocation barrier has `num_threads = 32 * (mma_warp + epilogue_warps)`
 
 ---
 
-## ⛔ NEW DEADLOCK: (128,64) PV MMA Epilogue — UNRESOLVED (May 21, 06:45 UTC)
+## ✅ DEADLOCK FIX #4: num_tma_load_bytes Must Include V Bytes (May 21, 07:10 UTC)
 
-v28 (128,64 PV MMA) and v29 (padded V 128x128) both deadlock inside `epilogue_tma_store`.
+**Root cause:** `num_tma_load_bytes` only accounted for the Q and K bytes, not V. The TMA barrier's expected transaction byte count (tx-count) therefore underflowed when the V bytes arrived, wrapping the 20-bit counter. The barrier never reached zero, so the MMA warp waited forever.
 
-**Not caused by the three known deadlock fixes** — all are applied.
+**Fix:** Add `cute.size_in_bytes(self.b_dtype, v_smem)` to `num_tma_load_bytes`.
 
-**Symptom**: `epilogue_tma_store` blocks on `acc_pipeline.consumer_wait()`. MMA warp appears stuck at `mma_si_prod.acquire_and_advance()` (2nd acquire, waiting for softmax). But EPI warps complete softmax successfully.
+**Why it was sneaky:** `test_pv_diag.py` had the same bug but ran fine because its V (the 128×128 identity matrix) was small enough that the race was benign. v29, with a larger or differently strided V, exposed it.
 
-**Bisect needed**: test_pv_diag.py (V=I 128x128, WORKS) → v29 (V=I 128x128, DEADLOCKS). Same input, same MMA configuration, same pipeline. Difference must be a subtle code issue.
+**Remaining issue with v28 (128,64 PV MMA):** The (128,64) PV MMA itself still needs investigation — the softmax-to-PV TMEM alias must be adapted for the different A-fragment layout.
 
 ---
 
@@ -169,7 +169,32 @@ v21 attempted both Bug 1 and Bug 2 fixes in a hand-rolled pipeline kernel. It de
 
 ---
 
-## ⛔ FOOTGUNS — CUTLASS CuTeDSL Landmines
+## ⛔⛔⛔ FOOTGUNS — CUTLASS CuTeDSL Landmines ⛔⛔⛔
+
+### 🔴🔴🔴 0. num_tma_load_bytes MUST Include ALL TMA-Loaded Tensors (Q + K + V) — DEADLOCK IF MISSING
+
+**This is the #1 landmine. It cost us hours of debugging.**
+
+`PipelineTmaUmma.create()` takes a `tx_count` argument (we pass `num_tma_load_bytes`) that tells the TMA barrier how many bytes to expect. If you load Q, K, and V via TMA but only budget the Q + K bytes, the barrier's tx-count underflows when V's bytes arrive. On SM100 the 20-bit tx-count wraps, the barrier never reaches zero, and the consumer (the MMA warp) waits **forever**.
+
+```python
+# ❌ WRONG — missing V bytes → DEADLOCK
+self.num_tma_load_bytes = (
+    cute.size_in_bytes(self.q_dtype, a_smem) + cute.size_in_bytes(self.b_dtype, b_smem)
+) * cute.size(qk_mma.thr_id.shape)
+
+# ✅ CORRECT — include ALL three TMA loads
+v_smem = cute.slice_(self.v_smem_s, (None, None, None, 0))
+self.num_tma_load_bytes = (
+    cute.size_in_bytes(self.q_dtype, a_smem)
+    + cute.size_in_bytes(self.b_dtype, b_smem)
+    + cute.size_in_bytes(self.b_dtype, v_smem)  # ← DO NOT FORGET THIS
+) * cute.size(qk_mma.thr_id.shape)
+```
+
+**Why it's sneaky:** With small V tensors, the race might be benign (V completes before the consumer reads). With larger V, the underflow leaves the barrier permanently stuck, so the same kernel can pass small tests and deadlock on larger ones.
+
+---
 
 ### 1. St32x32bOp with 16-bit dtype → ILLEGAL MEMORY ACCESS
 
diff --git a/tests/test_pv_diag.py b/tests/test_pv_diag.py
index 80184627..2c32228b 100644
--- a/tests/test_pv_diag.py
+++ b/tests/test_pv_diag.py
@@ -100,6 +100,7 @@ class PvDiagKernel:
         tCtO_fake = pv_mma.make_fragment_C(cute.append(pv_acc_shape, self.num_acc_stage))
         self.num_tmem_alloc_cols = utils.get_num_tmem_alloc_cols([tCtS_fake, tCtO_fake], arch="sm_100")
 
+        # ⛔⛔⛔ CRITICAL: num_tma_load_bytes MUST include ALL TMA-loaded tensors (Q + K + V). Missing V → DEADLOCK. See FOOTGUN #0 in README.
         a_smem = cute.slice_(self.a_smem_s, (None, None, None, 0))
         b_smem = cute.slice_(self.b_smem_s, (None, None, None, 0))
         v_smem = cute.slice_(self.v_smem_s, (None, None, None, 0))
diff --git a/tests/test_stage_b_v27.py b/tests/test_stage_b_v27.py
index c5292f5a..c544d363 100644
--- a/tests/test_stage_b_v27.py
+++ b/tests/test_stage_b_v27.py
@@ -71,10 +71,16 @@ class StageBIdentitySoftmax:
         tCtO_fake = pv_mma.make_fragment_C(cute.append(pv_acc_shape, self.num_acc_stage))
         self.num_tmem_alloc_cols = utils.get_num_tmem_alloc_cols([tCtS_fake, tCtO_fake], arch="sm_100")
 
+        # ⛔⛔⛔ CRITICAL: num_tma_load_bytes MUST include ALL TMA-loaded tensors (Q + K + V).
+        # Missing V bytes causes TMA barrier tx-count underflow → DEADLOCK.
+        # See FOOTGUN #0 in README.
         a_smem = cute.slice_(self.a_smem_s, (None, None, None, 0))
         b_smem = cute.slice_(self.b_smem_s, (None, None, None, 0))
+        v_smem = cute.slice_(self.v_smem_s, (None, None, None, 0))
         self.num_tma_load_bytes = (
-            cute.size_in_bytes(self.q_dtype, a_smem) + cute.size_in_bytes(self.b_dtype, b_smem)
+            cute.size_in_bytes(self.q_dtype, a_smem)
+            + cute.size_in_bytes(self.b_dtype, b_smem)
+            + cute.size_in_bytes(self.b_dtype, v_smem)  # ← DO NOT FORGET V
         ) * cute.size(qk_mma.thr_id.shape)
 
     @cute.jit
diff --git a/tests/test_stage_b_v29.py b/tests/test_stage_b_v29.py
index bafc1283..502db5af 100644
--- a/tests/test_stage_b_v29.py
+++ b/tests/test_stage_b_v29.py
@@ -66,10 +66,16 @@ class StageBIdentitySoftmax:
         tCtO_fake = pv_mma.make_fragment_C(cute.append(pv_acc_shape, self.num_acc_stage))
         self.num_tmem_alloc_cols = utils.get_num_tmem_alloc_cols([tCtS_fake, tCtO_fake], arch="sm_100")
 
+        # ⛔⛔⛔ CRITICAL: num_tma_load_bytes MUST include ALL TMA-loaded tensors (Q + K + V).
+        # Missing V bytes causes TMA barrier tx-count underflow → DEADLOCK.
+        # See FOOTGUN #0 in README.
         a_smem = cute.slice_(self.a_smem_s, (None, None, None, 0))
         b_smem = cute.slice_(self.b_smem_s, (None, None, None, 0))
+        v_smem = cute.slice_(self.v_smem_s, (None, None, None, 0))
         self.num_tma_load_bytes = (
-            cute.size_in_bytes(self.q_dtype, a_smem) + cute.size_in_bytes(self.b_dtype, b_smem) + cute.size_in_bytes(self.b_dtype, cute.slice_(self.v_smem_s, (None, None, None, 0)))
+            cute.size_in_bytes(self.q_dtype, a_smem)
+            + cute.size_in_bytes(self.b_dtype, b_smem)
+            + cute.size_in_bytes(self.b_dtype, v_smem)  # ← DO NOT FORGET V
         ) * cute.size(qk_mma.thr_id.shape)
 
     @cute.jit