From bfacfeca7bcb2e3d7e6e306c740bf6578cc69aca Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Sat, 23 May 2026 05:45:58 +0000
Subject: [PATCH] =?UTF-8?q?Rename=20FmhaV3StageC=20=E2=86=92=20FmhaKernel?=
 =?UTF-8?q?=20=E2=80=94=20no=20dev=20stage=20artifacts=20in=20production?=
 =?UTF-8?q?=20API?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                      | 10 ++++------
 dsv4/kernels/attention/fmha.py |  2 +-
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 09773ff3..4e8a679d 100644
--- a/README.md
+++ b/README.md
@@ -144,7 +144,7 @@ Summary
 |-------|--------|-------------|
 | A | ✅ COMPLETE | Q@K^T via tcgen05.mma → TMEM → GMEM |
 | B | ✅ COMPLETE | QK → identity softmax → P@V pipeline (TMEM alias, KV-tile interleaving) |
-| C | ✅ MIGRATED TO MODULE | Real online softmax + normalize. n=128 cos 0.973. Migrated to `dsv4/kernels/attention/fmha.py` as `FmhaV3StageC`. TMEM layout mismatch still present (3% error). |
+| C | ✅ MIGRATED TO MODULE | Real online softmax + normalize. n=128 cos 0.973. Migrated to `dsv4/kernels/attention/fmha.py` as `FmhaKernel`. TMEM layout mismatch still present (3% error). |
 | D1 | 🔨 IN PROGRESS | Parameterize HEAD_DIM (64 → 512). SMEM-P path for hd>64 (register→SMEM copy TODO). |
 | D2 | TODO | Multi-query grid with head packing (128 Q heads, MQA) |
 | D3 | TODO | SWA sequence length mask (swa_lens per batch) |
@@ -160,7 +160,7 @@ Summary
 dsv4/
 ├── kernels/          Pure GPU code (CuTeDSL @cute.jit, .cu files)
 │   ├── gemm/           NVFP4 MoE GEMM kernels (grouped, fused_swiglu, dense, scheduler)
-│   ├── attention/      FMHA kernel — FmhaV3StageC (migrated from tests), SMEM-P stub
+│   ├── attention/      FMHA kernel — FmhaKernel (hd=64, TMEM-P proven; SMEM-P stub for hd>64)
 │   ├── compressor/     CSA/HCA token-level compressor (CuTeDSL, 419 lines)
 │   ├── indexer/        CSA indexer — score+topk (FP32 dot products, top-k selection)
 │   ├── router/         Dense router decode kernel (warp-specialized persistent GEMM)
@@ -228,7 +228,7 @@ dsv4/
 |------|-------|--------|
 | `test_fmha_v3.py` | A+B | ✅ Full QK→identity softmax→PV, cosine 0.999999 |
 | `test_fmha_v3_12w.py` | A+B | ✅ 12-warp QK→PV, cosine 0.999999 |
-| `test_fmha_v3_stage_c.py` | C | ✅ Real online softmax + normalize, n=128 cos 0.973. **Also migrated to `dsv4/kernels/attention/fmha.py` as `FmhaV3StageC`.** |
+| `test_fmha_v3_stage_c.py` | C | ✅ Real online softmax + normalize, n=128 cos 0.973. **Also in `dsv4/kernels/attention/fmha.py` as `FmhaKernel`.** |
 | `test_fmha_v3_stage_d1.py` | D1 | 🔨 Parameterized hd + SMEM-P path (WIP) |
 | `test_d1_*.py` | D1 | 🔨 Debug/diagnostic variants (hd512, regression, sweep, raw, debug) |
 | `test_paired_epilog.py` | C | ✅ Paired atom epilogue experiments |
@@ -532,7 +532,7 @@ When implementing D5a, Stage C's epilogue changes from "multiply by 1/row_sum" t
 
 ### E1 — File placement
 
-`dsv4/kernels/attention/fmha.py`. Currently contains `FmhaV3StageC` (exact migration from test). Will become `FmhaKernel` once D1 parameterization is complete and the SMEM-P path is working. Constructor takes all dimensions and dtypes, no module-level constants.
+`dsv4/kernels/attention/fmha.py`. Currently contains `FmhaKernel` (migrated from test, hd=64 TMEM-P). Will gain parameterized `head_dim` and SMEM-P path in D1. The target constructor takes all dimensions and dtypes, with no module-level constants (the module currently still defines `HEAD_DIM = 64`).
 
 ### E2 — Constructor signature
 
@@ -557,8 +557,6 @@ class FmhaKernel:
 
 All architecture-level shapes from config flow into the constructor. No FMHA-internal magic numbers.
 
-**Naming convention:** The class will be `FmhaKernel` once D1 is complete (replacing the current `FmhaV3StageC`). The progression: `FmhaV3StageC` (hd=64, TMEM-P only) → `FmhaKernel` (parameterized hd, TMEM-P + SMEM-P). The old name stays in the test file for regression.
-
 ### E3 — Call signature
 
 ```python
diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py
index d065e47d..1f7115de 100644
--- a/dsv4/kernels/attention/fmha.py
+++ b/dsv4/kernels/attention/fmha.py
@@ -16,7 +16,7 @@ import math
 HEAD_DIM = 64
 
 
-class FmhaV3StageC:
+class FmhaKernel:
     def __init__(self, s_k=128, scale_softmax=None):
         self.s_k = s_k
         self.n_kv_tiles = s_k // 128