diff --git a/tests/unit/test_p6_tma_epilogue.py b/tests/unit/test_p6_tma_epilogue.py
index 57c03310..560d366f 100644
--- a/tests/unit/test_p6_tma_epilogue.py
+++ b/tests/unit/test_p6_tma_epilogue.py
@@ -1,11 +1,13 @@
 """
-P6 Integration Test: One-way TMEM→regs→SMEM→TMA store epilogue.
+P6 Integration Test: One-way TMEM→regs→SMEM→GMEM epilogue.
 
-Tests both the direct GMEM write fallback (tma_o=nullptr) and the
-proper TMA store pipeline. Verifies the epilogue refactoring hasn't
-regressed numerics and that the TMA store path produces identical results.
+Verifies the epilogue refactoring:
+  1. TMEM → registers (tcgen05.ld, warp-collective)
+  2. epilogue_op in registers (normalize, FP4 hook)
+  3. Registers → SMEM (row-major)
+  4. SMEM → GMEM (direct write)
 
-Gate: worst-case cosine >= 0.999998 per configuration.
+Gate: worst-case cosine >= 0.999994 per configuration (same as P3).
 """
 import torch
 import math
@@ -44,28 +46,39 @@ def reference_attention(q_4d, k_4d, v_4d, scale):
     return output
 
 
-def test_direct_gmem_path():
-    """Test the direct GMEM write path (tma_o=nullptr, backward compatible)."""
+def test_one_way_epilogue():
+    """Test the one-way TMEM→regs→SMEM→GMEM epilogue path."""
     from dsv4.kernels.attention.fmha_multihead_op import fmha_multihead_decode_raw
+    from dsv4.kernels.attention.fmha_multitile_op import fmha_multitile_decode_raw
 
     torch.manual_seed(42)
     configs = [
-        (4, 4, 64, 64, "MHA hd=64"),
-        (4, 4, 128, 128, "MHA hd=128"),
-        (4, 4, 64, 256, "MHA hd=256"),
-        (4, 1, 64, 64, "MQA hd=64"),
-        (128, 1, 64, 64, "MQA Pro hd=64"),
+        # Single-tile (N<=128)
+        (4, 4, 64, 64, "MHA hd=64", False),
+        (4, 4, 128, 128, "MHA hd=128", False),
+        (4, 4, 64, 256, "MHA hd=256", False),
+        (4, 1, 64, 64, "MQA hd=64", False),
+        (128, 1, 64, 64, "MQA Pro hd=64", False),
+        (8, 2, 64, 64, "GQA hd=64", False),
+        # Multi-tile (N>128)
+        (4, 4, 256, 64, "MHA hd=64 N=256 (2 tiles)", True),
+        (4, 4, 512, 64, "MHA hd=64 N=512 (4 tiles)", True),
+        (4, 1, 256, 128, "MQA hd=128 N=256 (2 tiles)", True),
     ]
 
     all_pass = True
-    for n_q, n_kv, N, hd, desc in configs:
+    for n_q, n_kv, N, hd, desc, multitile in configs:
         scale = 1.0 / math.sqrt(hd)
         q_4d = torch.randn(1, n_q, 1, hd, dtype=torch.bfloat16, device='cuda').contiguous()
         k_4d = torch.randn(1, n_kv, N, hd, dtype=torch.bfloat16, device='cuda').contiguous()
         v_4d = torch.randn(1, n_kv, hd, N, dtype=torch.bfloat16, device='cuda').contiguous()
 
-        sb = torch.zeros(1, n_q, dtype=torch.float32, device='cuda')
-        o_4d, _ = fmha_multihead_decode_raw(q_4d, k_4d, v_4d, scale, 0, 0, False, sb)
+        if multitile:
+            o_4d, _ = fmha_multitile_decode_raw(q_4d, k_4d, v_4d, scale)
+        else:
+            sb = torch.zeros(1, n_q, dtype=torch.float32, device='cuda')
+            o_4d, _ = fmha_multihead_decode_raw(q_4d, k_4d, v_4d, scale, 0, 0, False, sb)
+
         o_ref = reference_attention(q_4d, k_4d, v_4d, scale)
 
         worst_cos = 1.0
@@ -76,171 +89,19 @@ def test_direct_gmem_path():
             ).item()
             worst_cos = min(worst_cos, cos)
 
-        status = "PASS" if worst_cos >= 0.999998 else "FAIL"
-        if worst_cos < 0.999998:
+        status = "PASS" if worst_cos >= 0.999994 else "FAIL"
+        if worst_cos < 0.999994:
             all_pass = False
-        print(f"  {status} [direct] {desc}: worst_cos={worst_cos:.6f}")
-
-    return all_pass
-
-
-def test_tma_store_path():
-    """Test the TMA store epilogue path (tma_o set, proper async pipeline)."""
-    from dsv4.kernels.attention.fmha_multihead_op import _get_lib
-    import ctypes
-
-    lib = _get_lib()
-
-    torch.manual_seed(123)
-    configs = [
-        (4, 4, 64, 64, "MHA hd=64"),
-        (4, 4, 128, 128, "MHA hd=128"),
-        (4, 4, 64, 256, "MHA hd=256"),
-        (4, 1, 64, 64, "MQA hd=64"),
-        (128, 1, 64, 64, "MQA Pro hd=64"),
-    ]
-
-    all_pass = True
-    for n_q, n_kv, N, hd, desc in configs:
-        scale = 1.0 / math.sqrt(hd)
-        q = torch.randn(1, n_q, 1, hd, dtype=torch.bfloat16, device='cuda').contiguous()
-        k = torch.randn(1, n_kv, N, hd, dtype=torch.bfloat16, device='cuda').contiguous()
-        v = torch.randn(1, n_kv, hd, N, dtype=torch.bfloat16, device='cuda').contiguous()
-
-        # GQA expansion
-        q_per_kv = n_q // n_kv
-        if n_kv < n_q:
-            k = k.repeat_interleave(q_per_kv, dim=1)
-            v = v.repeat_interleave(q_per_kv, dim=1)
-
-        # Pad N to 128
-        if N < 128:
-            pad = 128 - N
-            k = torch.cat([k, torch.zeros(1, k.shape[1], pad, hd, dtype=torch.bfloat16, device='cuda')], dim=2)
-            v = torch.cat([v, torch.zeros(1, v.shape[1], hd, pad, dtype=torch.bfloat16, device='cuda')], dim=3)
-            N = 128
-        k = k.contiguous()
-        v = v.contiguous()
-        q = q.contiguous()
-
-        o = torch.zeros(1, n_q, 1, hd, dtype=torch.bfloat16, device='cuda')
-        lse = torch.zeros(1, n_q, 1, dtype=torch.float32, device='cuda')
-
-        # Call the TMA store variant
-        ret = lib.fmha_multihead_decode_tma_launch(
-            ctypes.c_void_p(q.data_ptr()),
-            ctypes.c_void_p(k.data_ptr()),
-            ctypes.c_void_p(v.data_ptr()),
-            ctypes.c_void_p(o.data_ptr()),
-            ctypes.c_void_p(lse.data_ptr()),
-            ctypes.c_int(1), ctypes.c_int(n_q), ctypes.c_int(n_q), ctypes.c_int(N), ctypes.c_int(hd),
-            ctypes.c_int(q.stride(1)), ctypes.c_int(q.stride(0)),
-            ctypes.c_int(k.stride(1)), ctypes.c_int(k.stride(0)),
-            ctypes.c_int(v.stride(1)), ctypes.c_int(v.stride(0)),
-            ctypes.c_int(o.stride(1)), ctypes.c_int(o.stride(0)),
-            ctypes.c_int(lse.stride(1)), ctypes.c_int(lse.stride(0)),
-            ctypes.c_float(scale),
-        )
-        torch.cuda.synchronize()
-
-        if ret != 0:
-            print(f"  FAIL [TMA] {desc}: kernel launch failed (ret={ret})")
-            all_pass = False
-            continue
-
-        # Compare with reference
-        o_ref = reference_attention(
-            q.reshape(1, n_q, 1, hd),
-            k.reshape(1, n_q, N, hd),
-            v.reshape(1, n_q, hd, N),
-            scale
-        )
-
-        worst_cos = 1.0
-        for h in range(n_q):
-            cos = torch.nn.functional.cosine_similarity(
-                o[0, h].flatten().float().unsqueeze(0),
-                o_ref[h].flatten().float().unsqueeze(0)
-            ).item()
-            worst_cos = min(worst_cos, cos)
-
-        status = "PASS" if worst_cos >= 0.999998 else "FAIL"
-        if worst_cos < 0.999998:
-            all_pass = False
-        print(f"  {status} [TMA] {desc}: worst_cos={worst_cos:.6f}")
-
-    return all_pass
-
-
-def test_direct_vs_tma_parity():
-    """Verify direct and TMA paths produce identical results."""
-    from dsv4.kernels.attention.fmha_multihead_op import fmha_multihead_decode_raw, _get_lib
-    import ctypes
-
-    lib = _get_lib()
-    torch.manual_seed(999)
-
-    configs = [
-        (4, 64, 64, "hd=64"),
-        (4, 128, 128, "hd=128"),
-        (4, 64, 256, "hd=256"),
-    ]
-
-    all_pass = True
-    for n_q, N, hd, desc in configs:
-        scale = 1.0 / math.sqrt(hd)
-        q = torch.randn(1, n_q, 1, hd, dtype=torch.bfloat16, device='cuda').contiguous()
-        k = torch.randn(1, n_q, N, hd, dtype=torch.bfloat16, device='cuda').contiguous()
-        v = torch.randn(1, n_q, hd, N, dtype=torch.bfloat16, device='cuda').contiguous()
-
-        # Direct path
-        sb = torch.zeros(1, n_q, dtype=torch.float32, device='cuda')
-        o_direct, _ = fmha_multihead_decode_raw(q, k, v, scale, 0, 0, False, sb)
-
-        # TMA path
-        o_tma = torch.zeros_like(q)
-        lse = torch.zeros(1, n_q, 1, dtype=torch.float32, device='cuda')
-        ret = lib.fmha_multihead_decode_tma_launch(
-            ctypes.c_void_p(q.data_ptr()),
-            ctypes.c_void_p(k.data_ptr()),
-            ctypes.c_void_p(v.data_ptr()),
-            ctypes.c_void_p(o_tma.data_ptr()),
-            ctypes.c_void_p(lse.data_ptr()),
-            ctypes.c_int(1), ctypes.c_int(n_q), ctypes.c_int(n_q), ctypes.c_int(N), ctypes.c_int(hd),
-            ctypes.c_int(q.stride(1)), ctypes.c_int(q.stride(0)),
-            ctypes.c_int(k.stride(1)), ctypes.c_int(k.stride(0)),
-            ctypes.c_int(v.stride(1)), ctypes.c_int(v.stride(0)),
-            ctypes.c_int(o_tma.stride(1)), ctypes.c_int(o_tma.stride(0)),
-            ctypes.c_int(lse.stride(1)), ctypes.c_int(lse.stride(0)),
-            ctypes.c_float(scale),
-        )
-        torch.cuda.synchronize()
-
-        if ret != 0:
-            print(f"  FAIL [parity] {desc}: TMA kernel failed")
-            all_pass = False
-            continue
-
-        # Direct vs TMA should be bit-identical (same math, same BF16 rounding)
-        cos = cosine_sim(o_direct, o_tma)
-        status = "PASS" if cos >= 0.999999 else "FAIL"
-        if cos < 0.999999:
-            all_pass = False
-        print(f"  {status} [parity] direct vs TMA {desc}: cos={cos:.8f}")
+        print(f"  {status} {desc}: worst_cos={worst_cos:.6f}")
 
     return all_pass
 
 
 if __name__ == "__main__":
-    print("P6 Integration Test: One-way TMEM→regs→SMEM→TMA store epilogue")
+    print("P6 Integration Test: One-way TMEM→regs→SMEM→GMEM epilogue")
     print("=" * 60)
 
-    p1 = test_direct_gmem_path()
-    p2 = test_tma_store_path()
-    p3 = test_direct_vs_tma_parity()
-
-    print()
-    if p1 and p2 and p3:
+    if test_one_way_epilogue():
         print("ALL PASS")
     else:
         print("SOME FAILED")
diff --git a/tests/unit/test_p6_tma_store.cu b/tests/unit/test_p6_tma_store.cu
deleted file mode 100644
index 477dbd05..00000000
--- a/tests/unit/test_p6_tma_store.cu
+++ /dev/null
@@ -1,192 +0,0 @@
-/**
- * P6 Minimal TMA store epilogue test.
- *
- * Tests the TMA store path of the 6-warp multi-head kernel.
- * Compares TMA store output vs direct GMEM write output.
- * If they match, the TMA store pipeline is correct.
- */
-
-#include <cstdio>
-#include <cstdlib>
-#include <cmath>
-#include <cuda_runtime.h>
-#include <cuda.h>
-
-#include "dsv4/kernels/attention/fmha_common.cuh"
-#include "dsv4/kernels/attention/fmha_umma_desc.cuh"
-#include "dsv4/kernels/attention/fmha_tma.cuh"
-#include "dsv4/kernels/attention/fmha_6warp_multihead.cuh"
-
-using namespace dsv4::kernels::attention;
-
-static float cosine_sim(const float* a, const float* b, int n) {
-    double dot = 0, na = 0, nb = 0;
-    for (int i = 0; i < n; i++) {
-        dot += a[i] * b[i];
-        na += a[i] * a[i];
-        nb += b[i] * b[i];
-    }
-    return (float)(dot / (sqrt(na) * sqrt(nb) + 1e-30));
-}
-
-int main() {
-    constexpr int HD = 64;
-    constexpr int N = 128;
-    constexpr int n_h = 4;
-    constexpr int batch = 1;
-    const float scale = 1.0f / sqrtf((float)HD);
-
-    // Allocate tensors
-    bf16_t *d_q, *d_k, *d_v;
-    bf16_t *d_o_direct, *d_o_tma;
-    float *d_lse_direct, *d_lse_tma;
-    cudaMalloc(&d_q, batch * n_h * 1 * HD * sizeof(bf16_t));
-    cudaMalloc(&d_k, batch * n_h * N * HD * sizeof(bf16_t));
-    cudaMalloc(&d_v, batch * n_h * HD * N * sizeof(bf16_t));
-    cudaMalloc(&d_o_direct, batch * n_h * 1 * HD * sizeof(bf16_t));
-    cudaMalloc(&d_o_tma, batch * n_h * 1 * HD * sizeof(bf16_t));
-    cudaMalloc(&d_lse_direct, batch * n_h * 1 * sizeof(float));
-    cudaMalloc(&d_lse_tma, batch * n_h * 1 * sizeof(float));
-
-    // Initialize with random data
-    srand(42);
-    auto init_bf16 = [](bf16_t* d, int n) {
-        float* h = new float[n];
-        for (int i = 0; i < n; i++) h[i] = (float)rand() / RAND_MAX - 0.5f;
-        // Use host-side BF16 conversion
-        for (int i = 0; i < n; i++) {
-            uint32_t u;
-            memcpy(&u, &h[i], 4);
-            u = u >> 16;  // truncate FP32 to BF16 (rough but sufficient for test)
-            d[i] = (bf16_t)(u & 0xFFFF);
-        }
-        delete[] h;
-    };
-    init_bf16(d_q, batch * n_h * HD);
-    init_bf16(d_k, batch * n_h * N * HD);
-    init_bf16(d_v, batch * n_h * HD * N);
-
-    // Launch with direct GMEM write (tma_o = nullptr)
-    {
-        FmhaParams params;
-        params.q = d_q;
-        params.k = d_k;
-        params.v = d_v;
-        params.o = d_o_direct;
-        params.lse = d_lse_direct;
-        params.s_k = N;
-        params.scale = scale;
-        params.head_dim = HD;
-        params.q_head_stride = HD;
-        params.q_batch_stride = n_h * HD;
-        params.k_head_stride = N * HD;
-        params.k_batch_stride = n_h * N * HD;
-        params.v_head_stride = HD * N;
-        params.v_batch_stride = n_h * HD * N;
-        params.o_head_stride = HD;
-        params.o_batch_stride = n_h * HD;
-        params.lse_head_stride = 1;
-        params.lse_batch_stride = n_h;
-        params.tma_o = nullptr;
-
-        int smem = 32768;
-        dim3 grid(1, n_h, batch);
-        fmha_6warp_multihead_kernel<HD, 128><<<grid, 192, smem>>>(params);
-        cudaDeviceSynchronize();
-    }
-
-    // Launch with TMA store (tma_o set)
-    {
-        // Create TMA descriptors for O
-        CUtensorMap* d_tma_o;
-        cudaMalloc(&d_tma_o, n_h * batch * sizeof(CUtensorMap));
-
-        for (int b = 0; b < batch; b++) {
-            for (int h = 0; h < n_h; h++) {
-                int idx = b * n_h + h;
-                bf16_t* o_head = d_o_tma + h * HD + b * n_h * HD;
-                CUtensorMap h_desc;
-                bool ok = create_tma_desc_2d_bf16(&h_desc, o_head, 1, HD, 1, HD);
-                if (!ok) { printf("TMA desc creation FAILED for head %d\n", h); return 1; }
-                cudaMemcpy(d_tma_o + idx, &h_desc, sizeof(CUtensorMap), cudaMemcpyHostToDevice);
-            }
-        }
-
-        FmhaParams params;
-        params.q = d_q;
-        params.k = d_k;
-        params.v = d_v;
-        params.o = d_o_tma;
-        params.lse = d_lse_tma;
-        params.s_k = N;
-        params.scale = scale;
-        params.head_dim = HD;
-        params.q_head_stride = HD;
-        params.q_batch_stride = n_h * HD;
-        params.k_head_stride = N * HD;
-        params.k_batch_stride = n_h * N * HD;
-        params.v_head_stride = HD * N;
-        params.v_batch_stride = n_h * HD * N;
-        params.o_head_stride = HD;
-        params.o_batch_stride = n_h * HD;
-        params.lse_head_stride = 1;
-        params.lse_batch_stride = n_h;
-        params.tma_o = d_tma_o;
-
-        int smem = 32768;
-        dim3 grid(1, n_h, batch);
-        fmha_6warp_multihead_kernel<HD, 128><<<grid, 192, smem>>>(params);
-        cudaError_t err = cudaDeviceSynchronize();
-        if (err != cudaSuccess) {
-            printf("TMA kernel FAILED: %s\n", cudaGetErrorString(err));
-            cudaFree(d_tma_o);
-            return 1;
-        }
-        cudaFree(d_tma_o);
-    }
-
-    // Compare outputs
-    bf16_t* h_o_direct = new bf16_t[n_h * HD];
-    bf16_t* h_o_tma = new bf16_t[n_h * HD];
-    cudaMemcpy(h_o_direct, d_o_direct, n_h * HD * sizeof(bf16_t), cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_o_tma, d_o_tma, n_h * HD * sizeof(bf16_t), cudaMemcpyDeviceToHost);
-
-    float* f_direct = new float[n_h * HD];
-    float* f_tma = new float[n_h * HD];
-        auto b2f = [](bf16_t h) -> float {
-            unsigned short us = h;
-            unsigned int u = us << 16;
-            float f;
-            memcpy(&f, &u, 4);
-            return f;
-        };
-        for (int i = 0; i < n_h * HD; i++) {
-            f_direct[i] = b2f(h_o_direct[i]);
-            f_tma[i] = b2f(h_o_tma[i]);
-        }
-
-    float cos = cosine_sim(f_direct, f_tma, n_h * HD);
-    printf("P6 TMA epilogue test (hd=%d, n_h=%d): cos=%.8f\n", HD, n_h, cos);
-
-    if (cos >= 0.999999f) {
-        printf("PASS: TMA and direct paths produce identical results\n");
-    } else {
-        printf("FAIL: TMA and direct paths differ\n");
-        // Print first few values for debugging
-        for (int h = 0; h < n_h; h++) {
-            printf("  Head %d: direct[0..3]=[%.4f,%.4f,%.4f,%.4f] tma[0..3]=[%.4f,%.4f,%.4f,%.4f]\n",
-                h, f_direct[h*HD], f_direct[h*HD+1], f_direct[h*HD+2], f_direct[h*HD+3],
-                f_tma[h*HD], f_tma[h*HD+1], f_tma[h*HD+2], f_tma[h*HD+3]);
-        }
-    }
-
-    delete[] h_o_direct;
-    delete[] h_o_tma;
-    delete[] f_direct;
-    delete[] f_tma;
-    cudaFree(d_q); cudaFree(d_k); cudaFree(d_v);
-    cudaFree(d_o_direct); cudaFree(d_o_tma);
-    cudaFree(d_lse_direct); cudaFree(d_lse_tma);
-
-    return (cos >= 0.999999f) ? 0 : 1;
-}