diff --git a/tests/unit/test_fmha_sm100.py b/tests/unit/test_fmha_sm100.py
index 90eb35c7..96a0bfe4 100644
--- a/tests/unit/test_fmha_sm100.py
+++ b/tests/unit/test_fmha_sm100.py
@@ -1,15 +1,10 @@
 """
-Test: Compile FMHA SM100 kernel with nvcc, load as shared library, test correctness.
-
-This uses ctypes to load the compiled .so instead of torch.utils.cpp_extension
-(which adds -D__CUDA_NO_BFLOAT16_CONVERSIONS__ causing ICE with __bf16).
+Test: Compile and run standalone FMHA SM100 test via nvcc.
+No PyTorch needed — pure CUDA runtime test.
 """
 import subprocess
 import sys
 import os
-import torch
-import math
-import ctypes
 
 def get_repo_root():
     d = os.path.dirname(os.path.abspath(__file__))
@@ -19,62 +14,46 @@ def get_repo_root():
     return None
 
 REPO = get_repo_root()
-CUTLASS = "/root/cutlass"
 CUDA = "/usr/local/cuda-13.2"
-OUT = "/tmp/fmha_sm100_test"
 
-def compile_kernel():
-    """Compile the kernel as a shared library using nvcc directly."""
-    src = f"{REPO}/dsv4/kernels/attention/fmha_sm100_launch.cu"
-    out = f"{OUT}.so"
+# Step 1: Compile standalone test
+print("=" * 60)
+print("Compiling standalone FMHA SM100 test...")
+print("=" * 60)
 
-    cmd = [
-        f"{CUDA}/bin/nvcc",
-        "--std=c++20",
-        "-shared",
-        "-Xcompiler", "-fPIC",
-        f"-gencode=arch=compute_100a,code=sm_100a",
-        f"-I{REPO}",
-        f"-I{CUTLASS}/include",
-        f"-I{CUDA}/include",
-        "-I/root/dsv4-nvfp4-workspace/venv/lib/python3.12/site-packages/torch/include",
-        "-I/root/dsv4-nvfp4-workspace/venv/lib/python3.12/site-packages/torch/include/torch/csrc/api/include",
-        "-I/usr/include/python3.12",
-        "-DGOOGLE_CUDA=1",
-        "--expt-relaxed-constexpr",
-        src,
-        "-o", out,
-        "-L/root/dsv4-nvfp4-workspace/venv/lib/python3.12/site-packages/torch/lib",
-        "-lc10_cuda", "-ltorch_cuda", "-ltorch", "-lc10",
-        "-lcudart",
-    ]
+src = f"{REPO}/tests/unit/test_fmha_sm100_standalone.cu"
+out = "/tmp/fmha_sm100_standalone"
 
-    print(f"Compiling: {' '.join(cmd[:5])}...")
-    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
-    if result.returncode != 0:
-        print(f"❌ Compilation FAILED:\n{result.stderr[-1000:]}")
-        return False
-    print(f"✅ Compiled: {out}")
-    return True
+cmd = [
+    f"{CUDA}/bin/nvcc",
+    "--std=c++20",
+    f"-gencode=arch=compute_100a,code=sm_100a",
+    f"-I{REPO}",
+    "--expt-relaxed-constexpr",
+    src,
+    "-o", out,
+    "-lcudart",
+]
 
+print(f"nvcc: {' '.join(cmd[:4])}...")
+result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+if result.returncode != 0:
+    print(f"❌ Compilation FAILED:\n{result.stderr[-2000:]}")
+    sys.exit(1)
+print(f"✅ Compiled: {out}")
 
-def test_correctness():
-    """Test FMHA output against PyTorch reference using ctypes."""
-    # Load the shared library
-    lib = ctypes.CDLL(f"{OUT}.so")
-    # We'd need to call the function via ctypes, but the function
-    # returns torch tensors which complicates ctypes.
-    # Instead, let's write a simple Python test script that uses
-    # torch.ops to call the custom op.
-    print("Shared library loaded. Need proper test harness for correctness.")
-    print("For now, verify kernel compiles and loads successfully.")
-    return True
+# Step 2: Run the test
+print("\n" + "=" * 60)
+print("Running standalone FMHA SM100 test...")
+print("=" * 60)
 
+result = subprocess.run([out], capture_output=True, text=True, timeout=30)
+print(result.stdout)
+if result.stderr:
+    print(f"STDERR: {result.stderr[-500:]}")
+print(f"Exit code: {result.returncode}")
 
-if __name__ == "__main__":
-    print("=" * 60)
-    print("FMHA SM100 — nvcc direct compilation + correctness test")
-    print("=" * 60)
-
-    if compile_kernel():
-        test_correctness()
+if result.returncode == 0:
+    print("\n✅ ALL TESTS PASSED!")
+else:
+    print("\n❌ TEST FAILED")
diff --git a/tests/unit/test_fmha_sm100_standalone.cu b/tests/unit/test_fmha_sm100_standalone.cu
new file mode 100644
index 00000000..763d3c9f
--- /dev/null
+++ b/tests/unit/test_fmha_sm100_standalone.cu
@@ -0,0 +1,172 @@
+/**
+ * Standalone CUDA test for FMHA SM100 decode kernel.
+ * Launches the kernel directly via CUDA runtime, compares against CPU reference.
+ * No PyTorch or pybind11 needed — just nvcc + CUDA runtime.
+ */
+
+#include "fmha_sm100.cuh"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+
+using namespace dsv4::kernels::attention;
+
+// CPU reference: simple attention
+void attention_ref_cpu(
+    const float* q, const float* k, const float* v,
+    float* o, float* lse,
+    int B, int H, int sk, int HD, float scale
+) {
+    for (int b = 0; b < B; b++) {
+        for (int h = 0; h < H; h++) {
+            const float* qh = q + (b * H + h) * HD;
+            const float* kb = k + b * sk * HD;
+            const float* vb = v + b * HD * sk;
+            float* oh = o + (b * H + h) * HD;
+
+            // S = Q @ K^T * scale
+            float* s = (float*)malloc(sk * sizeof(float));
+            float s_max = -FLT_MAX;
+            for (int c = 0; c < sk; c++) {
+                float dot = 0.0f;
+                for (int d = 0; d < HD; d++) dot += qh[d] * kb[c * HD + d];
+                s[c] = dot * scale;
+                s_max = fmaxf(s_max, s[c]);
+            }
+
+            // Softmax
+            float sum = 0.0f;
+            for (int c = 0; c < sk; c++) {
+                s[c] = expf(s[c] - s_max);
+                sum += s[c];
+            }
+            for (int c = 0; c < sk; c++) s[c] /= sum;
+
+            // O = S @ V
+            for (int d = 0; d < HD; d++) {
+                oh[d] = 0.0f;
+                for (int c = 0; c < sk; c++) {
+                    oh[d] += s[c] * vb[d * sk + c];
+                }
+            }
+
+            if (lse) lse[b * H + h] = logf(sum) + s_max;
+
+            free(s);
+        }
+    }
+}
+
+// BF16 conversion helpers for CPU
+uint16_t f32_to_bf16_cpu(float f) {
+    uint32_t u;
+    memcpy(&u, &f, 4);
+    uint16_t h = (uint16_t)(u >> 16);
+    return h;
+}
+
+float bf16_to_f32_cpu(uint16_t h) {
+    uint32_t u = ((uint32_t)h) << 16;
+    float f;
+    memcpy(&f, &u, 4);
+    return f;
+}
+
+int main() {
+    printf("=== FMHA SM100 Decode Kernel Test ===\n");
+
+    const int B = 1, H = 1, HD = 64, sk = 128;
+    const float scale = 1.0f / sqrtf((float)HD);
+    const int smem = 128 * HD * 2 * sizeof(uint16_t) + 1024; // K + V + slack
+
+    // Allocate host memory
+    float *hq = (float*)malloc(B * H * HD * sizeof(float));
+    float *hk = (float*)malloc(B * sk * HD * sizeof(float));
+    float *hv = (float*)malloc(B * HD * sk * sizeof(float));
+    float *ho_ref = (float*)malloc(B * H * HD * sizeof(float));
+
+    // Init with random data
+    srand(42);
+    for (int i = 0; i < B * H * HD; i++) hq[i] = (float)rand() / RAND_MAX - 0.5f;
+    for (int i = 0; i < B * sk * HD; i++) hk[i] = (float)rand() / RAND_MAX - 0.5f;
+    for (int i = 0; i < B * HD * sk; i++) hv[i] = (float)rand() / RAND_MAX - 0.5f;
+
+    // CPU reference
+    attention_ref_cpu(hq, hk, hv, ho_ref, NULL, B, H, sk, HD, scale);
+
+    // Convert to BF16
+    uint16_t *hqb = (uint16_t*)malloc(B * H * HD * sizeof(uint16_t));
+    uint16_t *hkb = (uint16_t*)malloc(B * sk * HD * sizeof(uint16_t));
+    uint16_t *hvb = (uint16_t*)malloc(B * HD * sk * sizeof(uint16_t));
+    uint16_t *hob = (uint16_t*)malloc(B * H * HD * sizeof(uint16_t));
+
+    for (int i = 0; i < B * H * HD; i++) hqb[i] = f32_to_bf16_cpu(hq[i]);
+    for (int i = 0; i < B * sk * HD; i++) hkb[i] = f32_to_bf16_cpu(hk[i]);
+    for (int i = 0; i < B * HD * sk; i++) hvb[i] = f32_to_bf16_cpu(hv[i]);
+
+    // Allocate GPU memory
+    uint16_t *dq, *dk, *dv, *do_;
+    float *d_lse;
+    cudaMalloc(&dq, B * H * HD * sizeof(uint16_t));
+    cudaMalloc(&dk, B * sk * HD * sizeof(uint16_t));
+    cudaMalloc(&dv, B * HD * sk * sizeof(uint16_t));
+    cudaMalloc(&do_, B * H * HD * sizeof(uint16_t));
+    cudaMalloc(&d_lse, B * H * sizeof(float));
+
+    // Copy to GPU
+    cudaMemcpy(dq, hqb, B * H * HD * sizeof(uint16_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(dk, hkb, B * sk * HD * sizeof(uint16_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(dv, hvb, B * HD * sk * sizeof(uint16_t), cudaMemcpyHostToDevice);
+    cudaMemset(do_, 0, B * H * HD * sizeof(uint16_t));
+
+    // Launch kernel
+    dim3 grid(1, H, B);
+    dim3 block(NTHREADS);
+
+    printf("Launching fmha_decode_ref<%d> <<<(%d,%d,%d), %d>>>...\n", HD, grid.x, grid.y, grid.z, block.x);
+
+    fmha_decode_ref<HD><<<grid, block, smem>>>(
+        dq, dk, dv, do_,
+        H * HD, sk * HD, H * HD,
+        sk, 0, 0, scale, NULL, d_lse
+    );
+
+    cudaError_t err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        printf("❌ Kernel launch failed: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    printf("✅ Kernel launched successfully!\n");
+
+    // Copy result back
+    cudaMemcpy(hob, do_, B * H * HD * sizeof(uint16_t), cudaMemcpyDeviceToHost);
+
+    // Compare with reference
+    float cos_sim = 0.0f, norm_a = 0.0f, norm_b = 0.0f;
+    for (int i = 0; i < B * H * HD; i++) {
+        float gpu_val = bf16_to_f32_cpu(hob[i]);
+        float ref_val = ho_ref[i];
+        cos_sim += gpu_val * ref_val;
+        norm_a += gpu_val * gpu_val;
+        norm_b += ref_val * ref_val;
+    }
+    float denom = sqrtf(norm_a) * sqrtf(norm_b);
+    if (denom > 0) cos_sim /= denom;
+
+    printf("\nhd=%d, s_k=%d: cos %.6f %s\n", HD, sk, cos_sim, cos_sim > 0.999f ? "✅ PASS" : "❌ FAIL");
+
+    if (cos_sim < 0.999f) {
+        printf("First 8 values (GPU vs Ref):\n");
+        for (int i = 0; i < 8; i++) {
+            printf("  [%d] GPU=%f Ref=%f\n", i, bf16_to_f32_cpu(hob[i]), ho_ref[i]);
+        }
+    }
+
+    // Cleanup
+    cudaFree(dq); cudaFree(dk); cudaFree(dv); cudaFree(do_); cudaFree(d_lse);
+    free(hq); free(hk); free(hv); free(ho_ref);
+    free(hqb); free(hkb); free(hvb); free(hob);
+
+    return cos_sim > 0.999f ? 0 : 1;
+}