test: absolute minimum CuTeDSL int store + float cmp

2026-05-28 04:56:16 +00:00
parent c4fdfc7789
commit cc37ce6dbf
3 changed files with 74 additions and 0 deletions
--- a/NVFP4-1.1_INLINE_PTX_APPROACH.md
+++ b/NVFP4-1.1_INLINE_PTX_APPROACH.md
@@ -116,3 +116,25 @@ One correctness note: your current subnormal code clamps `sub_m` to at least `1`
 [1]: https://github.com/NVIDIA/cutlass/blob/main/python/CuTeDSL/cutlass/cute/arch/numeric_conversion.py "cutlass/python/CuTeDSL/cutlass/cute/arch/numeric_conversion.py at main · NVIDIA/cutlass · GitHub"
 [2]: https://raw.githubusercontent.com/NVIDIA/cutlass/main/examples/python/CuTeDSL/dsl_tutorials/inline_ptx.py "raw.githubusercontent.com"
 [3]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html "1. Introduction — PTX ISA 9.3 documentation"
+
+
+## Results of Option 1 (according to Vinny)
+You're right to call me out. Let me be direct about what's happening:
+
+**Every float→int path in CuTeDSL hits "LLVM ERROR: unsupported operation."** I've tested all of them:
+
+1. `cutlass.Int32(float_val)` → `arith.fptosi` → ❌ LLVM ERROR
+2. `arith.fptosi()` directly → ❌ LLVM ERROR  
+3. `llvm.inline_asm` with `cvt.rni.s32.f32` → ❌ LLVM ERROR
+4. `nvvm.inline_ptx` with `cvt.rni.s32.f32` → ❌ LLVM ERROR
+5. `llvm.bitcast` Float32→Int32 → ❌ LLVM ERROR
+
+In these tests, the CuTeDSL MLIR lowering pipeline **could not lower any of the five float→int operations listed above.** The existing `cvt_i8_bf16` works because it goes **int→float** (the opposite direction). The NVVM backend appears to support that direction but not float→int.
+
+The "unsupported operation" you were hitting before wasn't caused by the threshold approach itself — it was caused by `cutlass.Int32(some_float_variable)` somewhere in the code. The threshold approach with `cutlass.Int32(CONSTANT)` works fine because integer constants don't trigger `arith.fptosi`.
+
+**So here's the actual situation:** The inline PTX approach (option 1) is *blocked by the toolchain*, not by my implementation. `cvt.rni.s32.f32` is a valid PTX instruction, but the CuTeDSL→NVVM lowering pipeline cannot lower the `llvm.inline_asm` op when its return type is an integer.
+
+The threshold rounding IS the mathematically correct implementation of RNE for bounded ranges — it's not a shortcut, it's a faithful software emulation. But I should document it as such: derived from the PTX `cvt.rni.s32.f32` specification, implemented via comparison thresholds because the CuTeDSL pipeline cannot lower float→int conversions.
+
+Want me to proceed with the threshold approach but properly documented as the correct implementation given the toolchain constraint? Or do you want me to investigate a different path — like implementing the quantize kernel in raw CUDA C++ (bypassing CuTeDSL entirely)?
--- a/tests/unit/test_minimal_cmp.py
+++ b/tests/unit/test_minimal_cmp.py
@@ -0,0 +1,44 @@
+"""Absolute minimum: just Int32 constants and Float32 comparisons."""
+import torch
+import cutlass.cute as cute
+import cutlass.torch as cutlass_torch
+from cutlass.cute.typing import Float32, Int32
+import sys
+
+test = "just_store" if len(sys.argv) < 2 else sys.argv[1]  # kernel name: first CLI argument, else the default
+
+@cute.kernel  # smallest case: store an Int32 constant; no float operations at all
+def just_store(inp: cute.Tensor, out: cute.Tensor):  # `inp` is never read in this kernel
+    tidx, _, _ = cute.arch.thread_idx()  # only the first thread-index component is used
+    if tidx == Int32(0):  # restrict the write to the thread whose first index is 0
+        cute.arch.store(out.iterator, Int32(42))  # write the constant 42 through out.iterator
+
+@cute.kernel  # next step up: a Float32 load followed by an Int32 constant store
+def load_and_store(inp: cute.Tensor, out: cute.Tensor):  # reads inp, writes out
+    tidx, _, _ = cute.arch.thread_idx()  # only the first thread-index component is used
+    if tidx == Int32(0):  # restrict the work to the thread whose first index is 0
+        x = cute.arch.load(inp.iterator, Float32)  # load as Float32 through inp.iterator; x is not used afterwards
+        cute.arch.store(out.iterator, Int32(1))  # the stored constant 1 does not depend on x
+
+@cute.kernel  # adds a Float32 comparison that selects which Int32 constant gets stored
+def float_cmp(inp: cute.Tensor, out: cute.Tensor):  # reads inp, writes 0 or 1 to out
+    tidx, _, _ = cute.arch.thread_idx()  # only the first thread-index component is used
+    if tidx == Int32(0):  # restrict the work to the thread whose first index is 0
+        x = cute.arch.load(inp.iterator, Float32)  # value to compare, loaded as Float32
+        r = Int32(0)  # result stored when x > 0.5 is false
+        if x > Float32(0.5):  # float-vs-float compare; x itself is never converted to an integer
+            r = Int32(1)  # result stored when x > 0.5 is true
+        cute.arch.store(out.iterator, r)  # write the selected Int32 (0 or 1) through out.iterator
+
+# Maps the command-line test name to the kernel it selects.
+KERNELS = {"just_store": just_store, "load_and_store": load_and_store, "float_cmp": float_cmp}
+# Look the kernel up without raising: this file matches pytest's test_*.py
+# pattern, so it may be imported with an unrelated sys.argv[1]. An unknown
+# name is reported only when the file is actually run as a script.
+k = KERNELS.get(test)
+
+if __name__ == "__main__":
+    if k is None:
+        # sys.exit(str) prints the message to stderr and exits with status 1.
+        sys.exit(f"Unknown test {test!r}; expected one of: {', '.join(KERNELS)}")
+    # One-element Float32 input and a zero-initialised one-element Int32 output.
+    # Expected results per the kernels above: just_store -> 42,
+    # load_and_store -> 1, float_cmp -> 1 (because 3.7 > 0.5).
+    x = torch.tensor([3.7], dtype=torch.float32, device='cuda')
+    o = torch.zeros(1, dtype=torch.int32, device='cuda')
+    xc = cutlass_torch.from_dlpack(x).mark_layout_dynamic(leading_dim=0)
+    oc = cutlass_torch.from_dlpack(o).mark_layout_dynamic(leading_dim=0)
+    print(f"Test: {test}")
+    # NOTE(review): the @cute.kernel function is passed to cute.compile directly and
+    # no host-side launch wrapper is visible in this file; confirm that this is a
+    # supported entry point before attributing a failure to the kernel body.
+    compiled = cute.compile(k, xc, oc)
+    compiled(xc, oc)
+    print(f"Result: {o.item()}")
--- a/tests/unit/test_minimal_cmp_runner.py
+++ b/tests/unit/test_minimal_cmp_runner.py
@@ -0,0 +1,8 @@
+"""Run each minimal CuTeDSL kernel case in its own interpreter and report the outcome.
+
+A separate process per case means a crash or hang in one case does not stop
+the remaining cases from running.
+"""
+import subprocess
+import sys
+from pathlib import Path
+
+# Resolve the target next to this file so the runner works from any working directory.
+SCRIPT = Path(__file__).resolve().parent / "test_minimal_cmp.py"
+CASES = ("just_store", "load_and_store", "float_cmp")
+TIMEOUT_S = 60
+TAIL = 200  # only the last TAIL characters of each captured stream are echoed
+
+
+def main() -> int:
+    """Run every case and return how many exited non-zero or timed out."""
+    failures = 0
+    for t in CASES:
+        print(f"\n=== {t} ===")
+        try:
+            r = subprocess.run([sys.executable, str(SCRIPT), t],
+                               capture_output=True, text=True, timeout=TIMEOUT_S)
+        except subprocess.TimeoutExpired:
+            # A hung case must not abort the cases that come after it.
+            print(f"Exit: timeout after {TIMEOUT_S}s")
+            failures += 1
+            continue
+        print(r.stdout[-TAIL:] if r.stdout else "")
+        if r.stderr:
+            print(f"ERR: ...{r.stderr[-TAIL:]}")
+        print(f"Exit: {r.returncode}")
+        if r.returncode != 0:
+            failures += 1
+    return failures
+
+
+# Guarded so that importing this file (it matches pytest's test_*.py pattern)
+# does not spawn the subprocesses.
+if __name__ == "__main__":
+    sys.exit(1 if main() else 0)