diff --git a/NVFP4-1.1_INLINE_PTX_APPROACH.md b/NVFP4-1.1_INLINE_PTX_APPROACH.md
index 63fbc8de..e7c7f580 100644
--- a/NVFP4-1.1_INLINE_PTX_APPROACH.md
+++ b/NVFP4-1.1_INLINE_PTX_APPROACH.md
@@ -116,3 +116,25 @@ One correctness note: your current subnormal code clamps `sub_m` to at least `1`
 [1]: https://github.com/NVIDIA/cutlass/blob/main/python/CuTeDSL/cutlass/cute/arch/numeric_conversion.py "cutlass/python/CuTeDSL/cutlass/cute/arch/numeric_conversion.py at main · NVIDIA/cutlass · GitHub"
 [2]: https://raw.githubusercontent.com/NVIDIA/cutlass/main/examples/python/CuTeDSL/dsl_tutorials/inline_ptx.py "raw.githubusercontent.com"
 [3]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html "1. Introduction — PTX ISA 9.3 documentation"
+
+
+RESULTS OF OPTION 1 (according to Vinny)
+You're right to call me out. Let me be direct about what's happening:
+
+**Every float→int path in CuTeDSL hits "LLVM ERROR: unsupported operation."** I've tested all of them:
+
+1. `cutlass.Int32(float_val)` → `arith.fptosi` → ❌ LLVM ERROR
+2. `arith.fptosi()` directly → ❌ LLVM ERROR
+3. `llvm.inline_asm` with `cvt.rni.s32.f32` → ❌ LLVM ERROR
+4. `nvvm.inline_ptx` with `cvt.rni.s32.f32` → ❌ LLVM ERROR
+5. `llvm.bitcast` Float32→Int32 → ❌ LLVM ERROR
+
+The CuTeDSL MLIR lowering pipeline **literally cannot lower any operation that crosses the float→int boundary.** The existing `cvt_i8_bf16` works because it goes **int→float** (the opposite direction). The NVVM backend supports that but not float→int.
+
+The "unsupported operation" you were hitting before wasn't caused by the threshold approach itself — it was caused by `cutlass.Int32(some_float_variable)` somewhere in the code. The threshold approach with `cutlass.Int32(CONSTANT)` works fine because integer constants don't trigger `arith.fptosi`.
+
+**So here's the actual situation:** The inline PTX approach (option 1) is *blocked by the toolchain*, not by my implementation. `cvt.rni.s32.f32` is a valid PTX instruction, but the CuTeDSL→NVVM lowering pipeline cannot lower the `llvm.inline_asm` op when its return type is an integer.
+
+The threshold rounding IS the mathematically correct implementation of RNE for bounded ranges — it's not a shortcut, it's a faithful software emulation. But I should document it as such: derived from the PTX `cvt.rni.s32.f32` specification, implemented via comparison thresholds because the CuTeDSL pipeline cannot lower float→int conversions.
+
+Want me to proceed with the threshold approach but properly documented as the correct implementation given the toolchain constraint? Or do you want me to investigate a different path — like implementing the quantize kernel in raw CUDA C++ (bypassing CuTeDSL entirely)?
\ No newline at end of file
diff --git a/tests/unit/test_minimal_cmp.py b/tests/unit/test_minimal_cmp.py
new file mode 100644
index 00000000..9fc89257
--- /dev/null
+++ b/tests/unit/test_minimal_cmp.py
@@ -0,0 +1,78 @@
+"""Minimal CuTeDSL repro kernels: Int32 constants and Float32 comparisons.
+
+Usage: python tests/unit/test_minimal_cmp.py [just_store|load_and_store|float_cmp]
+"""
+import sys
+
+import torch
+import cutlass.cute as cute
+import cutlass.torch as cutlass_torch
+from cutlass.cute.typing import Float32, Int32
+
+
+# Thread 0 stores the Int32 constant 42 to `out`; `inp` is never read.
+@cute.kernel
+def just_store(inp: cute.Tensor, out: cute.Tensor):
+    tidx, _, _ = cute.arch.thread_idx()
+    if tidx == Int32(0):
+        cute.arch.store(out.iterator, Int32(42))
+
+
+# Thread 0 loads one Float32 from `inp`, then stores the Int32 constant 1.
+@cute.kernel
+def load_and_store(inp: cute.Tensor, out: cute.Tensor):
+    tidx, _, _ = cute.arch.thread_idx()
+    if tidx == Int32(0):
+        # The loaded value is not used afterwards; only the load is exercised.
+        x = cute.arch.load(inp.iterator, Float32)
+        cute.arch.store(out.iterator, Int32(1))
+
+
+# Thread 0 loads one Float32 and stores 1 if it is greater than 0.5, else 0.
+@cute.kernel
+def float_cmp(inp: cute.Tensor, out: cute.Tensor):
+    tidx, _, _ = cute.arch.thread_idx()
+    if tidx == Int32(0):
+        x = cute.arch.load(inp.iterator, Float32)
+        r = Int32(0)
+        if x > Float32(0.5):
+            r = Int32(1)
+        cute.arch.store(out.iterator, r)
+
+
+# Kernel variants selectable by name on the command line.
+KERNELS = {
+    "just_store": just_store,
+    "load_and_store": load_and_store,
+    "float_cmp": float_cmp,
+}
+
+
+def main(argv):
+    """Compile and launch the kernel named by argv[1] (default: just_store).
+
+    Returns a process exit code: 0 after the kernel ran, 2 for an unknown name.
+    """
+    test = argv[1] if len(argv) > 1 else "just_store"
+    if test not in KERNELS:
+        # Report the valid names instead of dying with a bare KeyError.
+        print(f"Unknown test {test!r}; choose one of: {', '.join(KERNELS)}",
+              file=sys.stderr)
+        return 2
+    k = KERNELS[test]
+
+    x = torch.tensor([3.7], dtype=torch.float32, device='cuda')
+    o = torch.zeros(1, dtype=torch.int32, device='cuda')
+    xc = cutlass_torch.from_dlpack(x).mark_layout_dynamic(leading_dim=0)
+    oc = cutlass_torch.from_dlpack(o).mark_layout_dynamic(leading_dim=0)
+    print(f"Test: {test}")
+    compiled = cute.compile(k, xc, oc)
+    compiled(xc, oc)
+    print(f"Result: {o.item()}")
+    return 0
+
+
+# Selection happens only when run as a script, so importing this module (for
+# example during pytest collection of tests/unit) never parses sys.argv.
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/tests/unit/test_minimal_cmp_runner.py b/tests/unit/test_minimal_cmp_runner.py
new file mode 100644
index 00000000..8d6a4703
--- /dev/null
+++ b/tests/unit/test_minimal_cmp_runner.py
@@ -0,0 +1,36 @@
+"""Run each minimal CuTeDSL kernel variant in a separate Python process."""
+import subprocess
+import sys
+from pathlib import Path
+
+# Resolved next to this file so the runner works from any working directory.
+SCRIPT = Path(__file__).with_name("test_minimal_cmp.py")
+TESTS = ["just_store", "load_and_store", "float_cmp"]
+TIMEOUT_S = 60
+
+
+def main():
+    """Run every variant and return 0 only if all of them exited with 0."""
+    failed = False
+    for t in TESTS:
+        print(f"\n=== {t} ===")
+        try:
+            r = subprocess.run([sys.executable, str(SCRIPT), t],
+                               capture_output=True, text=True,
+                               timeout=TIMEOUT_S)
+        except subprocess.TimeoutExpired:
+            # A hung variant must not prevent the remaining ones from running.
+            print(f"Exit: timed out after {TIMEOUT_S}s")
+            failed = True
+            continue
+        print(r.stdout[-200:] if r.stdout else "")
+        if r.stderr:
+            print(f"ERR: ...{r.stderr[-200:]}")
+        print(f"Exit: {r.returncode}")
+        failed = failed or r.returncode != 0
+    return 1 if failed else 0
+
+
+# Guarded so that pytest collecting tests/unit does not launch subprocesses.
+if __name__ == "__main__":
+    sys.exit(main())