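Fix includes: replace cute/numeric/float8.hpp with cutlass/float_subbyte.h (which has float_e2m1_t and float_ue4m3_t), and point the default CUTLASS_INCLUDE_DIR to the latest CUTLASS at /root/cutlass/include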

2026-05-13 23:23:01 +00:00
parent d789f5e3e0
commit 8a9af441dc
3 changed files with 6 additions and 7 deletions
--- a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu
+++ b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu
@@ -43,7 +43,7 @@
 #include "cutlass/kernel_hardware_info.hpp"
 #include "cutlass/detail/sm100_blockscaled_layout.hpp"

-#include "cute/numeric/float8.hpp"
+#include "cutlass/float_subbyte.h"
 #include "cute/layout.hpp"

 using namespace cute;
--- a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/pytorch_binding.cpp
+++ b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/pytorch_binding.cpp
@@ -27,7 +27,7 @@
 #include "cutlass/kernel_hardware_info.hpp"
 #include "cutlass/detail/sm100_blockscaled_layout.hpp"

-#include "cute/numeric/float8.hpp"
+#include "cutlass/float_subbyte.h"
 #include "cute/layout.hpp"

 using namespace cute;
--- a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/setup.py
+++ b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/setup.py
@@ -1,18 +1,17 @@
-"""
-Setup script for CUTLASS NVFP4 block-scaled GEMM PyTorch extension.
-"""
+"""Setup script for CUTLASS NVFP4 block-scaled GEMM PyTorch extension."""

 import os
 from setuptools import setup
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension

-# CUTLASS include directory
+# CUTLASS include directory — prefer the latest from GitHub
 CUTLASS_INCLUDE_DIR = os.environ.get(
    "CUTLASS_INCLUDE_DIR",
-    "/usr/local/lib/python3.12/dist-packages/tilelang/3rdparty/cutlass/include"
+    "/root/cutlass/include"
 )
 if not os.path.exists(os.path.join(CUTLASS_INCLUDE_DIR, "cutlass", "cutlass.h")):
    for alt in [
+        "/root/cutlass/include",
        "/usr/local/lib/python3.12/dist-packages/tilelang/3rdparty/cutlass/include",
        "/usr/local/include/cutlass",
        "/opt/cutlass/include",