[Kernel] (2/N) Machete - Integrate into CompressedTensorsWNA16 and GPTQMarlin (#7701)

Co-authored-by: mgoin <michael@neuralmagic.com> Co-authored-by: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-09-23 13:46:26 -04:00
parent ee5f34b1c2
commit 86e9c8df29
27 changed files with 1005 additions and 246 deletions
--- a/csrc/cutlass_extensions/torch_utils.hpp
+++ b/csrc/cutlass_extensions/torch_utils.hpp
@@ -68,7 +68,13 @@ static inline auto make_cute_layout(torch::Tensor const& tensor,
                        name, ".stride(", idx, ") to be ", StrideEle::value);
            return StrideEle{};
          } else {
-            return tensor.stride(idx);
+            if (tensor.size(idx) == 1) {
+              // use 0 stride for dim with size 1, this is easier for
+              // cute/cutlass to optimize (helps the TMA code flatten dims)
+              return StrideEle{0};
+            } else {
+              return tensor.stride(idx);
+            }
          }
        } else {
          // Extra strides are assumed to be 0 or 1