[Kernel][Bugfix] Refactor and Fix CUTLASS 2:4 Sparse Kernels (#13198)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
Tyler Michael Smith
2025-02-13 19:01:14 -05:00
committed by GitHub
parent 2344192a55
commit c1e37bf71b
16 changed files with 576 additions and 473 deletions

View File

@@ -23,6 +23,9 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias);
using CompressorResult = std::tuple<torch::Tensor, torch::Tensor>;
CompressorResult cutlass_sparse_compress_sm90(torch::Tensor const& a);
#endif
void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
@@ -68,3 +71,30 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
"CUDA device capability: ",
version_num);
}
std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a) {
// Check for strides and alignment
TORCH_CHECK(a.stride(1) == 1); // Row-major
TORCH_CHECK(a.stride(0) % 8 == 0); // 8 Byte Alignment for Compression
at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
int32_t version_num = get_sm_version_num();
// Guard against compilation issues for sm90 kernels
#if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
if (version_num >= 90) {
std::vector<torch::Tensor> result_tensors;
auto [a_meta, a_nzs] = cutlass_sparse_compress_sm90(a);
result_tensors.push_back(std::move(a_nzs));
result_tensors.push_back(std::move(a_meta));
return result_tensors;
}
#endif
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled cutlass_sparse_compress for a compute capability less than "
"CUDA device capability: ",
version_num);
}