[CI/Build] Per file CUDA Archs (improve wheel size and dev build times) (#8845)

2024-10-03 22:55:25 -04:00
parent 3dbb215b38
commit aeb37c2a72
22 changed files with 828 additions and 370 deletions
--- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
+++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
@@ -28,6 +28,7 @@

 #include "common/base.h"
 #include "core/scalar_type.hpp"
+#include "core/registration.h"

 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800

@@ -1134,3 +1135,7 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,

  return c;
 }
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {  // Per-file op registration, invoked with the extension name and CUDA. NOTE(review): macro is presumably declared in core/registration.h and wraps PyTorch's TORCH_LIBRARY_IMPL — confirm.
+  m.impl("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);  // Binds the op name "gptq_marlin_24_gemm" to the gptq_marlin_24_gemm function in this file.
+}  // End of the registration block for this translation unit.