[CI/Build] Per file CUDA Archs (improve wheel size and dev build times) (#8845)

This commit is contained in:
Lucas Wilkinson
2024-10-03 22:55:25 -04:00
committed by GitHub
parent 3dbb215b38
commit aeb37c2a72
22 changed files with 828 additions and 370 deletions

View File

@@ -34,10 +34,9 @@ static __global__ void prepack_B_kernel(BInTensor B_in,
}
template <typename PrepackedLayoutB, typename InLayout>
static void prepack_B(cudaStream_t stream,
typename PrepackedLayoutB::ElementB const* B_in_ptr,
InLayout B_layout,
typename PrepackedLayoutB::ElementB* B_out_ptr) {
static void prepack_B_template(
cudaStream_t stream, typename PrepackedLayoutB::ElementB const* B_in_ptr,
InLayout B_layout, typename PrepackedLayoutB::ElementB* B_out_ptr) {
using TileShapeNKL =
decltype(append(typename PrepackedLayoutB::PPBlockShape_NK{}, _1{}));
auto ilvd_NKbNbKL_to_offset =