[feat]: CUTLASS block scaled group gemm for SM100 (#19757)

Signed-off-by: Duncan Moss <djm.moss@gmail.com>
Co-authored-by: Duncan Moss <dmoss@nvidia.com>
This commit is contained in:
Duncan Moss
2025-07-04 11:58:04 -07:00
committed by GitHub
parent 2f35a022e6
commit 3d184b95b8
13 changed files with 726 additions and 30 deletions

View File

@@ -51,7 +51,8 @@ struct cutlass_3x_gemm {
// These are the minimum alignments needed for the kernels to compile
static constexpr int AlignmentAB =
128 / cutlass::sizeof_bits<ElementAB>::value;
-static constexpr int AlignmentCD = 4;
+static constexpr int AlignmentCD =
+128 / cutlass::sizeof_bits<ElementD>::value;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<