Fix SM90 GEMM (#149)

* Fix sm90 GEMM

* Fix typo

---------

Co-authored-by: Kuai Yu <yukuai@deepseek.com>
This commit is contained in:
yukuai26
2025-08-01 10:36:49 +08:00
committed by GitHub
parent c50deed14c
commit aff9da0aba

View File

@@ -169,7 +169,7 @@ struct Scheduler {
// For SM90 only
// NOTES: we don't have to set `is_peer_cta_alive` for masked grouped GEMM, as it must be aligned
- is_peer_cta_alive = kNum1DBlocksPerGroup % kNumMulticast == 0 or // Always aligned on N (constant bypass)
+ is_peer_cta_alive = num_n_blocks % kNumMulticast == 0 or // Always aligned on N (constant bypass)
num_m_blocks % kNumMulticast == 0 or // Always aligned on M (constant bypass)
(next_block_idx ^ 1) < num_blocks; // Peer CTA in bound
get_swizzled_block_idx(next_block_idx, m_block_idx, n_block_idx);