// Commit history: Merge with private repo; Update README; Update README; Update README; Add PyTorch requirements; Fix sync scopes for MQA logits (#256); Update README
// File info: 36 lines, 1.1 KiB, plaintext
// GEMM kernels
|
|
#include <deep_gemm/impls/sm90_bf16_gemm.cuh>
|
|
#include <deep_gemm/impls/sm90_fp8_gemm_1d1d.cuh>
|
|
#include <deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh>
|
|
#include <deep_gemm/impls/sm100_bf16_gemm.cuh>
|
|
#include <deep_gemm/impls/sm100_fp8_fp4_gemm_1d1d.cuh>
|
|
|
|
// Attention kernels
|
|
#include <deep_gemm/impls/sm90_fp8_mqa_logits.cuh>
|
|
#include <deep_gemm/impls/sm90_fp8_paged_mqa_logits.cuh>
|
|
#include <deep_gemm/impls/sm100_fp4_mqa_logits.cuh>
|
|
#include <deep_gemm/impls/sm100_fp8_mqa_logits.cuh>
|
|
#include <deep_gemm/impls/sm100_fp4_paged_mqa_logits.cuh>
|
|
#include <deep_gemm/impls/sm100_fp8_paged_mqa_logits.cuh>
|
|
|
|
// Einsum kernels
|
|
#include <deep_gemm/impls/sm90_bmk_bnk_mn.cuh>
|
|
#include <deep_gemm/impls/sm100_bmk_bnk_mn.cuh>
|
|
|
|
// Hyperconnection kernels
|
|
#include <deep_gemm/impls/sm90_tf32_hc_prenorm_gemm.cuh>
|
|
#include <deep_gemm/impls/sm100_tf32_hc_prenorm_gemm.cuh>
|
|
|
|
// Layout kernels
|
|
#include <deep_gemm/impls/smxx_layout.cuh>
|
|
#include <deep_gemm/impls/smxx_clean_logits.cuh>
|
|
|
|
// Mega kernels
|
|
#include <deep_gemm/impls/sm100_fp8_fp4_mega_moe.cuh>
|
|
|
|
// Make every name declared in namespace deep_gemm usable without qualification
// for the rest of this file.
// NOTE(review): nothing visible below refers to a deep_gemm name; presumably
// this is kept for tooling/symbol resolution -- confirm before removing.
using namespace deep_gemm;
|
|
|
|
// Program entry point.
//
// Does no work: takes no arguments, calls nothing, and reports success by
// returning 0.
// NOTE(review): this looks like a placeholder so that the translation unit,
// whose only real content is the kernel header includes above, has an entry
// point to build against -- confirm against the build setup.
int main() {
    return 0;
}
|