From 5bda27244b78f4993506a7ce215d13b22b307991 Mon Sep 17 00:00:00 2001 From: Chenggang Zhao Date: Thu, 10 Apr 2025 09:52:15 +0800 Subject: [PATCH] Add CMake support for CLion indexing --- CMakeLists.txt | 44 ++++++++++++++++++++++++ deep_gemm/include/deep_gemm/fp8_gemm.cuh | 2 +- indexing/main.cu | 29 ++++++++++++++++ 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 CMakeLists.txt create mode 100644 indexing/main.cu diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..658aa7b --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,44 @@ +# NOTES: current just for CMake-based IDE (e.g. CLion) indexing, the real compilation is done via JIT +# TODO: add CUDA utils' library via CMake +cmake_minimum_required(VERSION 3.10) +project(deep_gemm LANGUAGES CXX CUDA) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CUDA_STANDARD 20) +set(CMAKE_VERBOSE_MAKEFILE ON) + +find_package(CUDAToolkit REQUIRED) +find_package(pybind11 REQUIRED) + +file(WRITE ${CMAKE_BINARY_DIR}/test_cuda.cu "extern \"C\" __global__ void testKernel() { }") +execute_process( + COMMAND ${CUDA_NVCC_EXECUTABLE} ${CMAKE_CUDA_FLAGS} -gencode arch=compute_90a,code=sm_90a -o ${CMAKE_BINARY_DIR}/test_cuda.o -c ${CMAKE_BINARY_DIR}/test_cuda.cu + RESULT_VARIABLE NVCC_RESULT + OUTPUT_VARIABLE NVCC_OUTPUT + ERROR_VARIABLE NVCC_ERROR_OUTPUT + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} +) + +if (NVCC_RESULT EQUAL "0") + set(NVCC_SUPPORTS_SM90 TRUE) + message(STATUS "NVCC supports SM90") +else() + message(STATUS "NVCC does not support SM90") +endif() + +if (NVCC_SUPPORTS_SM90) + set(TORCH_CUDA_ARCH_LIST "8.6" CACHE STRING "Add arch tag 90a to NVCC" FORCE) + list(APPEND CUDA_NVCC_FLAGS "-gencode;arch=compute_90a,code=sm_90a") +endif() +find_package(Torch REQUIRED) + +include_directories(deep_gemm/include third-party/cutlass/include third-party/cutlass/tools/util/include) +include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include ${TORCH_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS}) +link_directories(${TORCH_INSTALL_PREFIX}/lib ${CUDA_TOOLKIT_ROOT_DIR}/lib) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -fPIC") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -fPIC -DNDEBUG") +set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3 -std=c++17 -DNDEBUG --ptxas-options=--register-usage-level=10") + +cuda_add_library(example_gemm STATIC indexing/main.cu) diff --git a/deep_gemm/include/deep_gemm/fp8_gemm.cuh b/deep_gemm/include/deep_gemm/fp8_gemm.cuh index 2523435..9ba930f 100644 --- a/deep_gemm/include/deep_gemm/fp8_gemm.cuh +++ b/deep_gemm/include/deep_gemm/fp8_gemm.cuh @@ -1,6 +1,6 @@ +#pragma once #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunknown-attributes" -#pragma once #include #include diff --git a/indexing/main.cu b/indexing/main.cu new file mode 100644 index 0000000..1e16bee --- /dev/null +++ b/indexing/main.cu @@ -0,0 +1,29 @@ +#include "deep_gemm/fp8_gemm.cuh" + +using namespace deep_gemm; + +int main() { + int m = 128; + constexpr int N = 4096; + constexpr int K = 7168; + + constexpr int BLOCK_M = 128; + constexpr int BLOCK_N = 128; + constexpr int BLOCK_K = 128; + constexpr int BLOCK_N_PADDING = 0; + constexpr int kNumGroups = 1; + constexpr int kNumStages = 5; + constexpr int kNumTMAMulticast = 1; + constexpr bool kIsTMAMulticastOnA = false; + + using gemm_t = Gemm; + auto tma_a_desc = gemm_t::make_2d_tma_a_desc(reinterpret_cast<__nv_fp8_e4m3*>(0), m); + auto tma_b_desc = gemm_t::make_2d_tma_b_desc(reinterpret_cast<__nv_fp8_e4m3*>(0)); + auto tma_scales_a_desc = gemm_t::make_2d_tma_scales_a_desc(reinterpret_cast(0), m); + auto tma_d_desc = gemm_t::make_3d_tma_d_desc(reinterpret_cast(0), m); + gemm_t::run(nullptr, nullptr, nullptr, + m, + tma_a_desc, tma_b_desc, tma_scales_a_desc, tma_d_desc, + nullptr, 132, 0); + return 0; +}