[Kernel] Layernorm performance optimization (#3662)

This commit is contained in:
mawong-amd
2024-03-30 14:26:38 -07:00
committed by GitHub
parent 51c31bc10c
commit b6d103542c
4 changed files with 285 additions and 47 deletions

View File

@@ -100,6 +100,11 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2")
list(REMOVE_ITEM GPU_FLAGS
"-D__CUDA_NO_HALF_OPERATORS__"
"-D__CUDA_NO_HALF_CONVERSIONS__"
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
"-D__CUDA_NO_HALF2_OPERATORS__")
endif()
elseif(${GPU_LANG} STREQUAL "HIP")