Signed-off-by: Siyuan Fu <siyuanf@nvidia.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
117 lines
5.2 KiB
CMake
117 lines
5.2 KiB
CMake
# vllm-flash-attn reads its target architectures from VLLM_GPU_ARCHES, which
# must use CMake's architecture syntax (75-real, 89-virtual, etc). In the CUDA
# case the arches are cleared (gencodes are set on a per-file basis instead),
# so the list has to be populated manually here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
  foreach(cuda_arch ${CUDA_ARCHS})
    # Drop the dot from the version and tag it as a real-architecture entry,
    # e.g. "8.9" becomes "89-real".
    string(REPLACE "." "" cuda_arch "${cuda_arch}")
    list(APPEND VLLM_GPU_ARCHES "${cuda_arch}-real")
  endforeach()
endif()

#
# Build vLLM flash attention from source
#
# IMPORTANT: this has to be the last thing we do. vllm-flash-attn uses the same
# macros/functions as vLLM, and because functions all belong to the global
# scope, vllm-flash-attn's definitions overwrite vLLM's. They should be
# identical, but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm/ to make sure the
# library gets installed in the correct place. To install only
# vllm-flash-attn, pass one of:
#   --component _vllm_fa2_C          (FA2)
#   --component _vllm_fa3_C          (FA3)
#   --component _vllm_fa4_cutedsl_C  (FA4 CuteDSL Python files)
# If no component is specified, vllm-flash-attn is still installed.

# When VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is taken from that
# directory instead of being downloaded, which enables local development of
# vllm-flash-attn within vLLM. It can be set as an environment variable or
# passed as a CMake argument; the environment variable takes precedence.
if(DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()

# Declare where vllm-flash-attn comes from: the pinned commit of the
# vllm-project fork by default, or the local checkout when
# VLLM_FLASH_ATTN_SRC_DIR is set.
if(NOT VLLM_FLASH_ATTN_SRC_DIR)
  FetchContent_Declare(
    vllm-flash-attn
    GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
    GIT_TAG f5bc33cfc02c744d24a2e9d50e6db656de40611c
    GIT_PROGRESS TRUE
    # Don't share the vllm-flash-attn build between build types
    BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
  )
else()
  FetchContent_Declare(
    vllm-flash-attn
    SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR}
    BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
  )
endif()

# Nest the vllm-flash-attn install rules under vllm/ by rewriting
# CMAKE_INSTALL_PREFIX in the generated install script for the duration of
# FA's rules. ALL_COMPONENTS makes the save/modify/restore run exactly once no
# matter how many components are being installed, which avoids appending
# /vllm/ twice.
install(CODE "
  set(CMAKE_INSTALL_LOCAL_ONLY FALSE)
  set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")
  set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")
" ALL_COMPONENTS)

# Fetch the vllm-flash-attn library; its install rules are emitted between the
# two install(CODE) scripts around this call.
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")

# Put the original install prefix back once FA's install rules have run.
# NOTE(review): CMAKE_INSTALL_LOCAL_ONLY is forced to TRUE here rather than
# restored to its earlier value - confirm that is intended.
install(CODE "
  set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")
  set(CMAKE_INSTALL_LOCAL_ONLY TRUE)
" ALL_COMPONENTS)

# The FA2 and FA3 components both ship the same shared Python files, so the
# identical pair of install rules is registered once per component.
foreach(fa_component IN ITEMS _vllm_fa2_C _vllm_fa3_C)
  # Make sure vllm/vllm_flash_attn exists before anything is copied into it.
  install(
    CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")"
    COMPONENT ${fa_component}
  )

  # Copy the vllm_flash_attn *.py files, leaving out __init__.py and
  # flash_attn_interface.py because those are source-controlled in vllm.
  install(
    DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
    DESTINATION vllm/vllm_flash_attn
    COMPONENT ${fa_component}
    FILES_MATCHING
    PATTERN "*.py"
    PATTERN "__init__.py" EXCLUDE
    PATTERN "flash_attn_interface.py" EXCLUDE
  )
endforeach()

#
# FA4 CuteDSL component
#
# This is a Python-only component: it makes the flash_attn/cute directory
# available as vllm/vllm_flash_attn/cute and, when copying, rewrites imports to
# match our package structure. The custom target below has no commands; it only
# provides a target named after the component.
#
add_custom_target(_vllm_fa4_cutedsl_C)

# Install the flash_attn/cute directory (needed for FA4).
#
# With a local source dir (VLLM_FLASH_ATTN_SRC_DIR) a symlink is created, so
# edits to the cute-dsl Python files take effect immediately without
# rebuilding. Otherwise the files are copied and every "flash_attn.cute" is
# rewritten to "vllm.vllm_flash_attn.cute".
if(VLLM_FLASH_ATTN_SRC_DIR)
  # Whatever a previous install left at the link location (a copied directory
  # or an older link) is removed before the link is created.
  install(CODE "
    set(LINK_TARGET \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute\")
    set(LINK_NAME \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn/cute\")
    file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")
    file(REMOVE_RECURSE \"\${LINK_NAME}\")
    file(CREATE_LINK \"\${LINK_TARGET}\" \"\${LINK_NAME}\" SYMBOLIC)
  " COMPONENT _vllm_fa4_cutedsl_C)
else()
  # An earlier install made with VLLM_FLASH_ATTN_SRC_DIR leaves a symlink at
  # the destination. Writing through it would overwrite the linked source
  # checkout with import-rewritten files, so the link is removed first. A real
  # directory from an earlier copy install is left in place and overwritten
  # file by file.
  install(CODE "
    set(CUTE_SRC_DIR \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute\")
    set(CUTE_DST_DIR \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn/cute\")
    if(IS_SYMLINK \"\${CUTE_DST_DIR}\")
      file(REMOVE_RECURSE \"\${CUTE_DST_DIR}\")
    endif()
    file(GLOB_RECURSE CUTE_PY_FILES \"\${CUTE_SRC_DIR}/*.py\")
    foreach(SRC_FILE \${CUTE_PY_FILES})
      file(RELATIVE_PATH REL_PATH \"\${CUTE_SRC_DIR}\" \"\${SRC_FILE}\")
      set(DST_FILE \"\${CUTE_DST_DIR}/\${REL_PATH}\")
      get_filename_component(DST_DIR \"\${DST_FILE}\" DIRECTORY)
      file(MAKE_DIRECTORY \"\${DST_DIR}\")
      file(READ \"\${SRC_FILE}\" FILE_CONTENTS)
      string(REPLACE \"flash_attn.cute\" \"vllm.vllm_flash_attn.cute\" FILE_CONTENTS \"\${FILE_CONTENTS}\")
      file(WRITE \"\${DST_FILE}\" \"\${FILE_CONTENTS}\")
    endforeach()
  " COMPONENT _vllm_fa4_cutedsl_C)
endif()