Deprecate the V0 VLLM_USE_TRITON_FLASH_ATTN environment variable (#27611)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
Signed-off-by: Andreas Karatzas <Andreas.Karatzas@amd.com>
@@ -49,25 +49,8 @@ _ROCM_UNSUPPORTED_MODELS: list[str] = []
 
 # Models partially supported by ROCm.
 # Architecture -> Reason.
-_ROCM_SWA_REASON = (
-    "Sliding window attention (SWA) is not yet supported in "
-    "Triton flash attention. For half-precision SWA support, "
-    "please use CK flash attention by setting "
-    "`VLLM_USE_TRITON_FLASH_ATTN=0`"
-)
-_ROCM_PARTIALLY_SUPPORTED_MODELS: dict[str, str] = {
-    "Qwen2ForCausalLM": _ROCM_SWA_REASON,
-    "MistralForCausalLM": _ROCM_SWA_REASON,
-    "MixtralForCausalLM": _ROCM_SWA_REASON,
-    "PaliGemmaForConditionalGeneration": (
-        "ROCm flash attention does not yet fully support 32-bit precision on PaliGemma"
-    ),
-    "Phi3VForCausalLM": (
-        "ROCm Triton flash attention may run into compilation errors due to "
-        "excessive use of shared memory. If this happens, disable Triton FA "
-        "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`"
-    ),
-}
+_ROCM_SWA_REASON = ()
+_ROCM_PARTIALLY_SUPPORTED_MODELS: dict[str, str] = {
 _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = {
     "0x74a0": "AMD_Instinct_MI300A",
     "0x74a1": "AMD_Instinct_MI300X",
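For context, below is a minimal sketch of how a partial-support table like _ROCM_PARTIALLY_SUPPORTED_MODELS might be consulted when a model is loaded on ROCm. The helper name warn_if_partially_supported and the warning flow are illustrative assumptions, not vLLM's actual API; the table entry abbreviates wording from the code removed above.

    import warnings

    # Illustrative table modeled on the one emptied by this commit; the entry
    # text is abbreviated from the removed code above.
    _ROCM_PARTIALLY_SUPPORTED_MODELS: dict[str, str] = {
        "Qwen2ForCausalLM": (
            "Sliding window attention (SWA) is not yet supported in "
            "Triton flash attention."
        ),
    }

    def warn_if_partially_supported(architecture: str) -> None:
        # Look up the architecture and emit a warning if it is only partially
        # supported on ROCm; unknown architectures pass through silently.
        reason = _ROCM_PARTIALLY_SUPPORTED_MODELS.get(architecture)
        if reason is not None:
            warnings.warn(
                f"Model architecture {architecture} is partially supported "
                f"by ROCm: {reason}"
            )

    warn_if_partially_supported("Qwen2ForCausalLM")  # emits a UserWarning

With the V0 variable deprecated, the guidance in these messages (setting VLLM_USE_TRITON_FLASH_ATTN=0) no longer applies, which is why the commit empties the table rather than rewording the entries.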