[ROCM][KERNEL] Paged attention for V1 (#15720)
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com> Signed-off-by: root <root@banff-cyxtera-s65-4.amd.com> Co-authored-by: Aleksandr Malyshev <maleksan@amd.com> Co-authored-by: root <root@banff-cyxtera-s65-4.amd.com>
This commit is contained in:
committed by
GitHub
parent
bd7599d34a
commit
e73ff24e31
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import os
|
||||
from functools import lru_cache, wraps
|
||||
from functools import cache, lru_cache, wraps
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional
|
||||
|
||||
import torch
|
||||
@@ -98,6 +98,25 @@ def device_id_to_physical_device_id(device_id: int) -> int:
|
||||
return device_id
|
||||
|
||||
|
||||
@cache
def use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int,
                                    block_size: int, gqa_ratio: int,
                                    max_seq_len: int,
                                    sliding_window: int) -> bool:
    """Decide whether the ROCm custom paged-attention path may be used.

    The decision is memoized per argument combination by
    ``functools.cache``, so the device query and the environment flag are
    only read the first time a given combination is seen.

    Args:
        qtype: Query dtype; must be ``torch.half`` or ``torch.bfloat16``.
        head_size: Must be 64 or 128.
        block_size: Must be 16 or 32.
        gqa_ratio: Must lie in the inclusive range 1..16.
        max_seq_len: Must not exceed 32768.
        sliding_window: Must be 0.

    Returns:
        ``False`` as soon as any requirement is not met; otherwise the
        value of ``envs.VLLM_ROCM_CUSTOM_PAGED_ATTN``.
    """
    arch_name = torch.cuda.get_device_properties("cuda").gcnArchName
    is_navi = "gfx1" in arch_name
    # gfx90a / gfx942 are the MI250 / MI300 architecture tags.
    is_mi250_mi300 = any(tag in arch_name for tag in ("gfx90a", "gfx942"))

    # The ROCm custom paged attention is not supported on Navi (gfx1*).
    if not is_mi250_mi300 or is_navi:
        return False
    if sliding_window != 0:
        return False
    if qtype not in (torch.half, torch.bfloat16):
        return False
    if head_size not in (64, 128):
        return False
    if block_size not in (16, 32):
        return False
    if not (1 <= gqa_ratio <= 16) or max_seq_len > 32768:
        return False

    # Every hardware/shape requirement holds; the env flag has the last word.
    return envs.VLLM_ROCM_CUSTOM_PAGED_ATTN
|
||||
|
||||
|
||||
class RocmPlatform(Platform):
|
||||
_enum = PlatformEnum.ROCM
|
||||
device_name: str = "rocm"
|
||||
|
||||
Reference in New Issue
Block a user