diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py index b9b32d07b..44fa2962a 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -11,6 +11,7 @@ from vllm.attention.backends.abstract import ( AttentionBackend, AttentionImpl, AttentionType, + MultipleOf, ) from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode from vllm.attention.ops.paged_attn import PagedAttention @@ -158,6 +159,13 @@ class RocmAttentionBackend(AttentionBackend): torch.float32, ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + # ROCm paged attention kernel only supports block sizes 16 and 32 + # due to shared memory (LDS) constraints on AMD GPUs. + # See csrc/rocm/attention.cu CALL_CUSTOM_LAUNCHER_BLK macro. + return [16, 32] + @classmethod def get_supported_head_sizes(cls) -> list[int]: return [32, 64, 96, 128, 160, 192, 224, 256]