[ROCm][Bugfix] fix cache block size mismatch for aiter unified attention (#37606)

Signed-off-by: Divakar Verma <divakar.verma@amd.com>
Divakar Verma
2026-03-19 20:00:08 -04:00
committed by GitHub
parent be12afd284
commit 4ca3fa6bb4
2 changed files with 7 additions and 24 deletions


@@ -665,7 +665,6 @@ class RocmPlatform(Platform):
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         from vllm.config.compilation import CUDAGraphMode
-        cache_config = vllm_config.cache_config
         compilation_config = vllm_config.compilation_config
         parallel_config = vllm_config.parallel_config
@@ -687,32 +686,9 @@ class RocmPlatform(Platform):
                 )
                 compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
-        if cache_config and not cache_config.user_specified_block_size:
-            if (
-                envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION and envs.VLLM_ROCM_USE_AITER
-                # NOTE: This block has been deprecated
-                # or get_env_variable_attn_backend()
-                # == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN
-                # TODO: monitor https://github.com/vllm-project/vllm/pull/30396
-                # to see how we can transition to the new way of selecting
-                # attention backends
-            ):
-                cache_config.block_size = 64
-                logger.warning(
-                    "[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64."
-                )
-            else:
-                cache_config.block_size = 16
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
 
-    @classmethod
-    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
-        # TODO: ROCm still sets block_size in check_and_update_config.
-        # Move that logic here so block_size is chosen by the backend.
-        pass
-
     @classmethod
     def verify_model_arch(cls, model_arch: str) -> None:
         if model_arch in _ROCM_UNSUPPORTED_MODELS:


@@ -29,6 +29,13 @@ class RocmAiterUnifiedAttentionBackend(RocmAttentionBackend):
     def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
         return [MultipleOf(16)]
 
+    @classmethod
+    def get_preferred_block_size(cls, default_block_size: int) -> int:
+        logger.warning_once(
+            "[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64."
+        )
+        return 64
+
     @classmethod
     def supports_block_size(cls, block_size: int | None) -> bool:
         if block_size is None:
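
For context, a minimal sketch of the selection flow this commit moves from the platform into the backend. The resolve_block_size helper below is hypothetical (the real wiring lives in vLLM's cache-config resolution); only get_supported_kernel_block_sizes and get_preferred_block_size mirror the methods in the diff:

# Hedged sketch, not vLLM's actual code: resolve_block_size is a
# hypothetical stand-in for the engine's cache-config resolution.

class MultipleOf:
    """Marker: any block size that is a multiple of `base` is supported."""

    def __init__(self, base: int) -> None:
        self.base = base


class RocmAiterUnifiedAttentionBackend:
    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
        return [MultipleOf(16)]

    @classmethod
    def get_preferred_block_size(cls, default_block_size: int) -> int:
        # The aiter unified attention kernel wants 64-token KV cache
        # blocks; forcing 16 at the platform level caused the mismatch.
        return 64


def resolve_block_size(backend, user_block_size: int | None, default: int = 16) -> int:
    # A user-specified block size always wins; otherwise the backend,
    # not the platform, picks the size it prefers.
    if user_block_size is not None:
        return user_block_size
    return backend.get_preferred_block_size(default)


assert resolve_block_size(RocmAiterUnifiedAttentionBackend, None) == 64
assert resolve_block_size(RocmAiterUnifiedAttentionBackend, 32) == 32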