From 4ca3fa6bb4633fed1196292f764ce8cf13f647b5 Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Thu, 19 Mar 2026 20:00:08 -0400
Subject: [PATCH] [ROCm][Bugfix] fix cache block size mismatch for aiter
 unified attention (#37606)

Signed-off-by: Divakar Verma
---
 vllm/platforms/rocm.py                        | 24 -------------------
 .../backends/rocm_aiter_unified_attn.py       |  7 ++++++
 2 files changed, 7 insertions(+), 24 deletions(-)

diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 329445d37..3c5f8a079 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -665,7 +665,6 @@ class RocmPlatform(Platform):
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         from vllm.config.compilation import CUDAGraphMode
 
-        cache_config = vllm_config.cache_config
         compilation_config = vllm_config.compilation_config
         parallel_config = vllm_config.parallel_config
 
@@ -687,32 +686,9 @@ class RocmPlatform(Platform):
                 )
                 compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
-        if cache_config and not cache_config.user_specified_block_size:
-            if (
-                envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION and envs.VLLM_ROCM_USE_AITER
-                # NOTE: This block has been deprecated
-                # or get_env_variable_attn_backend()
-                # == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN
-                # TODO: monitor https://github.com/vllm-project/vllm/pull/30396
-                # to see how we can transition to the new way of selecting
-                # attention backends
-            ):
-                cache_config.block_size = 64
-                logger.warning(
-                    "[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64."
-                )
-            else:
-                cache_config.block_size = 16
-
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
 
-    @classmethod
-    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
-        # TODO: ROCm still sets block_size in check_and_update_config.
-        # Move that logic here so block_size is chosen by the backend.
-        pass
-
     @classmethod
     def verify_model_arch(cls, model_arch: str) -> None:
         if model_arch in _ROCM_UNSUPPORTED_MODELS:
diff --git a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
index acf223780..bd7f137f9 100644
--- a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
+++ b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
@@ -29,6 +29,13 @@ class RocmAiterUnifiedAttentionBackend(RocmAttentionBackend):
     def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
         return [MultipleOf(16)]
 
+    @classmethod
+    def get_preferred_block_size(cls, default_block_size: int) -> int:
+        logger.warning_once(
+            "[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64."
+        )
+        return 64
+
     @classmethod
     def supports_block_size(cls, block_size: int | None) -> bool:
         if block_size is None:
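
Note: the removed TODO ("Move that logic here so block_size is chosen by the
backend") is what this patch completes; the attention backend, not the
platform, now declares its preferred KV cache block size. Below is a minimal,
self-contained sketch of how such a hook can be consulted. DefaultBackend and
resolve_block_size are illustrative assumptions, not vLLM's actual wiring;
only get_preferred_block_size(default_block_size) comes from the patch itself.

    # Sketch only: DefaultBackend and resolve_block_size are hypothetical
    # names; the real patch adds get_preferred_block_size to
    # RocmAiterUnifiedAttentionBackend.

    class DefaultBackend:
        @classmethod
        def get_preferred_block_size(cls, default_block_size: int) -> int:
            # Most backends accept whatever default the platform chose.
            return default_block_size

    class AiterUnifiedBackend(DefaultBackend):
        @classmethod
        def get_preferred_block_size(cls, default_block_size: int) -> int:
            # The AITER unified attention kernel expects 64-token KV cache
            # blocks, so it overrides the platform default.
            return 64

    def resolve_block_size(
        backend: type[DefaultBackend],
        user_specified: int | None,
        default: int = 16,
    ) -> int:
        # A user-specified block size always wins; otherwise the backend may
        # override the default (mirroring the user_specified_block_size guard
        # removed from rocm.py).
        if user_specified is not None:
            return user_specified
        return backend.get_preferred_block_size(default)

    assert resolve_block_size(AiterUnifiedBackend, None) == 64  # backend override
    assert resolve_block_size(AiterUnifiedBackend, 32) == 32    # user wins
    assert resolve_block_size(DefaultBackend, None) == 16       # platform default

A user-specified block size still takes precedence under this scheme, matching
the user_specified_block_size check that the old rocm.py code applied before
forcing the block size to 64 or 16.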