[ROCm][Bugfix] fix cache block size mismatch for aiter unified attention (#37606)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
@@ -665,7 +665,6 @@ class RocmPlatform(Platform):
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         from vllm.config.compilation import CUDAGraphMode

-        cache_config = vllm_config.cache_config
         compilation_config = vllm_config.compilation_config
         parallel_config = vllm_config.parallel_config

@@ -687,32 +686,9 @@ class RocmPlatform(Platform):
             )
             compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE

-        if cache_config and not cache_config.user_specified_block_size:
-            if (
-                envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION and envs.VLLM_ROCM_USE_AITER
-                # NOTE: This block has been deprecated
-                # or get_env_variable_attn_backend()
-                # == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN
-                # TODO: monitor https://github.com/vllm-project/vllm/pull/30396
-                # to see how we can transition to the new way of selecting
-                # attention backends
-            ):
-                cache_config.block_size = 64
-                logger.warning(
-                    "[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64."
-                )
-            else:
-                cache_config.block_size = 16
-
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"

-    @classmethod
-    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
-        # TODO: ROCm still sets block_size in check_and_update_config.
-        # Move that logic here so block_size is chosen by the backend.
-        pass
-
     @classmethod
     def verify_model_arch(cls, model_arch: str) -> None:
         if model_arch in _ROCM_UNSUPPORTED_MODELS:
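For context on the removal above: the deleted branch pinned cache_config.block_size from the VLLM_ROCM_USE_AITER / VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION flags alone, before the attention backend was actually resolved, which is how the cache block size could end up mismatched with the backend that is ultimately used. A minimal sketch of that removed decision, assuming a simplified stand-in CacheConfig and direct os.environ parsing (both illustrative, not vLLM's real objects):

import os
from dataclasses import dataclass


@dataclass
class CacheConfig:
    # Simplified stand-in for the fields of vllm.config.CacheConfig used here.
    block_size: int = 16
    user_specified_block_size: bool = False


def legacy_rocm_block_size(cache_config: CacheConfig) -> None:
    # Shape of the deleted branch: the KV-cache block size is forced from
    # environment flags alone, before any attention backend is selected.
    if not cache_config.user_specified_block_size:
        use_aiter = os.environ.get("VLLM_ROCM_USE_AITER", "0") == "1"
        use_unified = os.environ.get(
            "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION", "0") == "1"
        cache_config.block_size = 64 if (use_aiter and use_unified) else 16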
@@ -29,6 +29,13 @@ class RocmAiterUnifiedAttentionBackend(RocmAttentionBackend):
     def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
         return [MultipleOf(16)]

+    @classmethod
+    def get_preferred_block_size(cls, default_block_size: int) -> int:
+        logger.warning_once(
+            "[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64."
+        )
+        return 64
+
     @classmethod
     def supports_block_size(cls, block_size: int | None) -> bool:
         if block_size is None:
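With this hunk the backend states its own preference via get_preferred_block_size, alongside the existing supports_block_size / get_supported_kernel_block_sizes hooks. A rough sketch of how such hooks can be combined when the user has not pinned a block size; resolve_cache_block_size and FakeAiterUnifiedBackend are hypothetical illustrations, not vLLM's engine code:

def resolve_cache_block_size(
    backend: type,
    user_block_size: int | None,
    default_block_size: int = 16,
) -> int:
    # Respect an explicit user choice as long as the backend accepts it.
    if user_block_size is not None:
        if not backend.supports_block_size(user_block_size):
            raise ValueError(
                f"block_size={user_block_size} is not supported by "
                f"{backend.__name__}"
            )
        return user_block_size
    # Otherwise let the backend state its preference; after this change the
    # ROCm AITER unified attention backend answers 64 here.
    return backend.get_preferred_block_size(default_block_size)


class FakeAiterUnifiedBackend:
    # Toy stand-in mirroring the hooks touched in the diff above.
    @classmethod
    def supports_block_size(cls, block_size: int | None) -> bool:
        return block_size is None or block_size % 16 == 0

    @classmethod
    def get_preferred_block_size(cls, default_block_size: int) -> int:
        return 64


assert resolve_cache_block_size(FakeAiterUnifiedBackend, None) == 64
assert resolve_cache_block_size(FakeAiterUnifiedBackend, 32) == 32

With the platform-level override removed, the 64-block preference only applies when this backend is actually selected, which is presumably the point of moving the choice out of check_and_update_config.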