diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py
index 1ac5e5b8c..239f3b5d4 100644
--- a/tests/v1/kv_offload/test_cpu_offloading.py
+++ b/tests/v1/kv_offload/test_cpu_offloading.py
@@ -15,12 +15,10 @@ from vllm.distributed.kv_events import BlockStored, KVEventBatch
 from vllm.platforms import current_platform
 
 CPU_BLOCK_SIZES = [48]
-ATTN_BACKENDS = ["FLASH_ATTN"]
+ATTN_BACKENDS = ["FLASH_ATTN", "TRITON_ATTN"]
 
 if current_platform.is_cuda():
     ATTN_BACKENDS.append("FLASHINFER")
-elif current_platform.is_rocm():
-    ATTN_BACKENDS = ["TRITON_ATTN"]
 
 
 class MockSubscriber:
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 8cf363d59..b3dfc55cd 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -290,6 +290,19 @@ class TritonAttentionBackend(AttentionBackend):
             raise ValueError("Block size must be a multiple of 16.")
         return (num_blocks, 2, block_size, num_kv_heads, head_size)
 
+    @staticmethod
+    def get_kv_cache_stride_order(
+        include_num_layers_dimension: bool = False,
+    ) -> tuple[int, ...]:
+        # `stride_order` indicates the permutation that gets
+        # us from `get_kv_cache_shape` to the actual memory layout we want.
+        if include_num_layers_dimension:
+            # (num_blocks, num_layers, 2, block_size, num_kv_heads, head_size)
+            return (1, 0, 2, 3, 4, 5)
+
+        # (num_blocks, 2, block_size, num_kv_heads, head_size)
+        return (0, 1, 2, 3, 4)
+
     @staticmethod
     def use_cascade_attention(*args, **kwargs) -> bool:
         return False
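
Note (not part of the diff): a minimal, self-contained sketch of how a stride order like the one added above is typically consumed — the buffer is allocated contiguously in the permuted physical order, then viewed back in the logical shape. The tensor sizes and the assumption that the layers dimension is the leading logical axis are for illustration only, not taken from vLLM.

    import torch

    # Hypothetical sizes, for illustration only.
    num_layers, num_blocks, block_size, num_kv_heads, head_size = 3, 4, 16, 2, 8

    # Assumed logical shape: get_kv_cache_shape with a layers axis prepended.
    logical = (num_layers, num_blocks, 2, block_size, num_kv_heads, head_size)
    stride_order = (1, 0, 2, 3, 4, 5)  # as returned for include_num_layers_dimension=True

    # The physical layout puts blocks outermost, so each block stores all of
    # its layers contiguously -- convenient for block-granular CPU offloading.
    physical = tuple(logical[d] for d in stride_order)
    inverse = [stride_order.index(d) for d in range(len(stride_order))]
    kv_cache = torch.zeros(physical).permute(*inverse)

    assert kv_cache.shape == torch.Size(logical)
    assert kv_cache[:, 0].is_contiguous()  # all layers of block 0 in one slab

With the identity order (0, 1, 2, 3, 4) returned in the no-layers case, the allocation order and the logical shape coincide, so no permutation is needed.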