[ROCm][CI] v1 cpu offloading attention backend fix (#31833)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
Andreas Karatzas
2026-01-08 00:37:50 -06:00
committed by GitHub
parent 6b2a672e47
commit 5f2a473ff3

View File

@@ -15,10 +15,12 @@ from vllm.distributed.kv_events import BlockStored, KVEventBatch
from vllm.platforms import current_platform
# CPU KV-cache block sizes exercised by the offloading tests.
CPU_BLOCK_SIZES = [48]

# Attention backends to parametrize the tests over, selected per platform.
# CUDA can run FLASH_ATTN, FLASHINFER and TRITON_ATTN; ROCm is restricted
# to TRITON_ATTN (the motivation for this commit — the other backends are
# not available there). On any other platform the list stays empty and the
# parametrized tests are effectively skipped.
ATTN_BACKENDS: list[str] = []
if current_platform.is_cuda():
    ATTN_BACKENDS = ["FLASH_ATTN", "FLASHINFER", "TRITON_ATTN"]
elif current_platform.is_rocm():
    ATTN_BACKENDS = ["TRITON_ATTN"]
class MockSubscriber: