Triton Attention: Support cross-layers blocks (#30687)

Signed-off-by: Or Ozeri <oro@il.ibm.com>
This commit is contained in:
Or Ozeri
2026-01-05 21:29:16 +02:00
committed by GitHub
parent 21156ff199
commit d8e38d4939
2 changed files with 14 additions and 3 deletions

View File

@@ -15,12 +15,10 @@ from vllm.distributed.kv_events import BlockStored, KVEventBatch
from vllm.platforms import current_platform
# Block sizes used when the KV cache lives on CPU.
CPU_BLOCK_SIZES = [48]

# Attention backends exercised by the tests. TRITON_ATTN is assumed to be
# available on every platform, so it is part of the baseline list.
# (The previous duplicate assignment of ["FLASH_ATTN"] was dead code —
# it was immediately overwritten — and has been removed.)
ATTN_BACKENDS = ["FLASH_ATTN", "TRITON_ATTN"]
if current_platform.is_cuda():
    # FLASHINFER is CUDA-only, so add it on top of the baseline backends.
    ATTN_BACKENDS.append("FLASHINFER")
elif current_platform.is_rocm():
    # On ROCm only the Triton backend is supported; replace the list
    # entirely rather than appending.
    ATTN_BACKENDS = ["TRITON_ATTN"]
class MockSubscriber: