Triton Attention: Support cross-layers blocks (#30687)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
This commit is contained in:
@@ -15,12 +15,10 @@ from vllm.distributed.kv_events import BlockStored, KVEventBatch
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
CPU_BLOCK_SIZES = [48]
|
||||
ATTN_BACKENDS = ["FLASH_ATTN"]
|
||||
ATTN_BACKENDS = ["FLASH_ATTN", "TRITON_ATTN"]
|
||||
|
||||
if current_platform.is_cuda():
|
||||
    ATTN_BACKENDS.append("FLASHINFER")
|
||||
elif current_platform.is_rocm():
|
||||
    ATTN_BACKENDS = ["TRITON_ATTN"]
|
||||
|
||||
|
||||
class MockSubscriber:
|
||||
|
||||
Reference in New Issue
Block a user