Remove V0 attention backends (#25351)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
@@ -513,10 +513,6 @@ def make_backend(backend_name: str) -> AttentionBackend:
     Construct the backend instance determined by the backend_name string
     argument.
 
-    "XFORMERS" -> construct xformers backend
-
-    TODO: other backends
-
     Note: at time of writing the Attention wrapper automatically selects
     its own backend for Attention.forward(); so the backend instance which
     you generate with this function is not meant to be used for *running*
@@ -528,18 +524,68 @@ def make_backend(backend_name: str) -> AttentionBackend:
 
     * Backend instance
     '''
-    if backend_name == STR_XFORMERS_ATTN_VAL:
-        # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
-        from vllm.attention.backends.xformers import XFormersBackend
-        return XFormersBackend()
-    elif backend_name == STR_FLASH_ATTN_VAL:
-        from vllm.attention.backends.flash_attn import FlashAttentionBackend
+    if backend_name in (STR_XFORMERS_ATTN_VAL, "XFORMERS_VLLM_V1"):
+        from vllm.v1.attention.backends.xformers import (
+            XFormersAttentionBackend)
+        return XFormersAttentionBackend()
+    if backend_name in (STR_FLASH_ATTN_VAL, "FLASH_ATTN_VLLM_V1"):
+        from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
         return FlashAttentionBackend()
+    if backend_name == "TRITON_ATTN_VLLM_V1":
+        from vllm.v1.attention.backends.triton_attn import (
+            TritonAttentionBackend)
+        return TritonAttentionBackend()
+    if backend_name == "FLEX_ATTENTION":
+        from vllm.v1.attention.backends.flex_attention import (
+            FlexAttentionBackend)
+        return FlexAttentionBackend()
+    if backend_name in ("TORCH_SDPA", "TORCH_SDPA_VLLM_V1"):
+        from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend
+        return TorchSDPABackend()
+    if backend_name == "FLASHINFER":
+        from vllm.v1.attention.backends.flashinfer import FlashInferBackend
+        return FlashInferBackend()
 
     raise AssertionError(
         f"Unrecognized backend_name {backend_name} for unit test")
 
 
+def make_alibi_bias(
+    alibi_slopes: torch.Tensor,
+    num_kv_heads: int,
+    dtype: torch.dtype,
+    seq_lens: list[int],
+) -> list[Any]:
+    """Create ALiBi biases compatible with xFormers attention tests."""
+    from xformers.ops.fmha.attn_bias import LowerTriangularMaskWithTensorBias
+
+    if alibi_slopes is None:
+        return [None for _ in seq_lens]
+
+    attn_biases: list[Any] = []
+    num_heads = alibi_slopes.shape[0]
+    assert num_heads >= num_kv_heads, (
+        "ALiBi slopes expect at least as many heads as KV heads")
+
+    for seq_len in seq_lens:
+        bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device)
+        bias = bias[None, :] - bias[:, None]
+
+        padded_len = (seq_len + 7) // 8 * 8
+        bias_tensor = torch.empty(
+            1,
+            num_heads,
+            seq_len,
+            padded_len,
+            device=alibi_slopes.device,
+            dtype=dtype,
+        )[:, :, :, :seq_len].copy_(bias)
+        bias_tensor.mul_(alibi_slopes[:, None, None])
+        attn_biases.append(LowerTriangularMaskWithTensorBias(bias_tensor))
+
+    return attn_biases
+
+
 def _make_metadata_tensors(
     seq_lens: Optional[list[int]],
     context_lens: Optional[list[int]],
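
For illustration only (not part of this commit), a kernel test could exercise the updated factory roughly as follows. The module path tests.kernels.utils and the pytest structure are assumptions; the backend names are the V1 selector strings handled in the diff above, and constructing a given backend assumes its package (e.g. flashinfer) is importable.

# Hypothetical test sketch: construct each V1 backend by name and confirm
# that an unrecognized name still hits the AssertionError in make_backend.
import pytest

from tests.kernels.utils import make_backend  # assumed module path

V1_BACKEND_NAMES = [
    "XFORMERS_VLLM_V1",
    "FLASH_ATTN_VLLM_V1",
    "TRITON_ATTN_VLLM_V1",
    "FLEX_ATTENTION",
    "TORCH_SDPA_VLLM_V1",
    "FLASHINFER",
]


@pytest.mark.parametrize("name", V1_BACKEND_NAMES)
def test_make_backend_returns_instance(name: str) -> None:
    # The instance is meant for building metadata structures in kernel
    # tests, not for running inference (see the docstring above).
    assert make_backend(name) is not None


def test_make_backend_rejects_unknown_name() -> None:
    with pytest.raises(AssertionError):
        make_backend("NOT_A_REAL_BACKEND")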
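
Similarly, a rough sketch of how the biases returned by make_alibi_bias might be consumed. The shapes, dtype, the assumed CUDA device with xformers installed, and the direct call to xformers.ops.memory_efficient_attention are illustrative assumptions rather than part of the commit.

# Illustrative only: feed one per-sequence ALiBi bias to xFormers attention.
import torch
import xformers.ops as xops

from tests.kernels.utils import make_alibi_bias  # assumed module path

seq_len, num_heads, head_size = 16, 4, 32
dtype = torch.float16
slopes = torch.rand(num_heads, device="cuda", dtype=dtype)

# xFormers expects (batch, seq_len, num_heads, head_size) inputs.
q = torch.randn(1, seq_len, num_heads, head_size, device="cuda", dtype=dtype)
k = torch.randn_like(q)
v = torch.randn_like(q)

# make_alibi_bias returns one LowerTriangularMaskWithTensorBias per sequence.
biases = make_alibi_bias(slopes, num_kv_heads=num_heads, dtype=dtype,
                         seq_lens=[seq_len])
out = xops.memory_efficient_attention(q, k, v, attn_bias=biases[0])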