Remove V0 attention backends (#25351)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Author: Woosuk Kwon
Date: 2025-09-21 16:03:28 -07:00
Committed by: GitHub
Parent: af7dfb0d1a
Commit: bc6e542d9f

28 changed files with 143 additions and 7376 deletions


@@ -513,10 +513,6 @@ def make_backend(backend_name: str) -> AttentionBackend:
     Construct the backend instance determined by the backend_name string
     argument.
 
-    "XFORMERS" -> construct xformers backend
-
-    TODO: other backends
-
     Note: at time of writing the Attention wrapper automatically selects
     its own backend for Attention.forward(); so the backend instance which
     you generate with this function is not meant to be used for *running*
@@ -528,18 +524,68 @@ def make_backend(backend_name: str) -> AttentionBackend:
     * Backend instance
     '''
-    if backend_name == STR_XFORMERS_ATTN_VAL:
-        # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
-        from vllm.attention.backends.xformers import XFormersBackend
-        return XFormersBackend()
-    elif backend_name == STR_FLASH_ATTN_VAL:
-        from vllm.attention.backends.flash_attn import FlashAttentionBackend
+    if backend_name in (STR_XFORMERS_ATTN_VAL, "XFORMERS_VLLM_V1"):
+        from vllm.v1.attention.backends.xformers import (
+            XFormersAttentionBackend)
+        return XFormersAttentionBackend()
+    if backend_name in (STR_FLASH_ATTN_VAL, "FLASH_ATTN_VLLM_V1"):
+        from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
         return FlashAttentionBackend()
+    if backend_name == "TRITON_ATTN_VLLM_V1":
+        from vllm.v1.attention.backends.triton_attn import (
+            TritonAttentionBackend)
+        return TritonAttentionBackend()
+    if backend_name == "FLEX_ATTENTION":
+        from vllm.v1.attention.backends.flex_attention import (
+            FlexAttentionBackend)
+        return FlexAttentionBackend()
+    if backend_name in ("TORCH_SDPA", "TORCH_SDPA_VLLM_V1"):
+        from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend
+        return TorchSDPABackend()
+    if backend_name == "FLASHINFER":
+        from vllm.v1.attention.backends.flashinfer import FlashInferBackend
+        return FlashInferBackend()
     raise AssertionError(
         f"Unrecognized backend_name {backend_name} for unit test")
+def make_alibi_bias(
+    alibi_slopes: Optional[torch.Tensor],
+    num_kv_heads: int,
+    dtype: torch.dtype,
+    seq_lens: list[int],
+) -> list[Any]:
+    """Create ALiBi biases compatible with xFormers attention tests."""
+    from xformers.ops.fmha.attn_bias import LowerTriangularMaskWithTensorBias
+
+    if alibi_slopes is None:
+        return [None for _ in seq_lens]
+
+    attn_biases: list[Any] = []
+    num_heads = alibi_slopes.shape[0]
+    assert num_heads >= num_kv_heads, (
+        "ALiBi slopes expect at least as many heads as KV heads")
+
+    for seq_len in seq_lens:
+        bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device)
+        # bias[i, j] = j - i: relative distance from query i to key j.
+        bias = bias[None, :] - bias[:, None]
+        # Round the key dim up to a multiple of 8 for xFormers alignment;
+        # only the first seq_len columns are actually filled and used.
+        padded_len = (seq_len + 7) // 8 * 8
+        bias_tensor = torch.empty(
+            1,
+            num_heads,
+            seq_len,
+            padded_len,
+            device=alibi_slopes.device,
+            dtype=dtype,
+        )[:, :, :, :seq_len].copy_(bias)
+        bias_tensor.mul_(alibi_slopes[:, None, None])
+        attn_biases.append(LowerTriangularMaskWithTensorBias(bias_tensor))
+
+    return attn_biases
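The helper builds, per sequence and per head, the bias bias[h, i, j] = slope_h * (j - i), then wraps it in xFormers' LowerTriangularMaskWithTensorBias so the causal mask is still applied on top. A standalone sketch of the same arithmetic, with illustrative slope values and without the xFormers wrapper:

import torch

# Illustrative per-head slopes; real tests derive them from the model config.
alibi_slopes = torch.tensor([0.5, 0.25])
seq_len = 4

pos = torch.arange(seq_len, dtype=torch.float32)
rel = pos[None, :] - pos[:, None]           # rel[i, j] = j - i
bias = alibi_slopes[:, None, None] * rel    # shape (num_heads, seq_len, seq_len)

# Matches the helper above: the bias grows linearly with key distance per
# head, and entries with j > i are masked by the lower-triangular wrapper.
assert bias[0, 2, 0].item() == 0.5 * (0 - 2)
assert bias[1, 3, 1].item() == 0.25 * (1 - 3)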
+
+
 def _make_metadata_tensors(
     seq_lens: Optional[list[int]],
     context_lens: Optional[list[int]],