# SPDX-License-Identifier: Apache-2.0
"""Attention layer with PagedAttention and Triton prefix prefill."""
from typing import Any, Optional

import torch

from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionMetadata, AttentionType)
from vllm.attention.ops.chunked_prefill_paged_decode import (
    chunked_prefill_paged_decode)
from vllm.attention.ops.paged_attn import PagedAttention
from vllm.logger import init_logger
from vllm.v1.attention.backends.flash_attn import (
    FlashAttentionMetadata, FlashAttentionMetadataBuilder)

logger = init_logger(__name__)


class TritonAttentionBackend(AttentionBackend):

    accept_output_buffer: bool = True
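    # NOTE: `accept_output_buffer = True` means forward() expects the caller
    # to pass a pre-allocated `output` tensor and writes results into it
    # (see the assert at the top of forward()).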

    @staticmethod
    def get_supported_head_sizes() -> list[int]:
        return [32, 64, 96, 128, 160, 192, 224, 256]

    @staticmethod
    def get_name() -> str:
        return "TRITON_ATTN_VLLM_V1"

    @staticmethod
    def get_impl_cls() -> type["TritonAttentionImpl"]:
        return TritonAttentionImpl

    @staticmethod
    def get_metadata_cls() -> type["AttentionMetadata"]:
        return FlashAttentionMetadata

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> tuple[int, ...]:
        if block_size % 16 != 0:
            raise ValueError("Block size must be a multiple of 16.")
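        # Example with illustrative sizes: num_blocks=1024, block_size=16,
        # num_kv_heads=8, head_size=128 yields shape (2, 1024, 16, 8, 128),
        # i.e. key and value planes stacked over the paged blocks.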
        return (2, num_blocks, block_size, num_kv_heads, head_size)

    @staticmethod
    def use_cascade_attention(*args, **kwargs) -> bool:
        return False

    @staticmethod
    def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]:
        return FlashAttentionMetadataBuilder


class TritonAttentionImpl(AttentionImpl):

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: Optional[list[float]],
        sliding_window: Optional[int],
        kv_cache_dtype: str,
        blocksparse_params: Optional[dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: AttentionType = AttentionType.DECODER,
        use_irope: bool = False,
    ) -> None:
        if blocksparse_params is not None:
            raise ValueError(
                "TritonAttention does not support block-sparse attention.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
        self.num_kv_heads = num_kv_heads
        if alibi_slopes is not None:
            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
        self.alibi_slopes = alibi_slopes
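        # (-1, -1) is the "no sliding window" sentinel; otherwise only the
        # left-context size (sliding_window - 1) is forwarded to the kernel
        # in forward().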
        if sliding_window is None:
            self.sliding_window = (-1, -1)
        else:
            self.sliding_window = (sliding_window - 1, 0)
        self.kv_cache_dtype = kv_cache_dtype
        self.use_irope = use_irope

        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
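        # With grouped-query attention, `num_queries_per_kv` query heads share
        # each KV head; for plain multi-head attention it is simply 1.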

        support_head_sizes = TritonAttentionBackend.get_supported_head_sizes()
        if head_size not in support_head_sizes:
            raise ValueError(
                f"Head size {head_size} is not supported by TritonAttention. "
                f"Supported head sizes are: {support_head_sizes}.")

        if attn_type != AttentionType.DECODER:
            raise NotImplementedError("Encoder self-attention and "
                                      "encoder/decoder cross-attention "
                                      "are not implemented for "
                                      "TritonAttentionImpl")

    def forward(
        self,
        layer: torch.nn.Module,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: FlashAttentionMetadata,
        output: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Forward pass with Triton attention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape = [2, num_blocks, block_size, num_kv_heads,
                head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        """
        assert output is not None, "Output tensor must be provided."

        if attn_metadata is None:
            # Profiling run.
            return output

        assert attn_metadata.use_cascade is False

        # IMPORTANT!
        # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in
        # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead
        # in this method. For example, `view` and `slice` (or `[:n]`) operations
        # are surprisingly slow even when they do not invoke any GPU ops.
        # Minimize the PyTorch ops in this method as much as possible.
        # Whenever making a change in this method, please benchmark the
        # performance to make sure it does not introduce any overhead.

        num_actual_tokens = attn_metadata.num_actual_tokens
        key_cache, value_cache = PagedAttention.split_kv_cache(
            kv_cache, self.num_kv_heads, self.head_size)

        # Reshape the input keys and values and store them in the cache.
        PagedAttention.write_to_paged_cache(
            key,
            value,
            key_cache,
            value_cache,
            attn_metadata.slot_mapping,
            self.kv_cache_dtype,
            layer._k_scale,
            layer._v_scale,
        )

        use_local_attn = (self.use_irope
                          and attn_metadata.local_attn_metadata is not None)
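
        # When irope-style local (chunked) attention applies, use the
        # per-chunk views of the query offsets, sequence lengths, and block
        # table instead of the global ones.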
        if use_local_attn:
            assert attn_metadata.local_attn_metadata is not None
            local_metadata = attn_metadata.local_attn_metadata
            cu_seqlens_q = local_metadata.local_query_start_loc
            seqused_k = local_metadata.local_seqused_k
            max_seqlen_q = local_metadata.local_max_query_len
            max_seqlen_k = local_metadata.local_max_seq_len
            block_table = local_metadata.local_block_table
        else:
            cu_seqlens_q = attn_metadata.query_start_loc
            seqused_k = attn_metadata.seq_lens
            max_seqlen_q = attn_metadata.max_query_len
            max_seqlen_k = attn_metadata.max_seq_len
            block_table = attn_metadata.block_table

        # Compute attention and update output up to `num_actual_tokens`.
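        # `chunked_prefill_paged_decode` handles both the chunked-prefill
        # (prefix) tokens and the single-token decode positions in one call,
        # reading prior context from the paged KV cache.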
        chunked_prefill_paged_decode(query=query[:num_actual_tokens],
                                     key=key[:num_actual_tokens],
                                     value=value[:num_actual_tokens],
                                     output=output[:num_actual_tokens],
                                     kv_cache_dtype=self.kv_cache_dtype,
                                     key_cache=key_cache,
                                     value_cache=value_cache,
                                     block_table=block_table,
                                     query_start_loc=cu_seqlens_q,
                                     seq_lens=seqused_k,
                                     max_seq_len=max_seqlen_k,
                                     max_query_len=max_seqlen_q,
                                     k_scale=layer._k_scale,
                                     v_scale=layer._v_scale,
                                     alibi_slopes=self.alibi_slopes,
                                     sliding_window=self.sliding_window[0],
                                     sm_scale=self.scale)

        return output
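

# Illustrative usage sketch (assumed, not part of this module): allocating a
# KV cache tensor with the layout this backend reports. Sizes and dtype are
# hypothetical.
#
#     shape = TritonAttentionBackend.get_kv_cache_shape(
#         num_blocks=1024, block_size=16, num_kv_heads=8, head_size=128)
#     kv_cache = torch.zeros(shape, dtype=torch.float16, device="cuda")
#
# TritonAttentionImpl.forward() then splits this tensor into key/value caches
# via PagedAttention.split_kv_cache() and appends new tokens with
# PagedAttention.write_to_paged_cache().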