[1/N][Attention] Restructure attention: move files (#31916)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
This commit is contained in:
Matthew Bonanni
2026-01-09 16:10:24 -05:00
committed by GitHub
parent 1f8b7c536b
commit 2612ba9285
195 changed files with 426 additions and 396 deletions

View File

@@ -0,0 +1,128 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
import torch
from vllm.attention.layer import Attention
from vllm.config import CacheConfig
from vllm.config.vllm import VllmConfig
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.v1.attention.backend import AttentionBackend
from vllm.v1.attention.backends.utils import (
AttentionCGSupport,
AttentionMetadataBuilder,
CommonAttentionMetadata,
make_local_attention_virtual_batches,
subclass_attention_backend,
)
from vllm.v1.attention.selector import get_attn_backend
from vllm.v1.kv_cache_interface import (
AttentionSpec,
ChunkedLocalAttentionSpec,
KVCacheSpec,
)
@functools.lru_cache
def create_chunked_local_attention_backend(
    underlying_attn_backend: AttentionBackend,
    attention_chunk_size: int,
    block_size: int,
) -> type[AttentionBackend]:
    """Derive a backend whose metadata builder splits requests into
    chunked-local-attention virtual batches.

    Cached so repeated calls with the same (backend, chunk size, block size)
    triple return the same subclass.
    """
    name_prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_"

    base_builder = underlying_attn_backend.get_builder_cls()
    assert issubclass(base_builder, AttentionMetadataBuilder)

    class ChunkedLocalAttentionBuilder(base_builder):  # type: ignore
        @classmethod
        def get_cudagraph_support(
            cls: type["AttentionMetadataBuilder"],
            vllm_config: VllmConfig,
            kv_cache_spec: AttentionSpec,
        ) -> AttentionCGSupport:
            # Explicit override in case the underlying builder specialized
            # this getter. @override omitted only because of a mypy
            # limitation due to the type variable.
            return AttentionCGSupport.NEVER

        def build(
            self,
            common_prefix_len: int,
            common_attn_metadata: CommonAttentionMetadata,
            fast_build: bool = False,
        ):
            # Rewrite the metadata into local-attention virtual batches,
            # then delegate to the underlying builder.
            cm, make_virtual_batches_block_table = (
                make_local_attention_virtual_batches(
                    attention_chunk_size, common_attn_metadata, block_size
                )
            )
            metadata = super().build(common_prefix_len, cm, fast_build)
            # Stash the block-table transform so update_block_table can
            # re-apply it to a fresh block table later.
            metadata.make_virtual_batches_block_table = (
                make_virtual_batches_block_table
            )
            return metadata

        def update_block_table(
            self, metadata, blk_table: torch.Tensor, slot_mapping: torch.Tensor
        ):
            blk_table = metadata.make_virtual_batches_block_table(blk_table)
            return super().update_block_table(metadata, blk_table, slot_mapping)

    return subclass_attention_backend(
        name_prefix=name_prefix,
        attention_backend_cls=underlying_attn_backend,
        builder_cls=ChunkedLocalAttentionBuilder,
    )
class ChunkedLocalAttention(Attention):
    """Attention layer whose backend is wrapped so tokens attend only within
    a local chunk of ``attention_chunk_size`` tokens."""

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        attention_chunk_size: int,
        num_kv_heads: int | None = None,
        alibi_slopes: list[float] | None = None,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        kv_sharing_target_layer_name: str | None = None,
        prefix: str = "",
    ):
        self.attention_chunk_size = attention_chunk_size
        dtype = torch.get_default_dtype()
        # Fall back to defaults when no cache config is supplied.
        if cache_config is None:
            kv_cache_dtype = "auto"
            block_size = 16
        else:
            kv_cache_dtype = cache_config.cache_dtype
            block_size = cache_config.block_size

        selected_backend = get_attn_backend(
            head_size, dtype, kv_cache_dtype, block_size
        )
        chunked_backend = create_chunked_local_attention_backend(
            selected_backend, attention_chunk_size, block_size
        )

        super().__init__(
            num_heads=num_heads,
            head_size=head_size,
            scale=scale,
            num_kv_heads=num_kv_heads,
            alibi_slopes=alibi_slopes,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=prefix,
            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
            attn_backend=chunked_backend,
        )

    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
        # A chunked-local spec requires a non-zero chunk size.
        assert self.attention_chunk_size
        return ChunkedLocalAttentionSpec(
            block_size=vllm_config.cache_config.block_size,
            num_kv_heads=self.num_kv_heads,
            head_size=self.head_size,
            dtype=self.kv_cache_torch_dtype,
            attention_chunk_size=self.attention_chunk_size,
        )

View File

@@ -0,0 +1,182 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
from copy import copy
import numpy as np
import torch
from vllm.attention.layer import Attention
from vllm.config import CacheConfig, VllmConfig
from vllm.logger import init_logger
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backend import (
AttentionBackend,
AttentionMetadata,
AttentionType,
)
from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
subclass_attention_backend,
)
from vllm.v1.attention.selector import get_attn_backend
from vllm.v1.kv_cache_interface import CrossAttentionSpec, KVCacheSpec
logger = init_logger(__name__)
def _get_cross_slot_mapping(
    encoder_seq_lens: np.ndarray,
    block_table_tensor: torch.Tensor,
    kv_cache_spec: CrossAttentionSpec,
    device: torch.device,
) -> torch.Tensor:
    """Get cross-attention slot mappings.

    Returns a 1-D int64 tensor mapping each encoder token (of every request
    with a non-zero encoder length) to its KV-cache slot.
    """
    block_size = kv_cache_spec.block_size
    slot_mappings: list[torch.Tensor] = []
    # Only requests with a non-zero encoder length need slots. The majority
    # of parallel requests will be running the decoder, so this list should
    # be relatively small.
    for req_index in np.nonzero(encoder_seq_lens)[0]:
        encoder_seq_len = encoder_seq_lens[req_index].item()
        # Number of cache blocks needed to hold this request's encoder tokens;
        # all of them are assumed to be allocated already.
        num_blocks_needed = cdiv(encoder_seq_len, block_size)
        needed_block_ids = block_table_tensor[req_index][:num_blocks_needed]
        # slot(i) = block_id[i // block_size] * block_size + i % block_size
        positions = torch.arange(encoder_seq_len, dtype=torch.int64, device=device)
        block_numbers = needed_block_ids[positions // block_size]
        slot_mappings.append(block_numbers * block_size + positions % block_size)
    if not slot_mappings:
        return torch.empty(0, dtype=torch.int64, device=device)
    return torch.cat(slot_mappings)
@functools.lru_cache
def create_cross_attention_backend(
    underlying_attn_backend: AttentionBackend,
) -> type[AttentionBackend]:
    """Derive a backend whose builder prepares metadata for encoder-decoder
    cross-attention: non-causal, with sequence lengths taken from the
    encoder side."""
    base_builder = underlying_attn_backend.get_builder_cls()

    class CrossAttentionBuilder(base_builder):  # type: ignore
        def build(
            self,
            common_prefix_len: int,
            common_attn_metadata: CommonAttentionMetadata,
            fast_build: bool = False,
        ) -> AttentionMetadata:
            # Work on a copy so the caller's metadata stays untouched.
            new_metadata = copy(common_attn_metadata)
            new_metadata.causal = False
            new_metadata.max_seq_len = int(new_metadata.encoder_seq_lens_cpu.max())

            # Any computed tokens indicate decode step > 1 (no chunked
            # prefill).
            num_cache_decodes = (
                (common_attn_metadata.num_computed_tokens_cpu > 0).sum().item()
            )
            if num_cache_decodes > 0:
                # The cross-attention KV cache was already populated on the
                # first decoder step; zero the encoder lengths of requests
                # that no longer need reshape_and_cache so no slots are
                # computed for them below.
                num_tokens = common_attn_metadata.num_computed_tokens_cpu.numpy()
                new_metadata.encoder_seq_lens_cpu = np.where(
                    num_tokens > 0, 0, new_metadata.encoder_seq_lens_cpu
                )

            # seq_lens is provided by the model runner: the initial encoder
            # input length is needed here to know how many tokens to attend
            # to from the cached cross-attention KV cache.
            new_metadata.seq_lens = common_attn_metadata.encoder_seq_lens
            new_metadata._seq_lens_cpu = torch.from_numpy(
                common_attn_metadata.encoder_seq_lens_cpu
            )
            # NOTE (NickLucche) use `new_metadata` instead of the initial
            # `common_*` metadata here (its encoder lengths may have been
            # zeroed above).
            new_metadata.slot_mapping = _get_cross_slot_mapping(
                new_metadata.encoder_seq_lens_cpu,
                new_metadata.block_table_tensor,
                self.kv_cache_spec,
                self.device,
            )
            return super().build(common_prefix_len, new_metadata, fast_build)

    return subclass_attention_backend(
        name_prefix="CrossAttention_",
        attention_backend_cls=underlying_attn_backend,
        builder_cls=CrossAttentionBuilder,
    )
class CrossAttention(Attention):
    """
    Cross-attention for encoder-decoder models.
    Handles attention between decoder queries and encoder keys/values.
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        cache_config: CacheConfig | None = None,
        attn_type: str | None = None,
        **kwargs,
    ):
        dtype = torch.get_default_dtype()
        # Fall back to defaults when no cache config is supplied.
        if cache_config is None:
            kv_cache_dtype = "auto"
            block_size = 16
        else:
            kv_cache_dtype = cache_config.cache_dtype
            block_size = cache_config.block_size

        if attn_type is not None:
            assert attn_type == AttentionType.ENCODER_DECODER, (
                "CrossAttention only supports AttentionType.ENCODER_DECODER"
            )

        selected_backend = get_attn_backend(
            head_size,
            dtype,
            kv_cache_dtype,
            block_size,
            attn_type=AttentionType.ENCODER_DECODER,
        )
        cross_backend = create_cross_attention_backend(selected_backend)

        super().__init__(
            num_heads=num_heads,
            head_size=head_size,
            scale=scale,
            cache_config=cache_config,
            attn_backend=cross_backend,
            attn_type=AttentionType.ENCODER_DECODER,
            **kwargs,
        )

    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
        # Cross-attention caches the encoder KV; size it with the runtime
        # block size from the vLLM config.
        return CrossAttentionSpec(
            block_size=vllm_config.cache_config.block_size,
            num_kv_heads=self.num_kv_heads,
            head_size=self.head_size,
            dtype=self.kv_cache_torch_dtype,
        )

View File

@@ -0,0 +1,103 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
from copy import copy
import torch
from vllm.attention.layer import Attention
from vllm.config import CacheConfig
from vllm.config.vllm import VllmConfig
from vllm.v1.attention.backend import (
AttentionBackend,
AttentionMetadata,
AttentionType,
)
from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
subclass_attention_backend,
)
from vllm.v1.attention.selector import get_attn_backend
from vllm.v1.kv_cache_interface import KVCacheSpec
@functools.lru_cache
def create_encoder_only_attention_backend(
    underlying_attn_backend: AttentionBackend,
) -> type[AttentionBackend]:
    """Derive a backend whose builder forces non-causal metadata, as needed
    for encoder-only (bidirectional) attention."""
    base_builder = underlying_attn_backend.get_builder_cls()

    class EncoderOnlyAttentionBuilder(base_builder):  # type: ignore
        def build(
            self,
            common_prefix_len: int,
            common_attn_metadata: CommonAttentionMetadata,
            fast_build: bool = False,
        ) -> AttentionMetadata:
            # Copy before mutating so the caller's metadata is untouched.
            non_causal_metadata = copy(common_attn_metadata)
            non_causal_metadata.causal = False
            return super().build(
                common_prefix_len, non_causal_metadata, fast_build
            )

    return subclass_attention_backend(
        name_prefix="EncoderOnlyAttention_",
        attention_backend_cls=underlying_attn_backend,
        builder_cls=EncoderOnlyAttentionBuilder,
    )
class EncoderOnlyAttention(Attention):
    """
    Encoder attention is a special case that doesn't need a KV Cache.
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        cache_config: CacheConfig | None = None,
        attn_type: str | None = None,
        **kwargs,
    ):
        # Validate attn_type up front, before any backend work is done.
        if attn_type is not None:
            assert attn_type == AttentionType.ENCODER_ONLY, (
                "EncoderOnlyAttention only supports AttentionType.ENCODER_ONLY"
            )

        dtype = torch.get_default_dtype()
        # Fall back to defaults when no cache config is supplied.
        if cache_config is not None:
            kv_cache_dtype = cache_config.cache_dtype
            block_size = cache_config.block_size
        else:
            kv_cache_dtype = "auto"
            block_size = 16

        underlying_attn_backend = get_attn_backend(
            head_size,
            dtype,
            kv_cache_dtype,
            block_size,
            attn_type=AttentionType.ENCODER_ONLY,
        )
        attn_backend = create_encoder_only_attention_backend(
            underlying_attn_backend
        )

        super().__init__(
            num_heads=num_heads,
            head_size=head_size,
            scale=scale,
            cache_config=cache_config,
            attn_backend=attn_backend,
            attn_type=AttentionType.ENCODER_ONLY,
            **kwargs,
        )

    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec | None:
        # Encoder-only attention does not need a KV cache; returning None
        # (annotation fixed to reflect this) signals that no cache space
        # should be allocated for this layer.
        return None

View File

@@ -0,0 +1,234 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.config import MultiModalConfig
from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.models.vision import get_vit_attn_backend
from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.v1.attention.ops.vit_attn_wrappers import (
vit_flash_attn_wrapper,
vit_torch_sdpa_wrapper,
)
logger = init_logger(__name__)
# --8<-- [start:mm_encoder_attn]
@CustomOp.register("mm_encoder_attn")
class MMEncoderAttention(CustomOp):
    """Multi-headed attention without any cache, used for multimodal encoder."""

    # --8<-- [end:mm_encoder_attn]
    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float | None = None,
        num_kv_heads: int | None = None,
        prefix: str = "",
        multimodal_config: MultiModalConfig | None = None,
    ) -> None:
        """
        Args:
            num_heads: number of attention heads per partition.
            head_size: hidden_size per attention head.
            scale: scale factor.
            num_kv_heads: number of kv heads.
            prefix: This has no effect, it is only here to make it easier to
                swap between Attention and MultiHeadAttention
            multimodal_config: configs for multi-modal.
        """
        super().__init__()
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = scale
        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
        self.layer_name = prefix

        assert self.num_heads % self.num_kv_heads == 0, (
            f"num_heads ({self.num_heads}) is not "
            f"divisible by num_kv_heads ({self.num_kv_heads})"
        )
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads

        # During model initialization, the default dtype is set as the model
        # weight and activation dtype.
        dtype = torch.get_default_dtype()

        # Try to get the vision attention backend override from
        # multimodal_config, if one was supplied.
        attn_backend_override = (
            multimodal_config.mm_encoder_attn_backend
            if multimodal_config is not None
            else None
        )

        # Get device-specific vision attention backend.
        self.attn_backend = get_vit_attn_backend(
            head_size=head_size,
            dtype=dtype,
            attn_backend_override=attn_backend_override,
        )
        self.is_flash_attn_backend = self.attn_backend in {
            AttentionBackendEnum.FLASH_ATTN,
            AttentionBackendEnum.ROCM_AITER_FA,
        }
        # Flash-attention version is only meaningful for FA-family backends.
        self._fa_version = (
            get_flash_attn_version() if self.is_flash_attn_backend else None
        )
        logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")

    @classmethod
    def enabled(cls) -> bool:
        # This custom op is always available.
        return True

    def maybe_reshape_qkv_to_4d(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        bsz: int,
        q_len: int,
        kv_len: int,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Reshape query, key, value to 4D tensors:
        (batch_size, seq_len, num_heads, head_size)
        """
        query = query.view(bsz, q_len, self.num_heads, self.head_size)
        key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size)
        value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size)
        # Handle MQA and GQA by expanding KV heads to match query heads.
        repeats = self.num_queries_per_kv
        if repeats > 1:
            key = torch.repeat_interleave(key, repeats, dim=2)
            value = torch.repeat_interleave(value, repeats, dim=2)
        return query, key, value

    def _forward_sdpa(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        cu_seqlens: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Input shape:
        (batch_size x seq_len x hidden_size) or
        (batch_size x seq_len x num_heads x head_size)
        """
        bsz, q_len = query.size()[:2]
        kv_len = key.size(1)
        # Remember whether the input arrived flattened (3D) so the output
        # can be flattened back to match.
        was_flattened = query.dim() != 4

        query, key, value = self.maybe_reshape_qkv_to_4d(
            query, key, value, bsz, q_len, kv_len
        )
        attn_out = vit_torch_sdpa_wrapper(
            q=query,
            k=key,
            v=value,
            scale=self.scale,
            cu_seqlens=cu_seqlens,
        )
        if was_flattened:
            attn_out = attn_out.reshape(bsz, q_len, -1)
        return attn_out

    def _forward_fa(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        cu_seqlens: torch.Tensor | None = None,
        max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
    ) -> torch.Tensor:
        """Input shape:
        (batch_size x seq_len x hidden_size) or
        (batch_size x seq_len x num_heads x head_size)
        """
        assert (cu_seqlens is not None and max_seqlen is not None) or (
            cu_seqlens is None and max_seqlen is None
        ), "cu_seqlens and max_seqlen should be both set or both None."

        bsz, q_len = query.size()[:2]
        kv_len = key.size(1)
        # Remember whether the input arrived flattened (3D) so the output
        # can be flattened back to match.
        was_flattened = query.dim() != 4

        query, key, value = self.maybe_reshape_qkv_to_4d(
            query, key, value, bsz, q_len, kv_len
        )
        attn_out = vit_flash_attn_wrapper(
            q=query,
            k=key,
            v=value,
            scale=self.scale,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=bsz,
            is_rocm_aiter=(self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA),
            fa_version=self._fa_version,
        )
        if was_flattened:
            attn_out = attn_out.reshape(bsz, q_len, -1)
        return attn_out

    def forward_native(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        cu_seqlens: torch.Tensor | None = None,
        max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
    ) -> torch.Tensor:
        # Reference path: plain SDPA.
        return self._forward_sdpa(query, key, value, cu_seqlens)

    def forward_cuda(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        cu_seqlens: torch.Tensor | None = None,
        max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
    ) -> torch.Tensor:
        # Dispatch on the backend selected at construction time.
        if self.is_flash_attn_backend:
            return self._forward_fa(query, key, value, cu_seqlens, max_seqlen)
        if self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
            return self._forward_sdpa(query, key, value, cu_seqlens)
        raise ValueError(
            f"Unsupported multi-modal encoder attention backend for CUDA: "
            f"{self.attn_backend}."
        )

    def forward_cpu(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        cu_seqlens: torch.Tensor | None = None,
        max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
    ) -> torch.Tensor:
        # CPU always uses the SDPA path.
        return self._forward_sdpa(query, key, value, cu_seqlens)

    def forward_xpu(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        cu_seqlens: torch.Tensor | None = None,
        max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
    ) -> torch.Tensor:
        assert self.is_flash_attn_backend, (
            "XPU only supports FLASH_ATTN for vision attention."
        )
        return self._forward_fa(query, key, value, cu_seqlens, max_seqlen)

View File

@@ -0,0 +1,254 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
import torch
from vllm.attention.layer import Attention
from vllm.config import CacheConfig, VllmConfig
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import direct_register_custom_op
from vllm.v1.attention.backend import (
AttentionBackend,
AttentionMetadata,
AttentionType,
)
from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
subclass_attention_backend,
)
from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash_diffkv,
)
from vllm.v1.attention.selector import get_attn_backend
from vllm.v1.kv_cache_interface import (
AttentionSpec,
KVCacheSpec,
SinkFullAttentionSpec,
)
logger = init_logger(__name__)
@functools.lru_cache
def create_static_sink_attention_backend(
    underlying_attn_backend: type[AttentionBackend],
    sink_len: int = 0,
) -> type[AttentionBackend]:
    """Derive a backend whose builder prefixes every request's block table
    with the blocks that hold the static sink KV entries."""
    base_builder = underlying_attn_backend.get_builder_cls()

    class StaticSinkAttentionBuilder(base_builder):  # type: ignore
        def __init__(
            self,
            kv_cache_spec: AttentionSpec,
            layer_names: list[str],
            vllm_config: VllmConfig,
            device: torch.device,
        ):
            super().__init__(kv_cache_spec, layer_names, vllm_config, device)
            model_config = vllm_config.model_config
            scheduler_config = vllm_config.scheduler_config
            cache_block_size = vllm_config.cache_config.block_size

            self.sink_len = sink_len
            self.block_size = cache_block_size
            # Sink tokens occupy whole leading blocks of the cache.
            self.num_sink_blocks = self.sink_len // cache_block_size
            self.max_num_blocks = cdiv(
                model_config.max_model_len, cache_block_size
            )
            # Persistent per-request block table with room for the sink
            # prefix; the per-request tail is filled in at build() time.
            self.block_table_with_sink = torch.zeros(
                (
                    scheduler_config.max_num_seqs,
                    self.max_num_blocks + self.num_sink_blocks,
                ),
                device=device,
                dtype=torch.int32,
            )
            # Sink KV lives in blocks 1..num_sink_blocks of the cache.
            self.block_table_with_sink[:, : self.num_sink_blocks] = torch.arange(
                1,
                self.num_sink_blocks + 1,
                device=device,
                dtype=torch.int32,
            )

        def build(
            self,
            common_prefix_len: int,
            common_attn_metadata: CommonAttentionMetadata,
            fast_build: bool = False,
        ) -> AttentionMetadata:
            # Extend each sequence length by the sink prefix (in place).
            common_attn_metadata.seq_lens[:] = (
                common_attn_metadata.seq_lens + self.sink_len
            )
            # Sequences that were empty before the extension stay empty.
            common_attn_metadata.seq_lens[
                common_attn_metadata.seq_lens == self.sink_len
            ] = 0
            common_attn_metadata.max_seq_len = (
                common_attn_metadata.max_seq_len + self.sink_len
            )

            # Splice the real block tables in after the sink prefix and
            # swap the spliced table into the metadata.
            max_num_blocks = cdiv(
                common_attn_metadata.max_seq_len, self.block_size
            )
            num_reqs = common_attn_metadata.num_reqs
            self.block_table_with_sink[
                :num_reqs,
                self.num_sink_blocks : self.num_sink_blocks + max_num_blocks,
            ] = common_attn_metadata.block_table_tensor[:, :max_num_blocks]
            common_attn_metadata.block_table_tensor = self.block_table_with_sink[
                :num_reqs
            ]
            return super().build(
                common_prefix_len, common_attn_metadata, fast_build
            )

    return subclass_attention_backend(
        name_prefix="StaticSink_",
        attention_backend_cls=underlying_attn_backend,
        builder_cls=StaticSinkAttentionBuilder,
    )
@CustomOp.register("static_sink_attention")
class StaticSinkAttention(Attention, CustomOp):
    """
    Attention with static sink tokens
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        sink_len: int,
        attn_backend: type[AttentionBackend] | None = None,
        cache_config: CacheConfig | None = None,
        **kwargs,
    ):
        dtype = torch.get_default_dtype()
        # Fall back to defaults when no cache config is supplied.
        if cache_config is None:
            kv_cache_dtype = "auto"
            block_size = 16
        else:
            kv_cache_dtype = cache_config.cache_dtype
            block_size = cache_config.block_size

        # Use the caller-provided backend when given, otherwise select one.
        if attn_backend is None:
            underlying_attn_backend = get_attn_backend(
                head_size, dtype, kv_cache_dtype, block_size
            )
        else:
            underlying_attn_backend = attn_backend

        sink_backend = create_static_sink_attention_backend(
            underlying_attn_backend,  # type: ignore[arg-type]
            sink_len=sink_len,
        )

        Attention.__init__(
            self,
            num_heads=num_heads,
            head_size=head_size,
            scale=scale,
            cache_config=cache_config,
            attn_backend=sink_backend,
            **kwargs,
        )
        CustomOp.__init__(self)

        self.sink_len = sink_len
        self.block_size = block_size
        # Sink KV is written into the cache lazily on the first forward.
        self.sink_populated = False
        self.sink_key = None
        self.sink_value = None

    def update_sink_kv(self, sink_key, sink_value) -> None:
        # Store the sink K/V to be written into the cache later.
        self.sink_key = sink_key
        self.sink_value = sink_value

    def forward_native(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        output_shape: torch.Size | None = None,
    ) -> torch.Tensor:
        assert self.sink_key is not None and self.sink_value is not None, (
            "sink_key and sink_value have not been prepared"
        )
        # Lazily populate the sink KV entries via the registered custom op
        # the first time this layer runs.
        if not self.sink_populated:
            forward_context: ForwardContext = get_forward_context()
            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
            torch.ops.vllm.maybe_populate_sink(self_kv_cache, self.layer_name)
        return super().forward(query, key, value, output_shape)

    def forward_cuda(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        output_shape: torch.Size | None = None,
    ) -> torch.Tensor:
        # CUDA shares the native path.
        return self.forward_native(query, key, value, output_shape)

    def forward(self, *args, **kwargs):
        # Route through CustomOp's dispatch rather than Attention.forward.
        return self._forward_method(*args, **kwargs)

    def populate_sink_kv(self, self_kv_cache):
        # Sink KV occupies slots [block_size, block_size + sink_len), i.e.
        # the blocks starting at block 1 that the builder reserved.
        sink_kv_slot_mapping = torch.arange(
            self.block_size,
            self.sink_len + self.block_size,
            device=torch.cuda.current_device(),
            dtype=torch.long,
        )
        triton_reshape_and_cache_flash_diffkv(
            self.sink_key,
            self.sink_value,
            self_kv_cache,
            sink_kv_slot_mapping,
            self.kv_cache_dtype,
            self._k_scale,
            self._v_scale,
        )
        # We only populate the sink_key and sink_value once
        self.sink_populated = True

    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
        # Block size may get updated after model loading, refresh it
        block_size = vllm_config.cache_config.block_size
        # Should not be called for enc-dec or encoder-only attention.
        assert self.attn_type == AttentionType.DECODER
        return SinkFullAttentionSpec(
            block_size=block_size,
            num_kv_heads=self.num_kv_heads,
            head_size=self.head_size,
            head_size_v=self.head_size_v,
            sink_len=self.sink_len,
            dtype=self.kv_cache_torch_dtype,
        )
def maybe_populate_sink(
    self_kv_cache: torch.Tensor,
    layer_name: str,
) -> None:
    """Custom-op body: write the sink KV for `layer_name` into its cache.

    Looks the layer up in the current forward context and populates its sink
    entries once the cache tensor has actually been allocated.
    """
    forward_context: ForwardContext = get_forward_context()
    layer = forward_context.no_compile_layers[layer_name]
    # Nothing to do if already populated or the cache is not allocated yet.
    if layer.sink_populated or self_kv_cache.numel() == 0:
        return
    layer.populate_sink_kv(self_kv_cache)
def maybe_populate_sink_fake(
    self_kv_cache: torch.Tensor,
    layer_name: str,
) -> None:
    """Fake (meta) implementation of `maybe_populate_sink`: a no-op used
    during tracing/compilation, when no real cache write should happen."""
    return None
# Register `maybe_populate_sink` as a custom op so that the in-place write
# to `self_kv_cache` is modeled as a mutation (with a no-op fake impl for
# tracing/compilation).
direct_register_custom_op(
    op_name="maybe_populate_sink",
    op_func=maybe_populate_sink,
    mutates_args=["self_kv_cache"],
    fake_impl=maybe_populate_sink_fake,
)

View File

@@ -4,8 +4,8 @@
from abc import ABC, abstractmethod
from vllm.attention.backends.abstract import AttentionBackend, AttentionImpl
from vllm.config import VllmConfig
from vllm.v1.attention.backend import AttentionBackend, AttentionImpl
from vllm.v1.kv_cache_interface import KVCacheSpec

View File

@@ -6,11 +6,11 @@ from typing import Any
import torch
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
from vllm.utils.torch_utils import is_torch_equal_or_newer
from vllm.v1.attention.backends.registry import AttentionBackendEnum
logger = init_logger(__name__)

View File

@@ -5,7 +5,6 @@ import torch
from einops import rearrange
from torch import nn
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
from vllm.distributed import (
divide,
@@ -17,6 +16,7 @@ from vllm.logger import init_logger
from vllm.model_executor.model_loader.weight_utils import sharded_weight_loader
from vllm.model_executor.utils import set_weight_attrs
from vllm.utils.torch_utils import direct_register_custom_op
from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
from .fla.ops.kda import (

View File

@@ -5,10 +5,10 @@ from collections.abc import Iterable
import torch
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.selector import get_mamba_attn_backend
from vllm.config import VllmConfig
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.v1.attention.backend import AttentionBackend
from vllm.v1.attention.selector import get_mamba_attn_backend
from vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec

View File

@@ -8,7 +8,6 @@ import torch.nn.functional as F
from einops import rearrange
from torch import nn
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
from vllm.distributed.parallel_state import (
@@ -29,6 +28,7 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.utils.torch_utils import direct_register_custom_op
from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata

View File

@@ -5,7 +5,6 @@
import torch
from torch import nn
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
from vllm.distributed import (
divide,
@@ -43,6 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import (
)
from vllm.model_executor.utils import set_weight_attrs
from vllm.utils.torch_utils import direct_register_custom_op
from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
# Added by the IBM Team, 2024

View File

@@ -4,7 +4,6 @@
import torch
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.forward_context import ForwardContext, get_forward_context
@@ -24,6 +23,7 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
causal_conv1d_update,
)
from vllm.utils.torch_utils import direct_register_custom_op
from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionMetadata