[Hybrid]: Decouple Kernel Block Size from KV Page Size (#24486)

Signed-off-by: lizhiyuan <uniartisan2017@gmail.com>
Signed-off-by: Zhiyuan Li <uniartisan2017@gmail.com>
This commit is contained in:
Zhiyuan Li, 2025-10-09 14:43:39 +08:00 (committed by GitHub)
parent d17f0fbf30, commit d24cf322e1
18 changed files with 573 additions and 55 deletions

@@ -3,7 +3,7 @@
 """High-Performance Triton-only Attention layer."""
 from dataclasses import dataclass
-from typing import ClassVar, Optional
+from typing import ClassVar, Optional, Union
 import torch
@@ -12,6 +12,7 @@ from vllm.attention.backends.abstract import (
     AttentionImpl,
     AttentionMetadata,
     AttentionType,
+    MultipleOf,
 )
 from vllm.attention.ops.triton_reshape_and_cache_flash import (
     triton_reshape_and_cache_flash,
@@ -157,6 +158,10 @@ class TritonAttentionBackend(AttentionBackend):
     def get_supported_dtypes(cls) -> list[torch.dtype]:
         return [torch.float16, torch.bfloat16, torch.float32]
+
+    @staticmethod
+    def get_supported_kernel_block_size() -> list[Union[int, MultipleOf]]:
+        return [MultipleOf(16)]
     @classmethod
     def validate_head_size(cls, head_size: int) -> None:
         # Triton Attention supports any head size above 32
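
To make the new hook concrete, here is a minimal sketch (not vLLM's implementation) of how a backend-declared constraint such as MultipleOf(16) could be reconciled with an independently chosen KV-cache page size once the two are decoupled. The MultipleOf stand-in mirrors the semantics implied by the diff above (any kernel block size that is a multiple of the base); resolve_kernel_block_size and its selection policy are hypothetical helpers used only for illustration.

# Sketch only: `MultipleOf` here is a stand-in for the class imported from
# vllm.attention.backends.abstract in the diff above; `resolve_kernel_block_size`
# is a hypothetical helper, not a vLLM API.
from dataclasses import dataclass
from typing import Union


@dataclass(frozen=True)
class MultipleOf:
    """The backend accepts any kernel block size that is a multiple of `base`."""
    base: int


def resolve_kernel_block_size(
    kv_page_size: int,
    supported: list[Union[int, MultipleOf]],
) -> int:
    """Pick a kernel block size compatible with the given KV page size.

    With the kernel block size decoupled from the KV page size, the attention
    kernel may tile each KV-cache page with smaller blocks, as long as the
    page size is a whole number of kernel blocks.
    """
    for spec in supported:
        if isinstance(spec, int) and kv_page_size % spec == 0:
            # Exact block size supported by the kernel that divides the page.
            return spec
        if isinstance(spec, MultipleOf) and kv_page_size % spec.base == 0:
            # Any multiple of `base` works; use `base` itself, so the kernel
            # block no longer has to equal the cache-manager page size.
            return spec.base
    raise ValueError(
        f"no supported kernel block size is compatible with "
        f"KV page size {kv_page_size}"
    )


# Example: a 256-token KV page can still be processed with 16-token kernel blocks.
assert resolve_kernel_block_size(256, [MultipleOf(16)]) == 16

The "[Hybrid]" tag in the title suggests the motivation is hybrid models whose KV-cache page size is chosen globally across layer types; a hook like get_supported_kernel_block_size() lets each attention backend keep its own tiling granularity without constraining that choice.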