[Hybrid]: Decouple Kernel Block Size from KV Page Size (#24486)

Signed-off-by: lizhiyuan <uniartisan2017@gmail.com>
Signed-off-by: Zhiyuan Li <uniartisan2017@gmail.com>
This commit is contained in:
Zhiyuan Li
2025-10-09 14:43:39 +08:00
committed by GitHub
parent d17f0fbf30
commit d24cf322e1
18 changed files with 573 additions and 55 deletions

View File

@@ -3,7 +3,7 @@
"""Attention layer with AiterFlashAttention."""
from dataclasses import dataclass
from typing import Optional
from typing import Optional, Union
import torch
@@ -12,6 +12,7 @@ from vllm.attention.backends.abstract import (
AttentionImpl,
AttentionMetadata,
AttentionType,
MultipleOf,
)
from vllm.config import VllmConfig
from vllm.logger import init_logger
@@ -359,6 +360,10 @@ class AiterFlashAttentionBackend(AttentionBackend):
def get_supported_head_sizes(cls) -> list[int]:
    """Return the attention head sizes this backend supports.

    NOTE(review): this appears inside `AiterFlashAttentionBackend` per the
    diff hunk header; the `@classmethod` decorator is presumably just above
    this excerpt — confirm against the full file.
    """
    return [64, 128, 256]
@staticmethod
def get_supported_kernel_block_size() -> list[Union[int, MultipleOf]]:
    """Return the kernel block sizes this backend's kernels accept.

    Presumably `MultipleOf(16)` declares that any block size that is a
    multiple of 16 is valid, letting the kernel block size be chosen
    independently of the KV-cache page size (per the commit title) —
    confirm against the `MultipleOf` definition in
    vllm.attention.backends.abstract.
    """
    return [MultipleOf(16)]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()