[Attention] Refactor FA block_size limitations to hybrid models only (#29084)
Signed-off-by: NickLucche <nlucches@redhat.com>
@@ -447,7 +447,10 @@ class AiterFlashAttentionMetadataBuilder(
 class AiterFlashAttentionBackend(AttentionBackend):
     accept_output_buffer: bool = True
     supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
-    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)]
+
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
+        return [MultipleOf(16)]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
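In context, the hunk swaps the hard-coded `supported_kernel_block_sizes` class attribute for a `get_supported_kernel_block_sizes()` method. The practical gain is that a method can compute the constraint at call time rather than fixing it at class-definition time, which is what allows the block-size limitation to be applied to hybrid models only, per the commit title. Below is a minimal, self-contained sketch of that pattern; `AttentionBackend`, `MultipleOf`, `FlashAttentionLikeBackend`, and especially the `is_hybrid_model` flag are simplified stand-ins for illustration, not vLLM's actual implementation.

```python
from typing import ClassVar


class MultipleOf:
    """Stand-in for vLLM's MultipleOf marker: 'any block size that is a multiple of base'."""

    def __init__(self, base: int) -> None:
        self.base = base

    def __repr__(self) -> str:
        return f"MultipleOf({self.base})"


class AttentionBackend:
    """Stripped-down base class; the real one lives in vLLM's attention backends."""

    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
        # Default: no constraint on the paged-KV kernel block size.
        return []


class FlashAttentionLikeBackend(AttentionBackend):
    # Hypothetical flag meaning "the loaded model mixes attention with other
    # layer types (hybrid)"; vLLM would derive this from the model config.
    is_hybrid_model: ClassVar[bool] = False

    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
        # Unlike a ClassVar, a method can make the limitation conditional:
        # only hybrid models get pinned to block sizes that are multiples of 16.
        if FlashAttentionLikeBackend.is_hybrid_model:
            return [MultipleOf(16)]
        return []


if __name__ == "__main__":
    print(FlashAttentionLikeBackend.get_supported_kernel_block_sizes())  # []
    FlashAttentionLikeBackend.is_hybrid_model = True
    print(FlashAttentionLikeBackend.get_supported_kernel_block_sizes())  # [MultipleOf(16)]
```

The Aiter backend shown in the hunk returns `[MultipleOf(16)]` unconditionally, so for it the refactor is purely mechanical; the method form exists so that backends whose constraint depends on the model (the FlashAttention case named in the title) can share the same interface.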