[Attention] Use FA4 for MLA prefill (#34732)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
This commit is contained in:
Matthew Bonanni
2026-03-12 12:10:17 -04:00
committed by GitHub
parent 85199f9681
commit f444c05c32
9 changed files with 413 additions and 78 deletions

View File

@@ -125,11 +125,14 @@ def get_flash_attn_version(
# FA4 on SM100 (Blackwell) has TMEM capacity limits that restrict
# supported head dimensions.
# See: https://github.com/Dao-AILab/flash-attention/issues/1959
# Exception: head_size 192 is supported for MLA's differing-head-dim case
# (qk head dim 192, v head dim 128), added upstream in commits
# 1a15733e/1b36ab19.
if (
fa_version == 4
and device_capability.major >= 10
and head_size is not None
and head_size > 128
and head_size != 192
):
logger.warning_once(
"FA4 on Blackwell does not support head_size=%d due to TMEM "