[Attention] Use FA4 for MLA prefill (#34732)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
This commit is contained in:
@@ -125,11 +125,14 @@ def get_flash_attn_version(
|
||||
# FA4 on SM100 (Blackwell) has TMEM capacity limits that restrict
|
||||
# supported head dimensions.
|
||||
# See: https://github.com/Dao-AILab/flash-attention/issues/1959
|
||||
# Exception: head dim 192 is supported for MLA's differing QK/V head-dim case
|
||||
# (qk=192, v=128), added upstream in commits 1a15733e/1b36ab19.
|
||||
if (
|
||||
fa_version == 4
|
||||
and device_capability.major >= 10
|
||||
and head_size is not None
|
||||
and head_size > 128
|
||||
and head_size != 192
|
||||
):
|
||||
logger.warning_once(
|
||||
"FA4 on Blackwell does not support head_size=%d due to TMEM "
|
||||
|
||||
Reference in New Issue
Block a user