[Attention] Flash Attention 3 - fp8 (#14570)

Signed-off-by: Mickael Seznec <mickael@mistral.ai>
Author: Mickaël Seznec
Date: 2025-03-20 06:14:20 +01:00
Committed by: GitHub
Parent: ae65f3e237
Commit: a597a57595
15 changed files with 272 additions and 76 deletions


@@ -1558,7 +1558,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 block_size=block_size,
 num_kv_heads=attn_module.num_kv_heads,
 head_size=attn_module.head_size,
-dtype=attn_module.dtype,
+dtype=self.kv_cache_dtype,
 use_mla=use_mla)
 elif attn_module.attn_type in (AttentionType.ENCODER,
 AttentionType.ENCODER_ONLY):
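
The hunk above changes which dtype the decoder KV cache spec is built from: the runner-wide kv_cache_dtype rather than the attention module's compute dtype. That distinction matters for the FA3 fp8 path, where the cache is allocated in fp8 while the model itself still computes in bf16/fp16. The following is a minimal sketch of that resolution logic, assuming a hypothetical mapping table and helper name (not vLLM's actual API):

import torch

# Hypothetical mapping from a --kv-cache-dtype string to a torch dtype;
# "auto" falls back to the attention module's own compute dtype.
_KV_CACHE_DTYPE_MAP = {
    "auto": None,
    "fp8": torch.float8_e4m3fn,
    "fp8_e4m3": torch.float8_e4m3fn,
    "fp8_e5m2": torch.float8_e5m2,
}

def resolve_kv_cache_dtype(kv_cache_dtype: str,
                           module_dtype: torch.dtype) -> torch.dtype:
    """Return the dtype the KV cache blocks are actually allocated in."""
    mapped = _KV_CACHE_DTYPE_MAP[kv_cache_dtype]
    return module_dtype if mapped is None else mapped

# With an fp8 cache, attn_module.dtype (e.g. bf16) and the cache dtype
# diverge, which is why the spec must be built from self.kv_cache_dtype.
assert resolve_kv_cache_dtype("fp8", torch.bfloat16) is torch.float8_e4m3fn
assert resolve_kv_cache_dtype("auto", torch.bfloat16) is torch.bfloat16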