[Attention] Flash Attention 3 - fp8 (#14570)
Signed-off-by: Mickael Seznec <mickael@mistral.ai>
@@ -1558,7 +1558,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                     block_size=block_size,
                     num_kv_heads=attn_module.num_kv_heads,
                     head_size=attn_module.head_size,
-                    dtype=attn_module.dtype,
+                    dtype=self.kv_cache_dtype,
                     use_mla=use_mla)
             elif attn_module.attn_type in (AttentionType.ENCODER,
                                            AttentionType.ENCODER_ONLY):
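Context for the one-line change: with FP8 support in Flash Attention 3, the KV-cache spec should describe the cache's storage dtype (self.kv_cache_dtype, which may be an fp8 type when an fp8 KV cache is configured) rather than the attention module's compute dtype (attn_module.dtype). The sketch below illustrates why the distinction matters for per-block cache sizing; the AttentionSpec dataclass, the page-size arithmetic, and the concrete shapes are illustrative assumptions, not vLLM's actual classes or values.

from dataclasses import dataclass

import torch


@dataclass
class AttentionSpec:
    """Hypothetical stand-in for a KV-cache spec (not vLLM's real class)."""
    block_size: int
    num_kv_heads: int
    head_size: int
    dtype: torch.dtype  # storage dtype of the KV cache, not the compute dtype

    @property
    def page_size_bytes(self) -> int:
        # 2x accounts for the separate K and V planes of one cache block.
        return (2 * self.block_size * self.num_kv_heads * self.head_size
                * self.dtype.itemsize)


# Spec built from the module's compute dtype (bf16), as before the change.
compute_spec = AttentionSpec(block_size=16, num_kv_heads=8, head_size=128,
                             dtype=torch.bfloat16)
# Spec built from the configured KV-cache dtype (fp8), as after the change.
fp8_spec = AttentionSpec(block_size=16, num_kv_heads=8, head_size=128,
                         dtype=torch.float8_e4m3fn)

print(compute_spec.page_size_bytes)  # 65536 bytes per block
print(fp8_spec.page_size_bytes)      # 32768 bytes per block

If the spec kept reporting the compute dtype, the cache would be sized as if each element were two bytes even though fp8 storage uses one, so building the spec from self.kv_cache_dtype keeps the memory accounting consistent with what the FA3 fp8 path actually stores.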