[V1] Enable V1 Fp8 cache for FA3 in the oracle (#15191)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
Lucas Wilkinson
2025-03-23 18:07:04 -04:00
committed by GitHub
parent 9c5c81b0da
commit dccf535f8e
9 changed files with 45 additions and 23 deletions

View File

@@ -1562,9 +1562,20 @@ class EngineArgs:
# No Fp8 KV cache so far.
if self.kv_cache_dtype != "auto":
_raise_or_fallback(feature_name="--kv-cache-dtype",
recommend_to_remove=False)
return False
fp8_attention = self.kv_cache_dtype.startswith("fp8")
will_use_fa = (
current_platform.is_cuda()
and not envs.is_set("VLLM_ATTENTION_BACKEND")
) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
supported = False
if fp8_attention and will_use_fa:
from vllm.vllm_flash_attn.fa_utils import (
flash_attn_supports_fp8)
supported = flash_attn_supports_fp8()
if not supported:
_raise_or_fallback(feature_name="--kv-cache-dtype",
recommend_to_remove=False)
return False
# No Prompt Adapter so far.
if self.enable_prompt_adapter: