[V1] Enable V1 Fp8 cache for FA3 in the oracle (#15191)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
@@ -1562,9 +1562,20 @@ class EngineArgs:
|
||||
|
||||
# No Fp8 KV cache so far.
|
||||
if self.kv_cache_dtype != "auto":
|
||||
_raise_or_fallback(feature_name="--kv-cache-dtype",
|
||||
recommend_to_remove=False)
|
||||
return False
|
||||
fp8_attention = self.kv_cache_dtype.startswith("fp8")
|
||||
will_use_fa = (
|
||||
current_platform.is_cuda()
|
||||
and not envs.is_set("VLLM_ATTENTION_BACKEND")
|
||||
) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
|
||||
supported = False
|
||||
if fp8_attention and will_use_fa:
|
||||
from vllm.vllm_flash_attn.fa_utils import (
|
||||
flash_attn_supports_fp8)
|
||||
supported = flash_attn_supports_fp8()
|
||||
if not supported:
|
||||
_raise_or_fallback(feature_name="--kv-cache-dtype",
|
||||
recommend_to_remove=False)
|
||||
return False
|
||||
|
||||
# No Prompt Adapter so far.
|
||||
if self.enable_prompt_adapter:
|
||||
|
||||
Reference in New Issue
Block a user