[TPU] support fp8 kv cache quantization (#19292)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
@@ -1358,10 +1358,10 @@ class EngineArgs:
                 and not envs.is_set("VLLM_ATTENTION_BACKEND")
             ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
             supported = False
-            if current_platform.is_rocm() or (
-                    current_platform.is_cuda()
-                    and current_platform.is_device_capability(100)
-            ):  # handle hpu also for OOT platform
+            if (current_platform.is_rocm()
+                    or (current_platform.is_cuda()
+                        and current_platform.is_device_capability(100))
+                    or current_platform.is_tpu()):
                 supported = True
             elif fp8_attention and will_use_fa:
                 from vllm.attention.utils.fa_utils import (
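As a usage sketch (not part of this diff): with this change, an fp8 KV cache can be requested on a TPU host through the regular kv_cache_dtype engine argument, the same path the modified check above validates. The model name and sampling settings below are illustrative assumptions, not taken from the commit.

# Minimal sketch: requesting an fp8 KV cache via the offline LLM entrypoint.
# Model name and sampling settings are illustrative assumptions.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", kv_cache_dtype="fp8")
outputs = llm.generate(["Hello, TPU!"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)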