[TPU] support fp8 kv cache quantization (#19292)

Signed-off-by: Chengji Yao <chengjiyao@google.com>
Author: Chengji Yao
Date: 2025-07-19 20:01:00 -07:00
Committed by: GitHub
parent 2b504eb770
commit 3a1d8940ae
6 changed files with 94 additions and 27 deletions


@@ -1358,10 +1358,10 @@ class EngineArgs:
             and not envs.is_set("VLLM_ATTENTION_BACKEND")
         ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
         supported = False
-        if current_platform.is_rocm() or (
-                current_platform.is_cuda()
-                and current_platform.is_device_capability(100)
-        ):  # handle hpu also for OOT platform
+        if (current_platform.is_rocm()
+                or (current_platform.is_cuda()
+                    and current_platform.is_device_capability(100))
+                or current_platform.is_tpu()):
             supported = True
         elif fp8_attention and will_use_fa:
             from vllm.attention.utils.fa_utils import (
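
For context (not part of the diff above): a minimal usage sketch of what this commit enables, assuming a vLLM build with TPU support. The model name is illustrative; kv_cache_dtype="fp8" is the existing engine argument that the V1 support check above now accepts on TPU.

# Minimal sketch, assuming a TPU-enabled vLLM installation.
# The model below is a hypothetical example, not taken from this commit.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # illustrative model choice
    kv_cache_dtype="fp8",                # request an fp8-quantized KV cache
)

outputs = llm.generate(
    ["The capital of France is"],
    SamplingParams(temperature=0.0, max_tokens=16),
)
print(outputs[0].outputs[0].text)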