Fix kv_cache_dtype handling for out-of-tree HPU plugin (#21302)

Signed-off-by: Konrad Zawora <kzawora@habana.ai>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Chendi.Xue <chendi.xue@intel.com>
commit c17231e827 (parent 6e5b5ca580)
Author: Konrad Zawora
Date:   2025-07-22 08:35:14 +02:00
Committed by: GitHub

5 changed files with 30 additions and 16 deletions

@@ -1352,22 +1352,8 @@ class EngineArgs:
 # No Fp8 KV cache so far.
 if self.kv_cache_dtype != "auto":
-    fp8_attention = self.kv_cache_dtype.startswith("fp8")
-    will_use_fa = (
-        current_platform.is_cuda()
-        and not envs.is_set("VLLM_ATTENTION_BACKEND")
-    ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
-    supported = False
-    if (current_platform.is_rocm()
-            or (current_platform.is_cuda()
-                and current_platform.is_device_capability(100))
-            or current_platform.is_tpu()):
-        supported = True
-    elif fp8_attention and will_use_fa:
-        from vllm.attention.utils.fa_utils import (
-            flash_attn_supports_fp8)
-        supported = flash_attn_supports_fp8()
+    supported = current_platform.is_kv_cache_dtype_supported(
+        self.kv_cache_dtype)
     if not supported:
         _raise_or_fallback(feature_name="--kv-cache-dtype",
                            recommend_to_remove=False)