[V0 deprecation] Deprecate use_v1 parameter (#28112)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
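In short: the V1 engine is now the only engine, so each platform's attention-backend selection hook no longer takes a `use_v1` flag, and the `if not use_v1` guards it fed are deleted as dead code. A minimal before/after sketch of the hook's signature, assuming the hook is named `get_attn_backend_cls` and also takes `selected_backend` and `head_size` (the method name and `head_size` are not visible in the hunks below; `selected_backend` only appears in the hunk bodies):

    import torch

    class Platform:
        @classmethod
        def get_attn_backend_cls(  # assumed name; only the parameters below appear in the diff
            cls,
            selected_backend,      # referenced in the hunk bodies; assumed to be a parameter
            head_size: int,        # assumed; not visible in the diff
            dtype: torch.dtype,
            kv_cache_dtype: "CacheDType | None",
            block_size: int,
            # use_v1: bool,        # removed by this commit
            use_mla: bool,
            has_sink: bool,
            use_sparse: bool,
        ) -> str: ...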
@@ -131,7 +131,6 @@ class CpuPlatform(Platform):
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
@@ -144,8 +143,6 @@ class CpuPlatform(Platform):
             raise NotImplementedError("MLA is not supported on CPU.")
         if use_sparse:
             raise NotImplementedError("Sparse Attention is not supported on CPU.")
-        if not use_v1:
-            raise ValueError("CPU backend only supports V1.")
         return AttentionBackendEnum.CPU_ATTN.get_path()

     @classmethod
@@ -336,17 +336,10 @@ class CudaPlatformBase(Platform):
         dtype: torch.dtype,
         kv_cache_dtype: "CacheDType | None",
         block_size: int | None,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
     ) -> str:
-        if not use_v1:
-            raise RuntimeError(
-                "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
-                "to select a supported backend."
-            )
-
         device_capability = cls.get_device_capability()
         assert device_capability is not None

@@ -215,7 +215,6 @@ class Platform:
         dtype: torch.dtype,
         kv_cache_dtype: "CacheDType | None",
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
@@ -213,7 +213,6 @@ class RocmPlatform(Platform):
         dtype,
         kv_cache_dtype,
         block_size,
-        use_v1,
         use_mla,
         has_sink,
         use_sparse,
@@ -224,12 +223,6 @@ class RocmPlatform(Platform):
         if use_sparse:
             raise NotImplementedError("Sparse Attention is not supported on ROCm.")

-        if not use_v1:
-            raise RuntimeError(
-                "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
-                "to select a supported backend."
-            )
-
         if use_mla:
             if selected_backend is None:
                 selected_backend = (
@@ -58,7 +58,6 @@ class TpuPlatform(Platform):
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink,
         use_sparse,
@@ -70,8 +69,6 @@ class TpuPlatform(Platform):
         if selected_backend != AttentionBackendEnum.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)

-        if not use_v1:
-            raise ValueError("TPU backend only supports V1.")
         logger.info("Using Pallas V1 backend.")
         return AttentionBackendEnum.PALLAS.get_path()

@@ -48,7 +48,6 @@ class XPUPlatform(Platform):
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse,
@@ -76,7 +75,7 @@ class XPUPlatform(Platform):
         elif selected_backend:
             raise ValueError(
                 f"Invalid attention backend for {cls.device_name}, "
-                f"with use_v1: {use_v1} use_mla: {use_mla}"
+                f"with use_mla: {use_mla}"
             )

         logger.info("Using Flash Attention backend.")
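Caller-side impact is just the narrower argument list: anything that invokes the hook simply stops passing `use_v1`. A hedged usage sketch under the same assumptions as above (the `current_platform` accessor and the `head_size` parameter are not shown in this diff):

    import torch
    from vllm.platforms import current_platform  # assumed accessor

    backend_path = current_platform.get_attn_backend_cls(
        selected_backend=None,  # let the platform choose its default backend
        head_size=128,          # assumed parameter; not visible in the diff
        dtype=torch.bfloat16,
        kv_cache_dtype=None,
        block_size=16,
        use_mla=False,          # note: no use_v1 keyword anymore
        has_sink=False,
        use_sparse=False,
    )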