[V0 deprecation] Remove VLLM_USE_V1 usage in most modules (#27955)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
wangxiyuan
2025-11-05 12:51:16 +08:00
committed by GitHub
parent 878fd5a16f
commit 428bc7bf1c
19 changed files with 107 additions and 238 deletions


@@ -134,16 +134,11 @@ def get_attn_backend(
     use_sparse: bool = False,
 ) -> type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
-    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-    # value to be returned from the cache if the value changes between calls.
-    # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-    # private function.
     return _cached_get_attn_backend(
         head_size=head_size,
         dtype=dtype,
         kv_cache_dtype=kv_cache_dtype,
         block_size=block_size,
-        use_v1=envs.VLLM_USE_V1,
         use_mla=use_mla,
         has_sink=has_sink,
         use_sparse=use_sparse,
@@ -156,7 +151,6 @@ def _cached_get_attn_backend(
     dtype: torch.dtype,
     kv_cache_dtype: str | None,
     block_size: int,
-    use_v1: bool = False,
     use_mla: bool = False,
     has_sink: bool = False,
     use_sparse: bool = False,
@@ -199,7 +193,7 @@ def _cached_get_attn_backend(
         dtype,
         kv_cache_dtype,
         block_size,
-        use_v1,
+        True,
         use_mla,
         has_sink,
         use_sparse,
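
For context, the comment deleted in the first hunk documented a caching pitfall: if an @lru_cache-decorated function reads an environment-derived flag inside its body, the first call bakes that value into the cached result and later changes to the flag are silently ignored. The old code worked around this by reading envs.VLLM_USE_V1 in the public wrapper and passing it as an argument, so the flag became part of the cache key. The sketch below is a minimal, hypothetical illustration of both patterns (the names _cached_pick_backend and _cached_pick_backend_explicit are not vLLM code); with V0 removed, the flag is effectively constant and no longer needs to be threaded through the cache at all.

    import os
    from functools import lru_cache


    @lru_cache(maxsize=None)
    def _cached_pick_backend(head_size: int) -> str:
        # Pitfall: the flag is read inside the cached function, so it is only
        # observed on the first (uncached) call for a given head_size.
        use_v1 = os.environ.get("VLLM_USE_V1", "1") == "1"
        return f"v1-backend-{head_size}" if use_v1 else f"v0-backend-{head_size}"


    @lru_cache(maxsize=None)
    def _cached_pick_backend_explicit(head_size: int, use_v1: bool) -> str:
        # Pre-change workaround: the caller reads the flag and passes it in, so a
        # changed flag becomes a different cache key instead of a stale hit.
        return f"v1-backend-{head_size}" if use_v1 else f"v0-backend-{head_size}"


    if __name__ == "__main__":
        os.environ["VLLM_USE_V1"] = "1"
        print(_cached_pick_backend(64))                          # v1-backend-64
        os.environ["VLLM_USE_V1"] = "0"
        print(_cached_pick_backend(64))                          # still v1-backend-64 (stale cache hit)
        print(_cached_pick_backend_explicit(64, use_v1=False))   # v0-backend-64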