[V0 deprecation] Remove VLLM_USE_V1 usage in most modules (#27955)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
wangxiyuan
2025-11-05 12:51:16 +08:00
committed by GitHub
parent 878fd5a16f
commit 428bc7bf1c
19 changed files with 107 additions and 238 deletions


@@ -134,16 +134,11 @@ def get_attn_backend(
     use_sparse: bool = False,
 ) -> type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
-    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-    # value to be returned from the cache if the value changes between calls.
-    # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-    # private function.
     return _cached_get_attn_backend(
         head_size=head_size,
         dtype=dtype,
         kv_cache_dtype=kv_cache_dtype,
         block_size=block_size,
-        use_v1=envs.VLLM_USE_V1,
         use_mla=use_mla,
         has_sink=has_sink,
         use_sparse=use_sparse,
@@ -156,7 +151,6 @@ def _cached_get_attn_backend(
     dtype: torch.dtype,
     kv_cache_dtype: str | None,
     block_size: int,
-    use_v1: bool = False,
     use_mla: bool = False,
     has_sink: bool = False,
     use_sparse: bool = False,
@@ -199,7 +193,7 @@ def _cached_get_attn_backend(
         dtype,
         kv_cache_dtype,
         block_size,
-        use_v1,
+        True,
         use_mla,
         has_sink,
         use_sparse,
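
For context, the comment deleted in the first hunk documented a caching pitfall: if an @lru_cache-decorated function reads an environment-derived flag inside its body, the first call bakes that value into the cached result and later changes to the flag are silently ignored. The old code worked around this by reading envs.VLLM_USE_V1 in the public wrapper and passing it as an argument, so the flag became part of the cache key. The sketch below is a minimal, hypothetical illustration of both patterns (the names _cached_pick_backend and _cached_pick_backend_explicit are not vLLM code); with V0 removed, the flag is effectively constant and no longer needs to be threaded through the cache at all.

    import os
    from functools import lru_cache


    @lru_cache(maxsize=None)
    def _cached_pick_backend(head_size: int) -> str:
        # Pitfall: the flag is read inside the cached function, so it is only
        # observed on the first (uncached) call for a given head_size.
        use_v1 = os.environ.get("VLLM_USE_V1", "1") == "1"
        return f"v1-backend-{head_size}" if use_v1 else f"v0-backend-{head_size}"


    @lru_cache(maxsize=None)
    def _cached_pick_backend_explicit(head_size: int, use_v1: bool) -> str:
        # Pre-change workaround: the caller reads the flag and passes it in, so a
        # changed flag becomes a different cache key instead of a stale hit.
        return f"v1-backend-{head_size}" if use_v1 else f"v0-backend-{head_size}"


    if __name__ == "__main__":
        os.environ["VLLM_USE_V1"] = "1"
        print(_cached_pick_backend(64))                          # v1-backend-64
        os.environ["VLLM_USE_V1"] = "0"
        print(_cached_pick_backend(64))                          # still v1-backend-64 (stale cache hit)
        print(_cached_pick_backend_explicit(64, use_v1=False))   # v0-backend-64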