[Deprecation] Remove deprecated environment variables (#32812)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Wentao Ye
2026-01-21 21:25:16 -05:00 (committed by GitHub)
parent 5e00b561cd
commit 6437ff1fb9
6 changed files with 7 additions and 118 deletions
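The environment variables removed here map onto AttentionConfig fields, so migration is mostly mechanical. The sketch below is illustrative and not part of the commit: the import path of AttentionConfig and the exact CLI spelling are assumptions, while the field names and the --attention-config.* hint come from the deprecation warning that this commit deletes.

# Before (environment variables removed by this commit):
#   VLLM_ATTENTION_BACKEND=FLASHINFER VLLM_FLASH_ATTN_VERSION=3 vllm serve ...
#
# After: express the same intent through AttentionConfig.
from vllm.config import AttentionConfig  # import path assumed for illustration

attention_config = AttentionConfig(
    backend="FLASHINFER",  # was VLLM_ATTENTION_BACKEND; the field validator
                           # upper-cases strings into AttentionBackendEnum
    flash_attn_version=3,  # was VLLM_FLASH_ATTN_VERSION
)

On the command line, the deleted warning pointed users at the equivalent --attention-config.backend and --attention-config.flash_attn_version arguments.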

View File

@@ -7,11 +7,8 @@ from pydantic import field_validator
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.v1.attention.backends.registry import AttentionBackendEnum
logger = init_logger(__name__)
@config
@dataclass
@@ -69,46 +66,3 @@ class AttentionConfig:
        if isinstance(value, str):
            return AttentionBackendEnum[value.upper()]
        return value

    def _set_from_env_if_set(self, field_name: str, env_var_name: str) -> None:
        """Set field from env var if set, with deprecation warning."""
        from vllm import envs

        if envs.is_set(env_var_name):
            value = getattr(envs, env_var_name)
            if field_name == "backend":
                value = self.validate_backend_before(value)
            setattr(self, field_name, value)
            logger.warning_once(
                "Using %s environment variable is deprecated and will be removed in "
                "v0.14.0 or v1.0.0, whichever is soonest. Please use "
                "--attention-config.%s command line argument or "
                "AttentionConfig(%s=...) config field instead.",
                env_var_name,
                field_name,
                field_name,
            )

    def __post_init__(self) -> None:
        self._set_from_env_if_set("backend", "VLLM_ATTENTION_BACKEND")
        self._set_from_env_if_set("flash_attn_version", "VLLM_FLASH_ATTN_VERSION")
        self._set_from_env_if_set(
            "use_prefill_decode_attention", "VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
        )
        self._set_from_env_if_set(
            "flash_attn_max_num_splits_for_cuda_graph",
            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
        )
        self._set_from_env_if_set("use_cudnn_prefill", "VLLM_USE_CUDNN_PREFILL")
        self._set_from_env_if_set(
            "use_trtllm_ragged_deepseek_prefill",
            "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL",
        )
        self._set_from_env_if_set("use_trtllm_attention", "VLLM_USE_TRTLLM_ATTENTION")
        self._set_from_env_if_set(
            "disable_flashinfer_prefill", "VLLM_DISABLE_FLASHINFER_PREFILL"
        )
        self._set_from_env_if_set(
            "disable_flashinfer_q_quantization",
            "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
        )
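With the shim above removed, these variables are no longer consulted when building AttentionConfig. As a migration aid (not part of this commit), a throwaway check like the following can flag stale settings; the names are exactly the ones the deleted __post_init__ used to read.

import os

# Environment variables this commit stops reading (taken from the removed __post_init__).
REMOVED_ENV_VARS = [
    "VLLM_ATTENTION_BACKEND",
    "VLLM_FLASH_ATTN_VERSION",
    "VLLM_V1_USE_PREFILL_DECODE_ATTENTION",
    "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
    "VLLM_USE_CUDNN_PREFILL",
    "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL",
    "VLLM_USE_TRTLLM_ATTENTION",
    "VLLM_DISABLE_FLASHINFER_PREFILL",
    "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
]

for name in REMOVED_ENV_VARS:
    if name in os.environ:
        print(f"{name} is set but no longer read; use the matching AttentionConfig field instead.")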

View File

@@ -20,8 +20,6 @@ if TYPE_CHECKING:
    VLLM_NCCL_SO_PATH: str | None = None
    LD_LIBRARY_PATH: str | None = None
    VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE: int = 256
    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
    VLLM_FLASH_ATTN_VERSION: int | None = None
    LOCAL_RANK: int = 0
    CUDA_VISIBLE_DEVICES: str | None = None
    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
@@ -36,7 +34,6 @@ if TYPE_CHECKING:
    VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
    VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
    VLLM_NO_USAGE_STATS: bool = False
    VLLM_DISABLE_FLASHINFER_PREFILL: bool = False
    VLLM_DO_NOT_TRACK: bool = False
    VLLM_USAGE_SOURCE: str = ""
    VLLM_CONFIGURE_LOGGING: bool = True
@@ -48,7 +45,6 @@ if TYPE_CHECKING:
    NO_COLOR: bool = False
    VLLM_LOG_STATS_INTERVAL: float = 10.0
    VLLM_TRACE_FUNCTION: int = 0
    VLLM_ATTENTION_BACKEND: str | None = None
    VLLM_USE_FLASHINFER_SAMPLER: bool | None = None
    VLLM_PP_LAYER_PARTITION: str | None = None
    VLLM_CPU_KVCACHE_SPACE: int | None = 0
@@ -142,7 +138,6 @@ if TYPE_CHECKING:
    VLLM_SERVER_DEV_MODE: bool = False
    VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
    VLLM_MLA_DISABLE: bool = False
    VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32
    VLLM_RAY_PER_WORKER_GPUS: float = 1.0
    VLLM_RAY_BUNDLE_INDICES: str = ""
    VLLM_CUDART_SO_PATH: str | None = None
@@ -214,15 +209,11 @@ if TYPE_CHECKING:
    VLLM_MORIIO_POST_BATCH_SIZE: int = -1
    VLLM_MORIIO_NUM_WORKERS: int = 1
    VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT: int = 480
    VLLM_USE_CUDNN_PREFILL: bool = False
    VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL: bool = False
    VLLM_ENABLE_CUDAGRAPH_GC: bool = False
    VLLM_LOOPBACK_IP: str = ""
    VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = True
    VLLM_ENABLE_RESPONSES_API_STORE: bool = False
    VLLM_USE_TRTLLM_ATTENTION: str | None = None
    VLLM_NVFP4_GEMM_BACKEND: str | None = None
    VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False
    VLLM_HAS_FLASHINFER_CUBIN: bool = False
    VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
    VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
@@ -592,17 +583,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE": lambda: int(
os.environ.get("VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE", "256")
),
# Use separate prefill and decode kernels for V1 attention instead of
# the unified triton kernel.
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: (
os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower()
in ("true", "1")
),
# Force vllm to use a specific flash-attention version (2 or 3), only valid
# when using the flash-attention backend.
"VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(
os.environ.get("VLLM_FLASH_ATTN_VERSION", None)
),
# Feature flag to enable/disable Inductor standalone compile.
# In torch <= 2.7 we ignore this flag; in torch >= 2.9 this is
# enabled by default.
@@ -668,10 +648,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"
),
"VLLM_NO_USAGE_STATS": lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
"VLLM_DISABLE_FLASHINFER_PREFILL": lambda: os.environ.get(
"VLLM_DISABLE_FLASHINFER_PREFILL", "0"
)
== "1",
"VLLM_DO_NOT_TRACK": lambda: (
os.environ.get("VLLM_DO_NOT_TRACK", None)
or os.environ.get("DO_NOT_TRACK", None)
@@ -707,25 +683,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # If set to 1, vllm will trace function calls
    # Useful for debugging
    "VLLM_TRACE_FUNCTION": lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
    # Backend for attention computation
    # Example options:
    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
    # - "FLASH_ATTN": use FlashAttention
    # - "FLASHINFER": use flashinfer
    # - "FLASHMLA": use FlashMLA
    # - "FLASH_ATTN_MLA": use FlashAttention for MLA
    # - "FLASHINFER_MLA": use FlashInfer for MLA
    # - "CUTLASS_MLA": use CUTLASS for MLA
    # All possible options loaded dynamically from AttentionBackendEnum
    "VLLM_ATTENTION_BACKEND": env_with_choices(
        "VLLM_ATTENTION_BACKEND",
        None,
        lambda: list(
            __import__(
                "vllm.v1.attention.backends.registry", fromlist=["AttentionBackendEnum"]
            ).AttentionBackendEnum.__members__.keys()
        ),
    ),
    # If set, vllm will use flashinfer sampler
    "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
        int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])
@@ -1127,10 +1084,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # If set, vLLM will disable the MLA attention optimizations.
    "VLLM_MLA_DISABLE": lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
    # If set, vLLM will pick up the provided Flash Attention MLA
    # max number splits for cuda graph decode
    "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": lambda: int(
        os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", "32")
    ),
    # Number of GPUs per worker in Ray, if it is set to be a fraction,
    # it allows ray to schedule multiple actors on a single GPU,
    # so that users can colocate other actors on the same GPUs as vLLM.
@@ -1464,26 +1417,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT": lambda: int(
os.getenv("VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT", "480")
),
# Controls whether or not to use cudnn prefill
"VLLM_USE_CUDNN_PREFILL": lambda: bool(
int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))
),
# Controls whether to use TRT-LLM ragged DeepSeek prefill
"VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL": lambda: bool(
int(os.getenv("VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL", "0"))
),
# If set to 1/True, use the TRTLLM attention backend in flashinfer.
# If set to 0/False, use the default attention backend in flashinfer.
# If not set, auto-detect the attention backend in flashinfer.
"VLLM_USE_TRTLLM_ATTENTION": lambda: (
None
if "VLLM_USE_TRTLLM_ATTENTION" not in os.environ
else os.environ["VLLM_USE_TRTLLM_ATTENTION"].lower() in ("1", "true")
),
# If set to 1, when we use fp8 kv, we do not quantize Q to fp8
"VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION": lambda: bool(
int(os.getenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "0"))
),
# If set, it means we pre-downloaded cubin files and flashinfer will
# read the cubin files directly.
"VLLM_HAS_FLASHINFER_CUBIN": lambda: bool(

View File

@@ -287,7 +287,10 @@ class RocmPlatform(Platform):
             return AttentionBackendEnum.ROCM_AITER_FA.get_path()
         # Priority 3: Check for ROCM_ATTN (prefill-decode split)
-        if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
+        from vllm.config import get_current_vllm_config
+        vllm_config = get_current_vllm_config()
+        if vllm_config.attention_config.use_prefill_decode_attention:
             logger.info("Using Rocm Attention backend.")
             return AttentionBackendEnum.ROCM_ATTN.get_path()
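The pattern above, reading the setting from the current vllm config rather than from vllm.envs, is how other call sites migrate as well. A minimal sketch with a hypothetical helper name; the accessor chain mirrors the RocmPlatform change and the field comes from AttentionConfig.

from vllm.config import get_current_vllm_config


def _use_cudnn_prefill() -> bool:  # hypothetical helper, for illustration only
    # Previously this would have returned envs.VLLM_USE_CUDNN_PREFILL.
    vllm_config = get_current_vllm_config()
    return vllm_config.attention_config.use_cudnn_prefill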

View File

@@ -37,7 +37,6 @@ _GLOBAL_RUNTIME_DATA = dict[str, str | int | bool]()
_USAGE_ENV_VARS_TO_COLLECT = [
"VLLM_USE_MODELSCOPE",
"VLLM_ATTENTION_BACKEND",
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_PP_LAYER_PARTITION",
"VLLM_USE_TRITON_AWQ",