[Deprecation] Remove deprecated environment variables (#32812)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
@@ -7,11 +7,8 @@ from pydantic import field_validator
 from pydantic.dataclasses import dataclass
 
 from vllm.config.utils import config
-from vllm.logger import init_logger
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
-logger = init_logger(__name__)
-
 
 @config
 @dataclass
@@ -69,46 +66,3 @@ class AttentionConfig:
         if isinstance(value, str):
             return AttentionBackendEnum[value.upper()]
         return value
-
-    def _set_from_env_if_set(self, field_name: str, env_var_name: str) -> None:
-        """Set field from env var if set, with deprecation warning."""
-        from vllm import envs
-
-        if envs.is_set(env_var_name):
-            value = getattr(envs, env_var_name)
-            if field_name == "backend":
-                value = self.validate_backend_before(value)
-            setattr(self, field_name, value)
-            logger.warning_once(
-                "Using %s environment variable is deprecated and will be removed in "
-                "v0.14.0 or v1.0.0, whichever is soonest. Please use "
-                "--attention-config.%s command line argument or "
-                "AttentionConfig(%s=...) config field instead.",
-                env_var_name,
-                field_name,
-                field_name,
-            )
-
-    def __post_init__(self) -> None:
-        self._set_from_env_if_set("backend", "VLLM_ATTENTION_BACKEND")
-        self._set_from_env_if_set("flash_attn_version", "VLLM_FLASH_ATTN_VERSION")
-        self._set_from_env_if_set(
-            "use_prefill_decode_attention", "VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
-        )
-        self._set_from_env_if_set(
-            "flash_attn_max_num_splits_for_cuda_graph",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
-        )
-        self._set_from_env_if_set("use_cudnn_prefill", "VLLM_USE_CUDNN_PREFILL")
-        self._set_from_env_if_set(
-            "use_trtllm_ragged_deepseek_prefill",
-            "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL",
-        )
-        self._set_from_env_if_set("use_trtllm_attention", "VLLM_USE_TRTLLM_ATTENTION")
-        self._set_from_env_if_set(
-            "disable_flashinfer_prefill", "VLLM_DISABLE_FLASHINFER_PREFILL"
-        )
-        self._set_from_env_if_set(
-            "disable_flashinfer_q_quantization",
-            "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
-        )
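Note: for downstream code, the removed warning above spells out the migration path. A minimal sketch of the replacement, assuming AttentionConfig is importable from vllm.config (the exact module path is not shown in this diff):

    # Deprecated (removed by this commit):
    #   VLLM_ATTENTION_BACKEND=FLASHINFER
    from vllm.config import AttentionConfig  # import path assumed

    # Strings are upper-cased and resolved through AttentionBackendEnum
    # by the validator shown above.
    attention_config = AttentionConfig(backend="FLASHINFER")
    # CLI equivalent, per the warning text: --attention-config.backend=FLASHINFER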
vllm/envs.py
@@ -20,8 +20,6 @@ if TYPE_CHECKING:
     VLLM_NCCL_SO_PATH: str | None = None
     LD_LIBRARY_PATH: str | None = None
     VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE: int = 256
-    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
-    VLLM_FLASH_ATTN_VERSION: int | None = None
     LOCAL_RANK: int = 0
     CUDA_VISIBLE_DEVICES: str | None = None
     VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
@@ -36,7 +34,6 @@ if TYPE_CHECKING:
     VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
     VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
     VLLM_NO_USAGE_STATS: bool = False
-    VLLM_DISABLE_FLASHINFER_PREFILL: bool = False
     VLLM_DO_NOT_TRACK: bool = False
     VLLM_USAGE_SOURCE: str = ""
     VLLM_CONFIGURE_LOGGING: bool = True
@@ -48,7 +45,6 @@ if TYPE_CHECKING:
     NO_COLOR: bool = False
     VLLM_LOG_STATS_INTERVAL: float = 10.0
     VLLM_TRACE_FUNCTION: int = 0
-    VLLM_ATTENTION_BACKEND: str | None = None
     VLLM_USE_FLASHINFER_SAMPLER: bool | None = None
     VLLM_PP_LAYER_PARTITION: str | None = None
     VLLM_CPU_KVCACHE_SPACE: int | None = 0
@@ -142,7 +138,6 @@ if TYPE_CHECKING:
     VLLM_SERVER_DEV_MODE: bool = False
     VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
     VLLM_MLA_DISABLE: bool = False
-    VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32
     VLLM_RAY_PER_WORKER_GPUS: float = 1.0
     VLLM_RAY_BUNDLE_INDICES: str = ""
     VLLM_CUDART_SO_PATH: str | None = None
@@ -214,15 +209,11 @@ if TYPE_CHECKING:
     VLLM_MORIIO_POST_BATCH_SIZE: int = -1
     VLLM_MORIIO_NUM_WORKERS: int = 1
     VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT: int = 480
-    VLLM_USE_CUDNN_PREFILL: bool = False
-    VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL: bool = False
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = True
     VLLM_ENABLE_RESPONSES_API_STORE: bool = False
-    VLLM_USE_TRTLLM_ATTENTION: str | None = None
     VLLM_NVFP4_GEMM_BACKEND: str | None = None
-    VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False
     VLLM_HAS_FLASHINFER_CUBIN: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
@@ -592,17 +583,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE": lambda: int(
         os.environ.get("VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE", "256")
     ),
-    # Use separate prefill and decode kernels for V1 attention instead of
-    # the unified triton kernel.
-    "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: (
-        os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower()
-        in ("true", "1")
-    ),
-    # Force vllm to use a specific flash-attention version (2 or 3), only valid
-    # when using the flash-attention backend.
-    "VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(
-        os.environ.get("VLLM_FLASH_ATTN_VERSION", None)
-    ),
     # Feature flag to enable/disable Inductor standalone compile.
     # In torch <= 2.7 we ignore this flag; in torch >= 2.9 this is
     # enabled by default.
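Note: the removed VLLM_FLASH_ATTN_VERSION entry parsed its value through maybe_convert_int. A minimal sketch of what such a helper does (the actual implementation in vllm/envs.py may differ):

    def maybe_convert_int(value: str | None) -> int | None:
        # None stays None (env var unset); otherwise parse as int.
        if value is None:
            return None
        return int(value)

    # maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None))
    # -> None when unset, else e.g. 2 or 3.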
@@ -668,10 +648,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
         "VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"
     ),
     "VLLM_NO_USAGE_STATS": lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
-    "VLLM_DISABLE_FLASHINFER_PREFILL": lambda: os.environ.get(
-        "VLLM_DISABLE_FLASHINFER_PREFILL", "0"
-    )
-    == "1",
     "VLLM_DO_NOT_TRACK": lambda: (
         os.environ.get("VLLM_DO_NOT_TRACK", None)
         or os.environ.get("DO_NOT_TRACK", None)
@@ -707,25 +683,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # If set to 1, vllm will trace function calls
     # Useful for debugging
     "VLLM_TRACE_FUNCTION": lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
-    # Backend for attention computation
-    # Example options:
-    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
-    # - "FLASH_ATTN": use FlashAttention
-    # - "FLASHINFER": use flashinfer
-    # - "FLASHMLA": use FlashMLA
-    # - "FLASH_ATTN_MLA": use FlashAttention for MLA
-    # - "FLASHINFER_MLA": use FlashInfer for MLA
-    # - "CUTLASS_MLA": use CUTLASS for MLA
-    # All possible options loaded dynamically from AttentionBackendEnum
-    "VLLM_ATTENTION_BACKEND": env_with_choices(
-        "VLLM_ATTENTION_BACKEND",
-        None,
-        lambda: list(
-            __import__(
-                "vllm.v1.attention.backends.registry", fromlist=["AttentionBackendEnum"]
-            ).AttentionBackendEnum.__members__.keys()
-        ),
-    ),
     # If set, vllm will use flashinfer sampler
     "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
         int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])
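Note: the removed VLLM_ATTENTION_BACKEND entry used env_with_choices with a lazily evaluated choices callable, so the backend registry is only imported when the variable is actually read. A minimal sketch of that pattern (not vLLM's exact implementation):

    import os
    from collections.abc import Callable

    def env_with_choices(
        name: str,
        default: str | None,
        choices: Callable[[], list[str]],
    ) -> Callable[[], str | None]:
        def _read() -> str | None:
            value = os.environ.get(name)
            if value is None:
                return default
            allowed = choices()  # deferred until first read
            if value not in allowed:
                raise ValueError(f"{name}={value!r}; expected one of {allowed}")
            return value
        return _read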
@@ -1127,10 +1084,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # If set, vLLM will disable the MLA attention optimizations.
     "VLLM_MLA_DISABLE": lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
-    # If set, vLLM will pick up the provided Flash Attention MLA
-    # max number splits for cuda graph decode
-    "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": lambda: int(
-        os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", "32")
-    ),
     # Number of GPUs per worker in Ray, if it is set to be a fraction,
     # it allows ray to schedule multiple actors on a single GPU,
     # so that users can colocate other actors on the same GPUs as vLLM.
@@ -1464,26 +1417,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT": lambda: int(
         os.getenv("VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT", "480")
     ),
-    # Controls whether or not to use cudnn prefill
-    "VLLM_USE_CUDNN_PREFILL": lambda: bool(
-        int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))
-    ),
-    # Controls whether to use TRT-LLM ragged DeepSeek prefill
-    "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL": lambda: bool(
-        int(os.getenv("VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL", "0"))
-    ),
-    # If set to 1/True, use the TRTLLM attention backend in flashinfer.
-    # If set to 0/False, use the default attention backend in flashinfer.
-    # If not set, auto-detect the attention backend in flashinfer.
-    "VLLM_USE_TRTLLM_ATTENTION": lambda: (
-        None
-        if "VLLM_USE_TRTLLM_ATTENTION" not in os.environ
-        else os.environ["VLLM_USE_TRTLLM_ATTENTION"].lower() in ("1", "true")
-    ),
-    # If set to 1, when we use fp8 kv, we do not quantize Q to fp8
-    "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION": lambda: bool(
-        int(os.getenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "0"))
-    ),
     # If set, it means we pre-downloaded cubin files and flashinfer will
     # read the cubin files directly.
     "VLLM_HAS_FLASHINFER_CUBIN": lambda: bool(
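Note the tri-state semantics of the removed VLLM_USE_TRTLLM_ATTENTION lambda: unset means auto-detect, not False. The same logic as a standalone sketch:

    import os

    def tri_state_env(name: str) -> bool | None:
        # None when unset (auto-detect); True only for "1"/"true".
        if name not in os.environ:
            return None
        return os.environ[name].lower() in ("1", "true")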
@@ -287,7 +287,10 @@ class RocmPlatform(Platform):
             return AttentionBackendEnum.ROCM_AITER_FA.get_path()
 
         # Priority 3: Check for ROCM_ATTN (prefill-decode split)
-        if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
+        from vllm.config import get_current_vllm_config
+
+        vllm_config = get_current_vllm_config()
+        if vllm_config.attention_config.use_prefill_decode_attention:
             logger.info("Using Rocm Attention backend.")
             return AttentionBackendEnum.ROCM_ATTN.get_path()
 
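This hunk shows the commit's general replacement pattern: backend selection now reads the flag from the live VllmConfig instead of the process environment. A minimal sketch of the lookup (get_current_vllm_config appears in the diff; the wrapper function is assumed for illustration):

    from vllm.config import get_current_vllm_config

    def use_split_prefill_decode() -> bool:
        # Replaces the removed envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION check.
        vllm_config = get_current_vllm_config()
        return vllm_config.attention_config.use_prefill_decode_attention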
@@ -37,7 +37,6 @@ _GLOBAL_RUNTIME_DATA = dict[str, str | int | bool]()
 
 _USAGE_ENV_VARS_TO_COLLECT = [
     "VLLM_USE_MODELSCOPE",
-    "VLLM_ATTENTION_BACKEND",
     "VLLM_USE_FLASHINFER_SAMPLER",
     "VLLM_PP_LAYER_PARTITION",
     "VLLM_USE_TRITON_AWQ",