# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import hashlib
import os
import sys
import tempfile
from typing import TYPE_CHECKING, Any, Callable, Optional

if TYPE_CHECKING:
    VLLM_HOST_IP: str = ""
    VLLM_PORT: Optional[int] = None
    VLLM_RPC_BASE_PATH: str = tempfile.gettempdir()
    VLLM_USE_MODELSCOPE: bool = False
    VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
    VLLM_NCCL_SO_PATH: Optional[str] = None
    LD_LIBRARY_PATH: Optional[str] = None
    VLLM_USE_TRITON_FLASH_ATTN: bool = True
    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
    VLLM_USE_AITER_UNIFIED_ATTENTION: bool = False
    VLLM_FLASH_ATTN_VERSION: Optional[int] = None
    LOCAL_RANK: int = 0
    CUDA_VISIBLE_DEVICES: Optional[str] = None
    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
    VLLM_API_KEY: Optional[str] = None
    S3_ACCESS_KEY_ID: Optional[str] = None
    S3_SECRET_ACCESS_KEY: Optional[str] = None
    S3_ENDPOINT_URL: Optional[str] = None
    VLLM_MODEL_REDIRECT_PATH: Optional[str] = None
    VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm")
    VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
    VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
    VLLM_NO_USAGE_STATS: bool = False
    VLLM_DO_NOT_TRACK: bool = False
    VLLM_USAGE_SOURCE: str = ""
    VLLM_CONFIGURE_LOGGING: int = 1
    VLLM_LOGGING_LEVEL: str = "INFO"
    VLLM_LOGGING_PREFIX: str = ""
    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
    VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
    VLLM_TRACE_FUNCTION: int = 0
    VLLM_ATTENTION_BACKEND: Optional[str] = None
    VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
    VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False
    VLLM_PP_LAYER_PARTITION: Optional[str] = None
    VLLM_CPU_KVCACHE_SPACE: Optional[int] = 0
    VLLM_CPU_OMP_THREADS_BIND: str = ""
    VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None
    VLLM_CPU_MOE_PREPACK: bool = True
    VLLM_CPU_SGL_KERNEL: bool = False
    VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
    VLLM_XLA_CHECK_RECOMPILATION: bool = False
    VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
    VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
    VLLM_USE_RAY_SPMD_WORKER: bool = False
    VLLM_USE_RAY_COMPILED_DAG: bool = False
    VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
    VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
    VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True
    VLLM_XLA_USE_SPMD: bool = False
    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
    VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
    VLLM_IMAGE_FETCH_TIMEOUT: int = 5
    VLLM_VIDEO_FETCH_TIMEOUT: int = 30
    VLLM_AUDIO_FETCH_TIMEOUT: int = 10
    VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8
    VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
    VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
    VLLM_MM_INPUT_CACHE_GIB: int = 4
    VLLM_TARGET_DEVICE: str = "cuda"
    MAX_JOBS: Optional[str] = None
    NVCC_THREADS: Optional[str] = None
    VLLM_USE_PRECOMPILED: bool = False
    VLLM_DOCKER_BUILD_CONTEXT: bool = False
    VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
    VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
    CMAKE_BUILD_TYPE: Optional[str] = None
    VERBOSE: bool = False
    VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
    VLLM_RPC_TIMEOUT: int = 10000  # ms
    VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
    VLLM_PLUGINS: Optional[list[str]] = None
    VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
    VLLM_TORCH_PROFILER_DIR: Optional[str] = None
    VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
    VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
    VLLM_TORCH_PROFILER_WITH_STACK: bool = True
    VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False
    VLLM_USE_TRITON_AWQ: bool = False
    VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
    VLLM_SKIP_P2P_CHECK: bool = False
    VLLM_DISABLED_KERNELS: list[str] = []
    VLLM_USE_V1: bool = True
    VLLM_ROCM_USE_AITER: bool = False
    VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False
    VLLM_ROCM_USE_AITER_LINEAR: bool = True
    VLLM_ROCM_USE_AITER_MOE: bool = True
    VLLM_ROCM_USE_AITER_RMSNORM: bool = True
    VLLM_ROCM_USE_AITER_MLA: bool = True
    VLLM_ROCM_USE_AITER_MHA: bool = True
    VLLM_ROCM_USE_SKINNY_GEMM: bool = True
    VLLM_ROCM_FP8_PADDING: bool = True
    VLLM_ROCM_MOE_PADDING: bool = True
    VLLM_ROCM_CUSTOM_PAGED_ATTN: bool = True
    VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
    VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
    VLLM_DISABLE_COMPILE_CACHE: bool = False
    Q_SCALE_CONSTANT: int = 200
    K_SCALE_CONSTANT: int = 200
    V_SCALE_CONSTANT: int = 100
    VLLM_SERVER_DEV_MODE: bool = False
    VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
    VLLM_MLA_DISABLE: bool = False
    VLLM_RAY_PER_WORKER_GPUS: float = 1.0
    VLLM_RAY_BUNDLE_INDICES: str = ""
    VLLM_CUDART_SO_PATH: Optional[str] = None
    VLLM_DP_RANK: int = 0
    VLLM_DP_RANK_LOCAL: int = -1
    VLLM_DP_SIZE: int = 1
    VLLM_DP_MASTER_IP: str = ""
    VLLM_DP_MASTER_PORT: int = 0
    VLLM_MOE_DP_CHUNK_SIZE: int = 256
    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
    VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
    VLLM_V0_USE_OUTLINES_CACHE: bool = False
    VLLM_V1_USE_OUTLINES_CACHE: bool = False
    VLLM_TPU_BUCKET_PADDING_GAP: int = 0
    VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None
    VLLM_TPU_USING_PATHWAYS: bool = False
    VLLM_USE_DEEP_GEMM: bool = False
    VLLM_USE_DEEP_GEMM_E8M0: bool = True
    VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
    VLLM_USE_FLASHINFER_MOE_FP8: bool = False
    VLLM_USE_FLASHINFER_MOE_FP4: bool = False
    VLLM_FLASHINFER_MOE_BACKEND: str = "throughput"
    VLLM_XGRAMMAR_CACHE_MB: int = 0
    VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
    VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
    VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
    VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
    VLLM_ALL2ALL_BACKEND: str = "naive"
    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
    VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
    VLLM_SLEEP_WHEN_IDLE: bool = False
    VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
    VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
    VLLM_KV_CACHE_LAYOUT: Optional[str] = None
    VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
    VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: str = "NONE"
    VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
    VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
    VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
    VLLM_USE_CUDNN_PREFILL: bool = False
    VLLM_ENABLE_CUDAGRAPH_GC: bool = False
    VLLM_LOOPBACK_IP: str = ""
    VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
    VLLM_ENABLE_RESPONSES_API_STORE: bool = False
    VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None
    VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
    VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
    VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None


def get_default_cache_root():
    return os.getenv(
        "XDG_CACHE_HOME",
        os.path.join(os.path.expanduser("~"), ".cache"),
    )


def get_default_config_root():
    return os.getenv(
        "XDG_CONFIG_HOME",
        os.path.join(os.path.expanduser("~"), ".config"),
    )
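

# Illustrative resolution (hypothetical paths): with XDG_CACHE_HOME=/mnt/cache,
# get_default_cache_root() returns "/mnt/cache", so the VLLM_CACHE_ROOT entry
# below resolves to "/mnt/cache/vllm"; when unset, it falls back to
# "~/.cache/vllm".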


def maybe_convert_int(value: Optional[str]) -> Optional[int]:
    if value is None:
        return None
    return int(value)
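
# Illustrative behavior (example values):
#   maybe_convert_int(None)  -> None
#   maybe_convert_int("3")   -> 3
#   maybe_convert_int("3.5") -> raises ValueError, surfacing malformed values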


def get_vllm_port() -> Optional[int]:
    """Get the port from VLLM_PORT environment variable.

    Returns:
        The port number as an integer if VLLM_PORT is set, None otherwise.

    Raises:
        ValueError: If VLLM_PORT is a URI, which suggests a k8s service
            discovery issue.
    """
    if 'VLLM_PORT' not in os.environ:
        return None

    port = os.getenv('VLLM_PORT', '0')

    try:
        return int(port)
    except ValueError as err:
        from urllib.parse import urlparse
        parsed = urlparse(port)
        if parsed.scheme:
            raise ValueError(
                f"VLLM_PORT '{port}' appears to be a URI. "
                "This may be caused by a Kubernetes service discovery issue,"
                " check the warning in: "
                "https://docs.vllm.ai/en/stable/serving/env_vars.html"
            ) from None
        raise ValueError(
            f"VLLM_PORT '{port}' must be a valid integer") from err

# The begin-* and end-* markers here are used by the documentation generator
# to extract the used env vars.

# --8<-- [start:env-vars-definition]
environment_variables: dict[str, Callable[[], Any]] = {

    # ================== Installation Time Env Vars ==================

    # Target device of vLLM, supporting [cuda (by default),
    # rocm, neuron, cpu]
    "VLLM_TARGET_DEVICE":
    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),

    # Maximum number of compilation jobs to run in parallel.
    # By default this is the number of CPUs
    "MAX_JOBS":
    lambda: os.getenv("MAX_JOBS", None),

    # Number of threads to use for nvcc
    # By default this is 1.
    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
    "NVCC_THREADS":
    lambda: os.getenv("NVCC_THREADS", None),

    # If set, vllm will use precompiled binaries (*.so)
    "VLLM_USE_PRECOMPILED":
    lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in
    ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),

    # Used to mark that setup.py is running in a Docker build context,
    # in order to force the use of precompiled binaries.
    "VLLM_DOCKER_BUILD_CONTEXT":
    lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in
    ("1", "true"),

    # Whether to force using nightly wheel in python build.
    # This is used for testing the nightly wheel in python build.
    "VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL":
    lambda: bool(int(os.getenv("VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL", "0"))
                 ),

    # CMake build type
    # If not set, defaults to "Debug" or "RelWithDebInfo"
    # Available options: "Debug", "Release", "RelWithDebInfo"
    "CMAKE_BUILD_TYPE":
    lambda: os.getenv("CMAKE_BUILD_TYPE"),

    # If set, vllm will print verbose logs during installation
    "VERBOSE":
    lambda: bool(int(os.getenv('VERBOSE', '0'))),

    # Root directory for vLLM configuration files
    # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
    # Note that this not only affects how vllm finds its configuration files
    # during runtime, but also affects how vllm installs its configuration
    # files during **installation**.
    "VLLM_CONFIG_ROOT":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_CONFIG_ROOT",
            os.path.join(get_default_config_root(), "vllm"),
        )),

    # ================== Runtime Env Vars ==================

    # Root directory for vLLM cache files
    # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
    "VLLM_CACHE_ROOT":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_CACHE_ROOT",
            os.path.join(get_default_cache_root(), "vllm"),
        )),

    # used in distributed environment to determine the ip address
    # of the current node, when the node has multiple network interfaces.
    # If you are using multi-node inference, you should set this differently
    # on each node.
    'VLLM_HOST_IP':
    lambda: os.getenv('VLLM_HOST_IP', ""),

    # used in distributed environment to manually set the communication port
    # Note: if VLLM_PORT is set, and some code asks for multiple ports, the
    # VLLM_PORT will be used as the first port, and the rest will be generated
    # by incrementing the VLLM_PORT value.
    'VLLM_PORT':
    get_vllm_port,
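
    # Illustrative example (hypothetical values): with VLLM_PORT=6000, code
    # that requests three ports receives 6000, 6001 and 6002, per the
    # incrementing scheme described above.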

    # path used for ipc when the frontend api server is running in
    # multi-processing mode to communicate with the backend engine process.
    'VLLM_RPC_BASE_PATH':
    lambda: os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()),

    # If true, will load models from ModelScope instead of Hugging Face Hub.
    # Note that the value is "true" or "false", not a number.
    "VLLM_USE_MODELSCOPE":
    lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",

    # Interval in seconds to log a warning message when the ring buffer is full
    "VLLM_RINGBUFFER_WARNING_INTERVAL":
    lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")),

    # path to cudatoolkit home directory, under which should be bin, include,
    # and lib directories.
    "CUDA_HOME":
    lambda: os.environ.get("CUDA_HOME", None),

    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
    "VLLM_NCCL_SO_PATH":
    lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),

    # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
    # library file in the locations specified by `LD_LIBRARY_PATH`
    "LD_LIBRARY_PATH":
    lambda: os.environ.get("LD_LIBRARY_PATH", None),

    # flag to control if vllm should use triton flash attention
    "VLLM_USE_TRITON_FLASH_ATTN":
    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
             ("true", "1")),

    # Use separate prefill and decode kernels for V1 attention instead of
    # the unified triton kernel.
    "VLLM_V1_USE_PREFILL_DECODE_ATTENTION":
    lambda:
    (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in
     ("true", "1")),

    # Use AITER triton unified attention for V1 attention
    "VLLM_USE_AITER_UNIFIED_ATTENTION":
    lambda:
    (os.getenv("VLLM_USE_AITER_UNIFIED_ATTENTION", "False").lower() in
     ("true", "1")),

    # Force vllm to use a specific flash-attention version (2 or 3), only
    # valid when using the flash-attention backend.
    "VLLM_FLASH_ATTN_VERSION":
    lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)),

    # Internal flag to enable Dynamo fullgraph capture
    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
    lambda: bool(
        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),

    # Feature flag to enable/disable Inductor standalone compile.
    # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
    # enabled by default.
    "VLLM_USE_STANDALONE_COMPILE":
    lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "1") == "1",

    # local rank of the process in the distributed setting, used to determine
    # the GPU device id
    "LOCAL_RANK":
    lambda: int(os.environ.get("LOCAL_RANK", "0")),

    # used to control the visible devices in the distributed setting
    "CUDA_VISIBLE_DEVICES":
    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),

    # timeout for each iteration in the engine
    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),

    # API key for vLLM API server
    "VLLM_API_KEY":
    lambda: os.environ.get("VLLM_API_KEY", None),

    # Whether to log responses from API Server for debugging
    "VLLM_DEBUG_LOG_API_SERVER_RESPONSE":
    lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"
                           ).lower() == "true",

    # S3 access information, used for tensorizer to load model from S3
    "S3_ACCESS_KEY_ID":
    lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
    "S3_SECRET_ACCESS_KEY":
    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
    "S3_ENDPOINT_URL":
    lambda: os.environ.get("S3_ENDPOINT_URL", None),

    # Usage stats collection
    "VLLM_USAGE_STATS_SERVER":
    lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
    "VLLM_NO_USAGE_STATS":
    lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
    "VLLM_DO_NOT_TRACK":
    lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
        "DO_NOT_TRACK", None) or "0") == "1",
    "VLLM_USAGE_SOURCE":
    lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),

    # Logging configuration
    # If set to 0, vllm will not configure logging
    # If set to 1, vllm will configure logging using the default configuration
    # or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
    "VLLM_CONFIGURE_LOGGING":
    lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
    "VLLM_LOGGING_CONFIG_PATH":
    lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),

    # this is used for configuring the default logging level
    "VLLM_LOGGING_LEVEL":
    lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO").upper(),

    # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages
    "VLLM_LOGGING_PREFIX":
    lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),

    # if set, vllm will call logits processors in a thread pool with this many
    # threads. This is useful when using custom logits processors that either
    # (a) launch additional CUDA kernels or (b) do significant CPU-bound work
    # while not holding the python GIL, or both.
    "VLLM_LOGITS_PROCESSOR_THREADS":
    lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0"))
    if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None,

    # Trace function calls
    # If set to 1, vllm will trace function calls
    # Useful for debugging
    "VLLM_TRACE_FUNCTION":
    lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),

    # Backend for attention computation
    # Available options:
    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
    # - "FLASH_ATTN": use FlashAttention
    # - "XFORMERS": use XFormers
    # - "ROCM_FLASH": use ROCmFlashAttention
    # - "FLASHINFER": use flashinfer
    # - "FLASHMLA": use FlashMLA
    "VLLM_ATTENTION_BACKEND":
    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),

    # If set, vllm will use flashinfer sampler
    "VLLM_USE_FLASHINFER_SAMPLER":
    lambda: bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]))
    if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ else None,

    # If set, vllm will force flashinfer to use tensor cores;
    # otherwise will use heuristic based on model architecture.
    "VLLM_FLASHINFER_FORCE_TENSOR_CORES":
    lambda: bool(int(os.getenv("VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"))),

    # Pipeline stage partition strategy
    "VLLM_PP_LAYER_PARTITION":
    lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),

    # (CPU backend only) CPU key-value cache space.
    # Default is None and will be set to 4 GB.
    "VLLM_CPU_KVCACHE_SPACE":
    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
    if "VLLM_CPU_KVCACHE_SPACE" in os.environ else None,

    # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
    # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
    "VLLM_CPU_OMP_THREADS_BIND":
    lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"),
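
    # Illustrative binding (hypothetical core ids): pin two ranks to disjoint
    # core ranges:
    #   VLLM_CPU_OMP_THREADS_BIND="0-15|16-31"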

    # (CPU backend only) CPU cores not used by OMP threads.
    # Those CPU cores will not be used by OMP threads of a rank.
    "VLLM_CPU_NUM_OF_RESERVED_CPU":
    lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0"))
    if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ else None,

    # (CPU backend only) whether to use prepack for MoE layer. This will be
    # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might
    # need to set this to "0" (False).
    "VLLM_CPU_MOE_PREPACK":
    lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))),

    # (CPU backend only) whether to use SGL kernels, optimized for small batch.
    "VLLM_CPU_SGL_KERNEL":
    lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))),

    # If the env var is set, then all workers will execute as separate
    # processes from the engine, and we use the same mechanism to trigger
    # execution on all workers.
    # Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it.
    "VLLM_USE_RAY_SPMD_WORKER":
    lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))),

    # If the env var is set, it uses Ray's Compiled Graph
    # (previously known as ADAG) API which optimizes the
    # control plane overhead.
    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
    # Note that this variable is set to 1 in V1 by default
    # when ray distributed executor is used.
    "VLLM_USE_RAY_COMPILED_DAG":
    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))),

    # If the env var is set, Ray Compiled Graph uses the specified
    # channel type to communicate between workers belonging to
    # different pipeline-parallel stages.
    # Available options:
    # - "auto": use the default channel type
    # - "nccl": use NCCL for communication
    # - "shm": use shared memory and gRPC for communication
    # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
    "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE":
    lambda: os.getenv("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto"),

    # If the env var is set, it enables GPU communication overlap
    # (experimental feature) in Ray's Compiled Graph. This flag is ignored if
    # VLLM_USE_RAY_COMPILED_DAG is not set.
    "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM":
    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
                 ),

    # If the env var is set, it uses a Ray Communicator wrapping
    # vLLM's pipeline parallelism communicator to interact with Ray's
    # Compiled Graph. Otherwise, it uses Ray's NCCL communicator.
    # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
    "VLLM_USE_RAY_WRAPPED_PP_COMM":
    lambda: bool(int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))),

    # Use dedicated multiprocess context for workers.
    # Both spawn and fork work
    "VLLM_WORKER_MULTIPROC_METHOD":
    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),

    # Path to the cache for storing downloaded assets
    "VLLM_ASSETS_CACHE":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_ASSETS_CACHE",
            os.path.join(get_default_cache_root(), "vllm", "assets"),
        )),

    # Timeout for fetching images when serving multimodal models
    # Default is 5 seconds
    "VLLM_IMAGE_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),

    # Timeout for fetching videos when serving multimodal models
    # Default is 30 seconds
    "VLLM_VIDEO_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")),

    # Timeout for fetching audio when serving multimodal models
    # Default is 10 seconds
    "VLLM_AUDIO_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),

    # Max number of workers for the thread pool handling
    # media bytes loading. Set to 1 to disable parallel processing.
    # Default is 8
    "VLLM_MEDIA_LOADING_THREAD_COUNT":
    lambda: int(os.getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")),

    # Maximum filesize in MB for a single audio file when processing
    # speech-to-text requests. Files larger than this will be rejected.
    # Default is 25 MB
    "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB":
    lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")),

    # Backend for Video IO
    # - "opencv": Default backend, using OpenCV's buffered stream reader.
    #
    # Custom backend implementations can be registered
    # via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and
    # imported at runtime.
    # If a non-existing backend is used, an AssertionError will be thrown.
    "VLLM_VIDEO_LOADER_BACKEND":
    lambda: os.getenv("VLLM_VIDEO_LOADER_BACKEND", "opencv"),

    # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache
    # Default is 4 GiB per API process + 4 GiB per engine core process
    "VLLM_MM_INPUT_CACHE_GIB":
    lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")),

    # Path to the XLA persistent cache directory.
    # Only used for XLA devices such as TPUs.
    "VLLM_XLA_CACHE_PATH":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_XLA_CACHE_PATH",
            os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
        )),

    # If set, assert on XLA recompilation after each execution step.
    "VLLM_XLA_CHECK_RECOMPILATION":
    lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))),

    # Enable SPMD mode for TPU backend.
    "VLLM_XLA_USE_SPMD":
    lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))),

    "VLLM_FUSED_MOE_CHUNK_SIZE":
    lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),

    # Control whether to use fused MoE activation chunking. Current chunking
    # logic is incompatible with torch.compile and causes IMA. See issue
    # https://github.com/vllm-project/vllm/issues/19631.
    "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING":
    lambda: bool(
        int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1"))),

    # If set, the OpenAI API server will stay alive even after the underlying
    # AsyncLLMEngine errors and stops serving requests
    "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH":
    lambda: bool(int(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", "0"))),

    # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
    # the user to specify a max sequence length greater than
    # the max length derived from the model's config.json.
    # To enable this, set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1.
    "VLLM_ALLOW_LONG_MAX_MODEL_LEN":
    lambda:
    (os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in
     ("1", "true")),

    # If set, forces FP8 Marlin to be used for FP8 quantization regardless
    # of the hardware support for FP8 compute.
    "VLLM_TEST_FORCE_FP8_MARLIN":
    lambda:
    (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
     ("1", "true")),

    "VLLM_TEST_FORCE_LOAD_FORMAT":
    lambda: os.getenv("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"),

    # Time in ms for the zmq client to wait for a response from the backend
    # server for simple data operations
    "VLLM_RPC_TIMEOUT":
    lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),

    # Timeout in seconds for keeping HTTP connections alive in API server
    "VLLM_HTTP_TIMEOUT_KEEP_ALIVE":
    lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")),

    # a list of plugin names to load, separated by commas.
    # if this is not set, it means all plugins will be loaded
    # if this is set to an empty string, no plugins will be loaded
    "VLLM_PLUGINS":
    lambda: None if "VLLM_PLUGINS" not in os.environ else os.environ[
        "VLLM_PLUGINS"].split(","),

    # a local directory to look in for unrecognized LoRA adapters.
    # only works if plugins are enabled and
    # VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled.
    "VLLM_LORA_RESOLVER_CACHE_DIR":
    lambda: os.getenv("VLLM_LORA_RESOLVER_CACHE_DIR", None),

    # Enables torch profiler if set. Path to the directory where torch profiler
    # traces are saved. Note that it must be an absolute path.
    "VLLM_TORCH_PROFILER_DIR":
    lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
             .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),

    # Enable torch profiler to record shapes if set
    # VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will
    # not record shapes.
    "VLLM_TORCH_PROFILER_RECORD_SHAPES":
    lambda: bool(os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0"),

    # Enable torch profiler to profile memory if set
    # VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler
    # will not profile memory.
    "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY":
    lambda: bool(
        os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0"),

    # Enable torch profiler to profile stack if set
    # VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler WILL
    # profile stack by default.
    "VLLM_TORCH_PROFILER_WITH_STACK":
    lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"),

    # Enable torch profiler to profile flops if set
    # VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will
    # not profile flops.
    "VLLM_TORCH_PROFILER_WITH_FLOPS":
    lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"),

    # If set, vLLM will use Triton implementations of AWQ.
    "VLLM_USE_TRITON_AWQ":
    lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),

    # If set, allow loading or unloading lora adapters at runtime.
    "VLLM_ALLOW_RUNTIME_LORA_UPDATING":
    lambda:
    (os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() in
     ("1", "true")),

    # We assume drivers can report p2p status correctly.
    # If the program hangs when using custom allreduce,
    # potentially caused by a bug in the driver (535 series),
    # it might be helpful to set VLLM_SKIP_P2P_CHECK=0
    # so that vLLM can verify whether p2p is actually working.
    # See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa
    "VLLM_SKIP_P2P_CHECK":
    lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "1") == "1",

    # List of quantization kernels that should be disabled, used for testing
    # and performance comparisons. Currently only affects MPLinearKernel
    # selection
    # (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel)
    "VLLM_DISABLED_KERNELS":
    lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[
        "VLLM_DISABLED_KERNELS"].split(","),

    # If set, use the V1 code path.
    "VLLM_USE_V1":
    lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))),

    # Disable aiter ops unless specifically enabled.
    # Acts as a parent switch for all the other AITER operations.
    "VLLM_ROCM_USE_AITER":
    lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in
             ("true", "1")),

    # Whether to use aiter paged attention.
    # By default is disabled.
    "VLLM_ROCM_USE_AITER_PAGED_ATTN":
    lambda: (os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in
             ("true", "1")),

    # use aiter linear op if aiter ops are enabled.
    # Related ops:
    # - scaled_mm (per-tensor / rowwise)
    "VLLM_ROCM_USE_AITER_LINEAR":
    lambda: (os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True").lower() in
             ("true", "1")),

    # Whether to use aiter moe ops.
    # By default is enabled.
    "VLLM_ROCM_USE_AITER_MOE":
    lambda: (os.getenv("VLLM_ROCM_USE_AITER_MOE", "True").lower() in
             ("true", "1")),

    # use aiter rms norm op if aiter ops are enabled.
    "VLLM_ROCM_USE_AITER_RMSNORM":
    lambda: (os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in
             ("true", "1")),

    # Whether to use aiter mla ops.
    # By default is enabled.
    "VLLM_ROCM_USE_AITER_MLA":
    lambda: (os.getenv("VLLM_ROCM_USE_AITER_MLA", "True").lower() in
             ("true", "1")),

    # Whether to use aiter mha ops.
    # By default is enabled.
    "VLLM_ROCM_USE_AITER_MHA":
    lambda: (os.getenv("VLLM_ROCM_USE_AITER_MHA", "True").lower() in
             ("true", "1")),

    # use rocm skinny gemms
    "VLLM_ROCM_USE_SKINNY_GEMM":
    lambda: (os.getenv("VLLM_ROCM_USE_SKINNY_GEMM", "True").lower() in
             ("true", "1")),

    # Pad the fp8 weights to 256 bytes for ROCm
    "VLLM_ROCM_FP8_PADDING":
    lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))),

    # Pad the weights for the moe kernel
    "VLLM_ROCM_MOE_PADDING":
    lambda: bool(int(os.getenv("VLLM_ROCM_MOE_PADDING", "1"))),

    # custom paged attention kernel for MI3* cards
    "VLLM_ROCM_CUSTOM_PAGED_ATTN":
    lambda: (os.getenv("VLLM_ROCM_CUSTOM_PAGED_ATTN", "True").lower() in
             ("true", "1")),

    # Custom quick allreduce kernel for MI3* cards
    # Choice of quantization level: FP, INT8, INT6, INT4 or NONE
    # Recommended for large models to speed up allreduce
    "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION":
    lambda: os.getenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE").upper(),

    # Custom quick allreduce kernel for MI3* cards
    # Due to the lack of the bfloat16 asm instruction, bfloat16
    # kernels are slower than fp16.
    # If this environment variable is set to 1, the input is converted to fp16
    "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16":
    lambda:
    (os.getenv("VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", "True").lower() in
     ("true", "1")),

    # Custom quick allreduce kernel for MI3* cards.
    # Controls the maximum allowed data size (in MB) for custom quick
    # allreduce communication.
    # Default: 2048 MB.
    # Data exceeding this size will use either custom allreduce or RCCL
    # communication.
    "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB":
    lambda: maybe_convert_int(
        os.environ.get("VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", None)),

    # Divisor for dynamic query scale factor calculation for FP8 KV Cache
    "Q_SCALE_CONSTANT":
    lambda: int(os.getenv("Q_SCALE_CONSTANT", "200")),
    # Divisor for dynamic key scale factor calculation for FP8 KV Cache
    "K_SCALE_CONSTANT":
    lambda: int(os.getenv("K_SCALE_CONSTANT", "200")),
    # Divisor for dynamic value scale factor calculation for FP8 KV Cache
    "V_SCALE_CONSTANT":
    lambda: int(os.getenv("V_SCALE_CONSTANT", "100")),

    # If set, enable multiprocessing in LLM for the V1 code path.
    "VLLM_ENABLE_V1_MULTIPROCESSING":
    lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))),
    "VLLM_LOG_BATCHSIZE_INTERVAL":
    lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
    "VLLM_DISABLE_COMPILE_CACHE":
    lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))),

    # If set, vllm will run in development mode, which will enable
    # some additional endpoints for developing and debugging,
    # e.g. `/reset_prefix_cache`
    "VLLM_SERVER_DEV_MODE":
    lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))),

    # Controls the maximum number of requests to handle in a
    # single asyncio task when processing per-token outputs in the
    # V1 AsyncLLM interface. It is applicable when handling a high
    # concurrency of streaming requests.
    # Setting this too high can result in a higher variance of
    # inter-message latencies. Setting it too low can negatively impact
    # TTFT and overall throughput.
    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE":
    lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")),

    # If set, vLLM will disable the MLA attention optimizations.
    "VLLM_MLA_DISABLE":
    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),

    # Number of GPUs per worker in Ray, if it is set to be a fraction,
    # it allows ray to schedule multiple actors on a single GPU,
    # so that users can colocate other actors on the same GPUs as vLLM.
    "VLLM_RAY_PER_WORKER_GPUS":
    lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")),

    # Bundle indices for Ray, if it is set, it can control precisely
    # which indices are used for the Ray bundle, for every worker.
    # Format: comma-separated list of integers, e.g. "0,1,2,3"
    "VLLM_RAY_BUNDLE_INDICES":
    lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""),

    # On some systems, find_loaded_library() may not work. So we allow users
    # to specify the path through the environment variable
    # VLLM_CUDART_SO_PATH.
    "VLLM_CUDART_SO_PATH":
    lambda: os.getenv("VLLM_CUDART_SO_PATH", None),

    # Rank of the process in the data parallel setting
    "VLLM_DP_RANK":
    lambda: int(os.getenv("VLLM_DP_RANK", "0")),

    # Local rank of the process in the data parallel setting.
    # Defaults to VLLM_DP_RANK when not set.
    "VLLM_DP_RANK_LOCAL":
    lambda: int(
        os.getenv("VLLM_DP_RANK_LOCAL", sys.modules[__name__].VLLM_DP_RANK)),

    # World size of the data parallel setting
    "VLLM_DP_SIZE":
    lambda: int(os.getenv("VLLM_DP_SIZE", "1")),

    # IP address of the master node in the data parallel setting
    "VLLM_DP_MASTER_IP":
    lambda: os.getenv("VLLM_DP_MASTER_IP", "127.0.0.1"),

    # Port of the master node in the data parallel setting
    "VLLM_DP_MASTER_PORT":
    lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
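
    # Illustrative wiring (hypothetical values) for rank 1 of a 2-way
    # data-parallel setup:
    #   VLLM_DP_SIZE=2 VLLM_DP_RANK=1 VLLM_DP_RANK_LOCAL=1 \
    #   VLLM_DP_MASTER_IP=10.0.0.1 VLLM_DP_MASTER_PORT=13345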

    # In the context of executing MoE models with Data-Parallel,
    # Expert-Parallel and Batched All-to-All dispatch/combine kernels,
    # VLLM_MOE_DP_CHUNK_SIZE dictates the quantum of tokens that can be
    # dispatched from a DP rank. All DP ranks process the activations in
    # VLLM_MOE_DP_CHUNK_SIZE units.
    "VLLM_MOE_DP_CHUNK_SIZE":
    lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")),

    # Randomize inputs during dummy runs when using Data Parallel
    "VLLM_RANDOMIZE_DP_DUMMY_INPUTS":
    lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1",

    # Whether to use S3 path for model loading in CI via RunAI Streamer
    "VLLM_CI_USE_S3":
    lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",

    # Use model_redirect to redirect the model name to a local folder.
    # `model_redirect` can be a json file mapping the model between
    # repo_id and local folder:
    # {"meta-llama/Llama-3.2-1B": "/tmp/Llama-3.2-1B"}
    # or a space separated values table file:
    # meta-llama/Llama-3.2-1B /tmp/Llama-3.2-1B
    "VLLM_MODEL_REDIRECT_PATH":
    lambda: os.environ.get("VLLM_MODEL_REDIRECT_PATH", None),

    # Whether to use atomicAdd reduce in gptq/awq marlin kernel.
    "VLLM_MARLIN_USE_ATOMIC_ADD":
    lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1",

    # Whether to turn on the outlines cache for V0
    # This cache is unbounded and on disk, so it's not safe to use in
    # an environment with potentially malicious users.
    "VLLM_V0_USE_OUTLINES_CACHE":
    lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1",

    # Whether to turn on the outlines cache for V1
    # This cache is unbounded and on disk, so it's not safe to use in
    # an environment with potentially malicious users.
    "VLLM_V1_USE_OUTLINES_CACHE":
    lambda: os.environ.get("VLLM_V1_USE_OUTLINES_CACHE", "0") == "1",

    # Gap between padding buckets for the forward pass. For example, with a
    # gap of 8, forward passes run with padded lengths [16, 24, 32, ...].
    "VLLM_TPU_BUCKET_PADDING_GAP":
    lambda: int(os.environ["VLLM_TPU_BUCKET_PADDING_GAP"])
    if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ else 0,
    "VLLM_TPU_MOST_MODEL_LEN":
    lambda: maybe_convert_int(os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None)),

    # Whether using Pathways
    "VLLM_TPU_USING_PATHWAYS":
    lambda: bool("proxy" in os.getenv("JAX_PLATFORMS", "").lower()),

    # Allow use of DeepGemm kernels for fused moe ops.
    "VLLM_USE_DEEP_GEMM":
    lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),

    # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs.
    # E8M0 is faster on B200 but may reduce accuracy.
    "VLLM_USE_DEEP_GEMM_E8M0":
    lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))),

    # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
    # JIT all the required kernels before model execution so there is no
    # JIT'ing in the hot-path. However, this warmup increases the engine
    # startup time by a couple of minutes.
    # Set `VLLM_SKIP_DEEP_GEMM_WARMUP` to disable the warmup.
    "VLLM_SKIP_DEEP_GEMM_WARMUP":
    lambda: bool(int(os.getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))),

    # Allow use of FlashInfer MoE kernels for fused moe ops.
    "VLLM_USE_FLASHINFER_MOE_FP8":
    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))),

    # Allow use of FlashInfer CUTLASS kernels for fused moe ops.
    "VLLM_USE_FLASHINFER_MOE_FP4":
    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))),

    # If set to 1, use the FlashInfer
    # MXFP8 (activation) x MXFP4 (weight) MoE backend.
    "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8":
    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))),

    # If set to 1, use the FlashInfer
    # BF16 (activation) x MXFP4 (weight) MoE backend.
    "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16":
    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))),

    # Control the cache size used by the xgrammar compiler. The default
    # of 512 MB should be enough for roughly 1000 JSON schemas.
    # It can be changed with this variable if needed for some reason.
    "VLLM_XGRAMMAR_CACHE_MB":
    lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")),

    # Control the threshold for msgspec to use 'zero copy' for
    # serialization/deserialization of tensors. Tensors below
    # this limit will be encoded into the msgpack buffer, and
    # tensors above will instead be sent via a separate message.
    # While the sending side still actually copies the tensor
    # in all cases, on the receiving side, tensors above this
    # limit will actually be zero-copy decoded.
    "VLLM_MSGPACK_ZERO_COPY_THRESHOLD":
    lambda: int(os.getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")),

    # If set, allow insecure serialization using pickle.
    # This is useful for environments where it is deemed safe to use the
    # insecure method and it is needed for some reason.
    "VLLM_ALLOW_INSECURE_SERIALIZATION":
    lambda: bool(int(os.getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0"))),

    # IP address used for NIXL handshake between remote agents.
    "VLLM_NIXL_SIDE_CHANNEL_HOST":
    lambda: os.getenv("VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"),

    # Port used for NIXL handshake between remote agents.
    "VLLM_NIXL_SIDE_CHANNEL_PORT":
    lambda: int(os.getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5557")),

    # all2all backend for vllm's expert parallel communication
    # Available options:
    # - "naive": naive all2all implementation using all-reduce
    # - "pplx": use pplx kernels
    # - "deepep_high_throughput": use deepep high-throughput kernels
    # - "deepep_low_latency": use deepep low-latency kernels
    "VLLM_ALL2ALL_BACKEND":
    lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),

    # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. Both
    # options require compute capability 10.0 or above.
    # Available options:
    # - "throughput": [default]
    #   Uses CUTLASS kernels optimized for high-throughput batch inference.
    # - "latency":
    #   Uses TensorRT-LLM kernels optimized for low-latency inference.
    # To set this backend, define the environment variable:
    #   export VLLM_FLASHINFER_MOE_BACKEND=latency.
    # If not set, defaults to "throughput".
    "VLLM_FLASHINFER_MOE_BACKEND": lambda: os.getenv(
        "VLLM_FLASHINFER_MOE_BACKEND", "throughput"
    ),

    # Control the maximum number of tokens per expert supported by the
    # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
    # the blockscale tensor of activations NVFP4 Quantization.
    # This is used to prevent the kernel from running out of memory.
    "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE":
    lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")),

    # MoE routing strategy selector.
    # See `RoutingSimulator.get_available_strategies()` for available
    # strategies.
    # Custom routing strategies can be registered by
    # RoutingSimulator.register_strategy().
    # Note: custom strategies may not produce correct model outputs
    "VLLM_MOE_ROUTING_SIMULATION_STRATEGY":
    lambda: os.environ.get("VLLM_MOE_ROUTING_SIMULATION_STRATEGY", "").lower(),

    # Regex timeout for use by the vLLM tool parsing plugins.
    "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS":
    lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")),

    # Reduce CPU usage when vLLM is idle. Enabling this will incur a small
    # latency penalty when a request eventually comes.
    "VLLM_SLEEP_WHEN_IDLE":
    lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))),

    # Control the max chunk bytes (in MB) for the rpc message queue.
    # Objects larger than this threshold will be broadcast to worker
    # processes via zmq.
    "VLLM_MQ_MAX_CHUNK_BYTES_MB":
    lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")),

    # Timeout in seconds for execute_model RPC calls in multiprocessing
    # executor (only applies when TP > 1).
    "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS":
    lambda: int(os.getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300")),

    # KV Cache layout used throughout vllm.
    # Some common values are:
    # - NHD
    # - HND
    # Where N=num_blocks, H=num_heads and D=head_size. The default value will
    # leave the layout choice to the backend. Mind that backends may only
    # implement and support a subset of all possible layouts.
    "VLLM_KV_CACHE_LAYOUT":
    lambda: os.getenv("VLLM_KV_CACHE_LAYOUT", None),

    # Enable checking whether the generated logits contain NaNs,
    # indicating corrupted output. Useful for debugging low level bugs
    # or bad hardware but it may add compute overhead.
    "VLLM_COMPUTE_NANS_IN_LOGITS":
    lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))),

    # Controls whether or not emulations are used for NVFP4
    # generations on devices with compute capability < 10.0
    # for compressed-tensors models
    "VLLM_USE_NVFP4_CT_EMULATIONS":
    lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))),

    # Time (in seconds) after which the KV cache on the producer side is
    # automatically cleared if no READ notification is received from the
    # consumer. This is only applicable when using NixlConnector in a
    # disaggregated decode-prefill setup.
    "VLLM_NIXL_ABORT_REQUEST_TIMEOUT":
    lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "120")),

    # Controls whether or not to use cudnn prefill
    "VLLM_USE_CUDNN_PREFILL":
    lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))),

    # If set to 1, use the TRTLLM attention backend in flashinfer.
    "VLLM_USE_TRTLLM_ATTENTION":
    lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None),

    # Controls garbage collection during CUDA graph capture.
    # If set to 0 (default), enables GC freezing to speed up capture time.
    # If set to 1, allows GC to run during capture.
    "VLLM_ENABLE_CUDAGRAPH_GC":
    lambda: bool(int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))),

    # Used to force set up loopback IP
    "VLLM_LOOPBACK_IP":
    lambda: os.getenv("VLLM_LOOPBACK_IP", ""),

    # Used to set the process name prefix for vLLM processes.
    # This is useful for debugging and monitoring purposes.
    # The default value is "VLLM".
    "VLLM_PROCESS_NAME_PREFIX":
    lambda: os.getenv("VLLM_PROCESS_NAME_PREFIX", "VLLM"),

    # Allow chunked local attention with hybrid kv cache manager.
    # Currently using the Hybrid KV cache manager with chunked local attention
    # in the Llama4 models (the only models currently using chunked local attn)
    # causes a latency regression. For this reason, we disable it by default.
    # This flag is used to allow users to enable it if they want to (to save on
    # kv-cache memory usage and enable longer contexts)
    # TODO(lucas): Remove this flag once latency regression is resolved.
    "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
    lambda: bool(int(os.getenv(
        "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),

    # Enables support for the "store" option in the OpenAI Responses API.
    # When set to 1, vLLM's OpenAI server will retain the input and output
    # messages for those requests in memory. By default, this is disabled (0),
    # and the "store" option is ignored.
    # NOTE/WARNING:
    # 1. Messages are kept in memory only (not persisted to disk) and will be
    #    lost when the vLLM server shuts down.
    # 2. Enabling this option will cause a memory leak, as stored messages are
    #    never removed from memory until the server terminates.
    "VLLM_ENABLE_RESPONSES_API_STORE":
    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),

    # Allows vllm to find tuned configs under a customized folder
    "VLLM_TUNED_CONFIG_FOLDER":
    lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None),
}

# --8<-- [end:env-vars-definition]


def __getattr__(name: str):
    # lazy evaluation of environment variables
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __dir__():
    return list(environment_variables.keys())
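

# A sketch of the lazy-evaluation contract above: attribute access re-invokes
# the corresponding lambda, so changes to os.environ are picked up on the next
# read.
#   import vllm.envs as envs
#   envs.VLLM_LOGGING_LEVEL      # evaluated now
#   os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
#   envs.VLLM_LOGGING_LEVEL      # re-evaluated, now "DEBUG"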


def is_set(name: str):
    """Check if an environment variable is explicitly set."""
    if name in environment_variables:
        return name in os.environ
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
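
# is_set() distinguishes "explicitly exported" from "defaulted": with
# VLLM_USE_V1 unset, envs.VLLM_USE_V1 still evaluates to True (its default),
# while is_set("VLLM_USE_V1") returns False.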


def set_vllm_use_v1(use_v1: bool):
    if is_set("VLLM_USE_V1"):
        raise ValueError(
            "Should not call set_vllm_use_v1() if VLLM_USE_V1 is set "
            "explicitly by the user. Please raise this as a Github "
            "Issue and explicitly set VLLM_USE_V1=0 or 1.")
    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"


def compute_hash() -> str:
    """
    WARNING: Whenever a new key is added to these environment
    variables, ensure that it is included in the factors list if
    it affects the computation graph. For example, different values
    of VLLM_PP_LAYER_PARTITION will generate different computation
    graphs, so it is included in the factors list. The env vars that
    affect the choice of different kernels or attention backends should
    also be included in the factors list.
    """
    factors: list[Any] = []

    # summarize environment variables
    def factorize(name: str):
        if __getattr__(name):
            factors.append(__getattr__(name))
        else:
            factors.append("None")

    # The values of envs may affect the computation graph.
    # TODO(DefTruth): hash all environment variables?
    # for key in environment_variables:
    #     factorize(key)
    environment_variables_to_hash = [
        "VLLM_PP_LAYER_PARTITION",
        "VLLM_MLA_DISABLE",
        "VLLM_USE_TRITON_FLASH_ATTN",
        "VLLM_USE_TRITON_AWQ",
        "VLLM_DP_RANK",
        "VLLM_DP_SIZE",
        "VLLM_USE_STANDALONE_COMPILE",
        "VLLM_FUSED_MOE_CHUNK_SIZE",
    ]
    for key in environment_variables_to_hash:
        if key in environment_variables:
            factorize(key)

    hash_str = hashlib.md5(str(factors).encode(),
                           usedforsecurity=False).hexdigest()

    return hash_str
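

# A sketch of the cache-key behavior above: two processes that differ only in
# a factorized variable (e.g. VLLM_PP_LAYER_PARTITION) produce different
# compute_hash() values, so they do not share compiled computation graphs.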