2024-05-02 11:13:25 -07:00
import os
2024-08-07 12:24:56 -04:00
import tempfile
2024-08-13 16:24:17 -07:00
from typing import TYPE_CHECKING , Any , Callable , Dict , List , Optional
2024-05-02 11:13:25 -07:00
if TYPE_CHECKING:
    # Static stub declarations for every environment variable exposed through
    # this module's lazy ``__getattr__`` (see bottom of file), so that type
    # checkers and IDEs can resolve ``envs.NAME`` attribute access. These
    # assignments are never executed at runtime; their values mirror the
    # runtime defaults used by the getters in ``environment_variables``.
    VLLM_HOST_IP: str = ""
    VLLM_PORT: Optional[int] = None
    VLLM_RPC_BASE_PATH: str = tempfile.gettempdir()
    VLLM_USE_MODELSCOPE: bool = False
    VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
    CUDA_HOME: Optional[str] = None
    VLLM_NCCL_SO_PATH: Optional[str] = None
    LD_LIBRARY_PATH: Optional[str] = None
    # Runtime default is "True" (see environment_variables below).
    VLLM_USE_TRITON_FLASH_ATTN: bool = True
    VLLM_FLASH_ATTN_VERSION: Optional[int] = None
    VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE: bool = True
    LOCAL_RANK: int = 0
    CUDA_VISIBLE_DEVICES: Optional[str] = None
    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
    VLLM_API_KEY: Optional[str] = None
    S3_ACCESS_KEY_ID: Optional[str] = None
    S3_SECRET_ACCESS_KEY: Optional[str] = None
    S3_ENDPOINT_URL: Optional[str] = None
    VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm")
    VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
    VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
    VLLM_NO_USAGE_STATS: bool = False
    VLLM_DO_NOT_TRACK: bool = False
    # Runtime default is "production" (see environment_variables below).
    VLLM_USAGE_SOURCE: str = "production"
    VLLM_CONFIGURE_LOGGING: int = 1
    VLLM_LOGGING_LEVEL: str = "INFO"
    VLLM_LOGGING_PREFIX: str = ""
    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
    VLLM_TRACE_FUNCTION: int = 0
    VLLM_ATTENTION_BACKEND: Optional[str] = None
    VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
    VLLM_USE_FLASHINFER_REJECTION_SAMPLER: bool = False
    VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False
    VLLM_PP_LAYER_PARTITION: Optional[str] = None
    VLLM_CPU_KVCACHE_SPACE: int = 0
    # Runtime default is "all" (see environment_variables below).
    VLLM_CPU_OMP_THREADS_BIND: str = "all"
    VLLM_OPENVINO_DEVICE: str = "CPU"
    VLLM_OPENVINO_KVCACHE_SPACE: int = 0
    VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
    VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
    VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
    # Runtime default is "32768" (see environment_variables below).
    VLLM_FUSED_MOE_CHUNK_SIZE: int = 32 * 1024
    VLLM_USE_RAY_SPMD_WORKER: bool = False
    VLLM_USE_RAY_COMPILED_DAG: bool = False
    VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True
    VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
    VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
    VLLM_IMAGE_FETCH_TIMEOUT: int = 5
    # Runtime default is "15" (see environment_variables below).
    VLLM_VIDEO_FETCH_TIMEOUT: int = 15
    VLLM_AUDIO_FETCH_TIMEOUT: int = 10
    VLLM_TARGET_DEVICE: str = "cuda"
    MAX_JOBS: Optional[str] = None
    NVCC_THREADS: Optional[str] = None
    VLLM_USE_PRECOMPILED: bool = False
    VLLM_NO_DEPRECATION_WARNING: bool = False
    VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
    CMAKE_BUILD_TYPE: Optional[str] = None
    VERBOSE: bool = False
    VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
    VLLM_TEST_FORCE_FP8_MARLIN: bool = False
    VLLM_TEST_FORCE_LOAD_FORMAT: str = "dummy"
    VLLM_RPC_TIMEOUT: int = 10000  # ms
    VLLM_PLUGINS: Optional[List[str]] = None
    VLLM_TORCH_PROFILER_DIR: Optional[str] = None
    VLLM_USE_TRITON_AWQ: bool = False
    VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
    VLLM_SKIP_P2P_CHECK: bool = False
    VLLM_DISABLED_KERNELS: List[str] = []
    VLLM_USE_V1: bool = False
    VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
    VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
    VLLM_DISABLE_COMPILE_CACHE: bool = False
    VLLM_SERVER_DEV_MODE: bool = False
    VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
2024-05-02 11:13:25 -07:00
2024-07-16 14:12:25 +08:00
def get_default_cache_root():
    """Return the base cache directory.

    Honors ``XDG_CACHE_HOME`` when set; otherwise falls back to the
    conventional ``~/.cache`` location.
    """
    fallback = os.path.join(os.path.expanduser("~"), ".cache")
    return os.getenv("XDG_CACHE_HOME", fallback)
def get_default_config_root():
    """Return the base config directory.

    Honors ``XDG_CONFIG_HOME`` when set; otherwise falls back to the
    conventional ``~/.config`` location.
    """
    fallback = os.path.join(os.path.expanduser("~"), ".config")
    return os.getenv("XDG_CONFIG_HOME", fallback)
2025-01-23 09:45:48 -05:00
def maybe_convert_int(value: Optional[str]) -> Optional[int]:
    """Parse *value* as an int, passing ``None`` through unchanged."""
    return None if value is None else int(value)
2024-05-02 22:13:49 -07:00
# The begin-* and end-* markers here are used by the documentation
# generator to extract the used env vars.
# begin-env-vars-definition
2024-05-02 11:13:25 -07:00
# Mapping from environment-variable name to a zero-argument getter that reads
# and parses it on every access (lazy evaluation via module __getattr__).
environment_variables: Dict[str, Callable[[], Any]] = {

    # ================== Installation Time Env Vars ==================

    # Target device of vLLM, supporting [cuda (by default),
    # rocm, neuron, cpu, openvino]
    "VLLM_TARGET_DEVICE":
    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),

    # Maximum number of compilation jobs to run in parallel.
    # By default this is the number of CPUs
    "MAX_JOBS":
    lambda: os.getenv("MAX_JOBS", None),

    # Number of threads to use for nvcc
    # By default this is 1.
    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
    "NVCC_THREADS":
    lambda: os.getenv("NVCC_THREADS", None),

    # If set, vllm will use precompiled binaries (*.so)
    "VLLM_USE_PRECOMPILED":
    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
        os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),

    # CMake build type
    # If not set, defaults to "Debug" or "RelWithDebInfo"
    # Available options: "Debug", "Release", "RelWithDebInfo"
    "CMAKE_BUILD_TYPE":
    lambda: os.getenv("CMAKE_BUILD_TYPE"),

    # If set, vllm will print verbose logs during installation
    "VERBOSE":
    lambda: bool(int(os.getenv('VERBOSE', '0'))),

    # Root directory for VLLM configuration files
    # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
    # Note that this not only affects how vllm finds its configuration files
    # during runtime, but also affects how vllm installs its configuration
    # files during **installation**.
    "VLLM_CONFIG_ROOT":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_CONFIG_ROOT",
            os.path.join(get_default_config_root(), "vllm"),
        )),

    # ================== Runtime Env Vars ==================

    # Root directory for VLLM cache files
    # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
    "VLLM_CACHE_ROOT":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_CACHE_ROOT",
            os.path.join(get_default_cache_root(), "vllm"),
        )),

    # used in distributed environment to determine the ip address
    # of the current node, when the node has multiple network interfaces.
    # If you are using multi-node inference, you should set this differently
    # on each node.
    'VLLM_HOST_IP':
    lambda: os.getenv('VLLM_HOST_IP', ""),

    # used in distributed environment to manually set the communication port
    # Note: if VLLM_PORT is set, and some code asks for multiple ports, the
    # VLLM_PORT will be used as the first port, and the rest will be generated
    # by incrementing the VLLM_PORT value.
    # '0' is used to make mypy happy
    'VLLM_PORT':
    lambda: int(os.getenv('VLLM_PORT', '0'))
    if 'VLLM_PORT' in os.environ else None,

    # path used for ipc when the frontend api server is running in
    # multi-processing mode to communicate with the backend engine process.
    'VLLM_RPC_BASE_PATH':
    lambda: os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()),

    # If true, will load models from ModelScope instead of Hugging Face Hub.
    # note that the value is true or false, not numbers
    "VLLM_USE_MODELSCOPE":
    lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",

    # Interval in seconds to log a warning message when the ring buffer is full
    "VLLM_RINGBUFFER_WARNING_INTERVAL":
    lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")),

    # path to cudatoolkit home directory, under which should be bin, include,
    # and lib directories.
    "CUDA_HOME":
    lambda: os.environ.get("CUDA_HOME", None),

    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
    "VLLM_NCCL_SO_PATH":
    lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),

    # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
    # library file in the locations specified by `LD_LIBRARY_PATH`
    "LD_LIBRARY_PATH":
    lambda: os.environ.get("LD_LIBRARY_PATH", None),

    # flag to control if vllm should use triton flash attention
    "VLLM_USE_TRITON_FLASH_ATTN":
    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
             ("true", "1")),

    # Force vllm to use a specific flash-attention version (2 or 3), only valid
    # when using the flash-attention backend.
    "VLLM_FLASH_ATTN_VERSION":
    lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)),

    # Internal flag to enable Dynamo fullgraph capture
    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
    lambda: bool(
        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),

    # local rank of the process in the distributed setting, used to determine
    # the GPU device id
    "LOCAL_RANK":
    lambda: int(os.environ.get("LOCAL_RANK", "0")),

    # used to control the visible devices in the distributed setting
    "CUDA_VISIBLE_DEVICES":
    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),

    # timeout for each iteration in the engine
    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),

    # API key for VLLM API server
    "VLLM_API_KEY":
    lambda: os.environ.get("VLLM_API_KEY", None),

    # S3 access information, used for tensorizer to load model from S3
    "S3_ACCESS_KEY_ID":
    lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
    "S3_SECRET_ACCESS_KEY":
    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
    "S3_ENDPOINT_URL":
    lambda: os.environ.get("S3_ENDPOINT_URL", None),

    # Usage stats collection
    "VLLM_USAGE_STATS_SERVER":
    lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
    "VLLM_NO_USAGE_STATS":
    lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
    "VLLM_DO_NOT_TRACK":
    lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
        "DO_NOT_TRACK", None) or "0") == "1",
    "VLLM_USAGE_SOURCE":
    lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),

    # Logging configuration
    # If set to 0, vllm will not configure logging
    # If set to 1, vllm will configure logging using the default configuration
    # or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
    "VLLM_CONFIGURE_LOGGING":
    lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
    "VLLM_LOGGING_CONFIG_PATH":
    lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),

    # this is used for configuring the default logging level
    "VLLM_LOGGING_LEVEL":
    lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO"),

    # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages
    "VLLM_LOGGING_PREFIX":
    lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),

    # Trace function calls
    # If set to 1, vllm will trace function calls
    # Useful for debugging
    "VLLM_TRACE_FUNCTION":
    lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),

    # Backend for attention computation
    # Available options:
    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
    # - "FLASH_ATTN": use FlashAttention
    # - "XFORMERS": use XFormers
    # - "ROCM_FLASH": use ROCmFlashAttention
    # - "FLASHINFER": use flashinfer
    "VLLM_ATTENTION_BACKEND":
    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),

    # If set, vllm will use flashinfer sampler
    "VLLM_USE_FLASHINFER_SAMPLER":
    lambda: bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]))
    if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ else None,

    # If set, vllm will force flashinfer to use tensor cores;
    # otherwise will use heuristic based on model architecture.
    "VLLM_FLASHINFER_FORCE_TENSOR_CORES":
    lambda: bool(int(os.getenv("VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"))),

    # Pipeline stage partition strategy
    "VLLM_PP_LAYER_PARTITION":
    lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),

    # (CPU backend only) CPU key-value cache space.
    # default is 4GB
    "VLLM_CPU_KVCACHE_SPACE":
    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),

    # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
    # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
    "VLLM_CPU_OMP_THREADS_BIND":
    lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"),

    # OpenVINO device selection
    # default is CPU
    "VLLM_OPENVINO_DEVICE":
    lambda: os.getenv("VLLM_OPENVINO_DEVICE", "CPU").upper(),

    # OpenVINO key-value cache space
    # default is 4GB
    "VLLM_OPENVINO_KVCACHE_SPACE":
    lambda: int(os.getenv("VLLM_OPENVINO_KVCACHE_SPACE", "0")),

    # OpenVINO KV cache precision
    # default is bf16 if natively supported by platform, otherwise f16
    # To enable KV cache compression, please, explicitly specify u8
    "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION":
    lambda: os.getenv("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION", None),

    # Enables weights compression during model export via HF Optimum
    # default is False
    # NOTE: parsed like the other boolean flags. The previous
    # bool(os.getenv(..., False)) treated any non-empty string — including
    # "0" and "false" — as True, since os.getenv returns a string.
    "VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
    lambda:
    (os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", "0").strip().lower()
     in ("1", "true")),

    # If the env var is set, then all workers will execute as separate
    # processes from the engine, and we use the same mechanism to trigger
    # execution on all workers.
    # Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it.
    "VLLM_USE_RAY_SPMD_WORKER":
    lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))),

    # If the env var is set, it uses the Ray's compiled DAG API
    # which optimizes the control plane overhead.
    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
    "VLLM_USE_RAY_COMPILED_DAG":
    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))),

    # If the env var is set, it uses NCCL for communication in
    # Ray's compiled DAG. This flag is ignored if
    # VLLM_USE_RAY_COMPILED_DAG is not set.
    "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":
    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1"))
                 ),

    # If the env var is set, it enables GPU communication overlap
    # (experimental feature) in Ray's compiled DAG. This flag is ignored if
    # VLLM_USE_RAY_COMPILED_DAG is not set.
    "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM":
    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
                 ),

    # Use dedicated multiprocess context for workers.
    # Both spawn and fork work
    "VLLM_WORKER_MULTIPROC_METHOD":
    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),

    # Path to the cache for storing downloaded assets
    "VLLM_ASSETS_CACHE":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_ASSETS_CACHE",
            os.path.join(get_default_cache_root(), "vllm", "assets"),
        )),

    # Timeout for fetching images when serving multimodal models
    # Default is 5 seconds
    "VLLM_IMAGE_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),

    # Timeout for fetching videos when serving multimodal models
    # Default is 15 seconds
    "VLLM_VIDEO_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "15")),

    # Timeout for fetching audio when serving multimodal models
    # Default is 10 seconds
    "VLLM_AUDIO_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),

    # Path to the XLA persistent cache directory.
    # Only used for XLA devices such as TPUs.
    "VLLM_XLA_CACHE_PATH":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_XLA_CACHE_PATH",
            os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
        )),

    # Chunk size used when splitting fused-MoE work; default 32768.
    "VLLM_FUSED_MOE_CHUNK_SIZE":
    lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),

    # If set, vllm will skip the deprecation warnings.
    "VLLM_NO_DEPRECATION_WARNING":
    lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))),

    # If set, the OpenAI API server will stay alive even after the underlying
    # AsyncLLMEngine errors and stops serving requests
    # NOTE: parsed like the other boolean flags. The previous
    # bool(os.getenv(..., 0)) treated any non-empty string — including "0"
    # and "false" — as True, since os.getenv returns a string.
    "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH":
    lambda: (os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", "0").strip().lower()
             in ("1", "true")),

    # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
    # the user to specify a max sequence length greater than
    # the max length derived from the model's config.json.
    # To enable this, set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1.
    "VLLM_ALLOW_LONG_MAX_MODEL_LEN":
    lambda:
    (os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in
     ("1", "true")),

    # If set, forces FP8 Marlin to be used for FP8 quantization regardless
    # of the hardware support for FP8 compute.
    "VLLM_TEST_FORCE_FP8_MARLIN":
    lambda:
    (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
     ("1", "true")),

    # Weight load format to force in tests; default "dummy".
    # NOTE(review): semantics inferred from the name — confirm at call sites.
    "VLLM_TEST_FORCE_LOAD_FORMAT":
    lambda: os.getenv("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"),

    # Time in ms for the zmq client to wait for a response from the backend
    # server for simple data operations
    "VLLM_RPC_TIMEOUT":
    lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),

    # a list of plugin names to load, separated by commas.
    # if this is not set, it means all plugins will be loaded
    # if this is set to an empty string, no plugins will be loaded
    "VLLM_PLUGINS":
    lambda: None if "VLLM_PLUGINS" not in os.environ else os.environ[
        "VLLM_PLUGINS"].split(","),

    # Enables torch profiler if set. Path to the directory where torch profiler
    # traces are saved. Note that it must be an absolute path.
    "VLLM_TORCH_PROFILER_DIR":
    lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
             .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),

    # If set, vLLM will use Triton implementations of AWQ.
    "VLLM_USE_TRITON_AWQ":
    lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),

    # If set, allow loading or unloading lora adapters in runtime,
    "VLLM_ALLOW_RUNTIME_LORA_UPDATING":
    lambda:
    (os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() in
     ("1", "true")),

    # By default, vLLM will check the peer-to-peer capability itself,
    # in case of broken drivers. See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa
    # If this env var is set to 1, vLLM will skip the peer-to-peer check,
    # and trust the driver's peer-to-peer capability report.
    "VLLM_SKIP_P2P_CHECK":
    lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1",

    # List of quantization kernels that should be disabled, used for testing
    # and performance comparisons. Currently only affects MPLinearKernel
    # selection
    # (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel)
    "VLLM_DISABLED_KERNELS":
    lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[
        "VLLM_DISABLED_KERNELS"].split(","),

    # If set, use the V1 code path.
    "VLLM_USE_V1":
    lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))),

    # If set, enable multiprocessing in LLM for the V1 code path.
    "VLLM_ENABLE_V1_MULTIPROCESSING":
    lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))),

    # Interval for logging batch-size statistics; defaults to -1.
    # NOTE(review): a negative value presumably disables the periodic
    # logging — confirm at the call site.
    "VLLM_LOG_BATCHSIZE_INTERVAL":
    lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),

    # If set to 1, disables the compilation cache.
    # NOTE(review): semantics inferred from the name — confirm at call sites.
    "VLLM_DISABLE_COMPILE_CACHE":
    lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))),

    # If set, vllm will run in development mode, which will enable
    # some additional endpoints for developing and debugging,
    # e.g. `/reset_prefix_cache`
    "VLLM_SERVER_DEV_MODE":
    lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))),

    # Controls the maximum number of requests to handle in a
    # single asyncio task when processing per-token outputs in the
    # V1 AsyncLLM interface. It is applicable when handling a high
    # concurrency of streaming requests.
    # Setting this too high can result in a higher variance of
    # inter-message latencies. Setting it too low can negatively impact
    # TTFT and overall throughput.
    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE":
    lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")),
}
2024-05-02 22:13:49 -07:00
# end-env-vars-definition
2024-05-02 11:13:25 -07:00
2024-07-16 14:12:25 +08:00
def __getattr__(name: str):
    """Resolve module attributes lazily from `environment_variables`.

    Each access re-invokes the registered getter, so changes to the process
    environment are always reflected.
    """
    try:
        getter = environment_variables[name]
    except KeyError:
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}") from None
    return getter()
def __dir__():
    """Advertise the lazily-resolved environment variables to dir()."""
    return [*environment_variables]