[core][distributed] exact ray placement control (#12732)
Signed-off-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
14
vllm/envs.py
14
vllm/envs.py
@@ -85,6 +85,8 @@ if TYPE_CHECKING:
|
||||
VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
|
||||
VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True
|
||||
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
|
||||
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
|
||||
VLLM_RAY_BUNDLE_INDICES: str = ""
|
||||
|
||||
|
||||
def get_default_cache_root():
|
||||
@@ -550,6 +552,18 @@ environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
|
||||
),
|
||||
|
||||
# Number of GPUs per worker in Ray. If it is set to a fraction,
|
||||
# it allows ray to schedule multiple actors on a single GPU,
|
||||
# so that users can colocate other actors on the same GPUs as vLLM.
|
||||
"VLLM_RAY_PER_WORKER_GPUS":
|
||||
lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")),
|
||||
|
||||
# Bundle indices for Ray. If set, this controls precisely
|
||||
# which indices are used for the Ray bundle, for every worker.
|
||||
# Format: comma-separated list of integers, e.g. "0,1,2,3"
|
||||
"VLLM_RAY_BUNDLE_INDICES":
|
||||
lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""),
|
||||
|
||||
# When on an Nvidia GPU, aligns single entries (within a page) so they are 256
|
||||
# byte aligned for better performance; this increases the memory usage of
|
||||
# the cache. Currently this only affects MLA that results in non-256
|
||||
|
||||
Reference in New Issue
Block a user