[1/N] Elastic EP Milestone 2 (#34861)
Signed-off-by: Yongji Wu <wuyongji317@gmail.com> Signed-off-by: Itay Alroy <ialroy@nvidia.com> Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Signed-off-by: Ron Tourgeman <rtourgeman@nvidia.com> Co-authored-by: Yongji Wu <wuyongji317@gmail.com> Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Co-authored-by: Ron Tourgeman <rtourgeman@nvidia.com>
This commit is contained in:
12
vllm/envs.py
12
vllm/envs.py
@@ -243,6 +243,8 @@ if TYPE_CHECKING:
|
||||
VLLM_LORA_DISABLE_PDL: bool = False
|
||||
VLLM_ENABLE_CUDA_COMPATIBILITY: bool = False
|
||||
VLLM_CUDA_COMPATIBILITY_PATH: str | None = None
|
||||
VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: bool = False
|
||||
VLLM_ELASTIC_EP_DRAIN_REQUESTS: bool = False
|
||||
|
||||
|
||||
def get_default_cache_root():
|
||||
@@ -1617,6 +1619,16 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_CUDA_COMPATIBILITY_PATH": lambda: os.environ.get(
|
||||
"VLLM_CUDA_COMPATIBILITY_PATH", None
|
||||
),
|
||||
# Whether this is a scale-up launch engine for elastic EP.
|
||||
# Should only be set by EngineCoreClient.
|
||||
"VLLM_ELASTIC_EP_SCALE_UP_LAUNCH": lambda: bool(
|
||||
int(os.getenv("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH", "0"))
|
||||
),
|
||||
# Whether to wait for all requests to drain before sending the
|
||||
# scaling command in elastic EP.
|
||||
"VLLM_ELASTIC_EP_DRAIN_REQUESTS": lambda: bool(
|
||||
int(os.getenv("VLLM_ELASTIC_EP_DRAIN_REQUESTS", "0"))
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user