[Feature] Support CPU offloading without PyTorch pinned memory, which leads to doubled allocation (#32993)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
10
vllm/envs.py
10
vllm/envs.py
@@ -230,6 +230,8 @@ if TYPE_CHECKING:
|
||||
VLLM_USE_V2_MODEL_RUNNER: bool = False
|
||||
VLLM_LOG_MODEL_INSPECTION: bool = False
|
||||
VLLM_DEBUG_MFU_METRICS: bool = False
|
||||
VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY: bool = False
|
||||
VLLM_WEIGHT_OFFLOADING_DISABLE_UVA: bool = False
|
||||
VLLM_DISABLE_LOG_LOGO: bool = False
|
||||
VLLM_LORA_DISABLE_PDL: bool = False
|
||||
|
||||
@@ -1542,6 +1544,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_DEBUG_MFU_METRICS": lambda: bool(
|
||||
int(os.getenv("VLLM_DEBUG_MFU_METRICS", "0"))
|
||||
),
|
||||
# Disable the use of PyTorch's pinned memory for CPU offloading.
|
||||
"VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY": lambda: bool(
|
||||
int(os.getenv("VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY", "0"))
|
||||
),
|
||||
# Disable using UVA (Unified Virtual Addressing) for CPU offloading.
|
||||
"VLLM_WEIGHT_OFFLOADING_DISABLE_UVA": lambda: bool(
|
||||
int(os.getenv("VLLM_WEIGHT_OFFLOADING_DISABLE_UVA", "0"))
|
||||
),
|
||||
# Disable logging of the vLLM logo at server startup time.
|
||||
"VLLM_DISABLE_LOG_LOGO": lambda: bool(int(os.getenv("VLLM_DISABLE_LOG_LOGO", "0"))),
|
||||
# Disable PDL for LoRA, as enabling PDL with LoRA on SM100 causes
|
||||
|
||||
Reference in New Issue
Block a user