[Feature] Support CPU offloading without PyTorch pinned memory, which leads to doubled allocation (#32993)

Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Wei Zhao
2026-02-13 11:11:26 -05:00
committed by GitHub
parent 4a9952ec1b
commit 59d53066d8
6 changed files with 127 additions and 62 deletions
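
The "doubled allocation" in the title refers to a property of PyTorch's Tensor.pin_memory(): pinning an existing pageable CPU tensor returns a new page-locked copy, so both copies occupy host memory until the original is released. A minimal illustration of that effect (not code from this commit; assumes a CUDA-enabled PyTorch build):

import torch

# Pageable CPU buffer, e.g. a weight that has been offloaded from the GPU.
weight = torch.empty(1024, 1024, dtype=torch.float16)

# pin_memory() returns a *new* page-locked copy; until `weight` is released,
# host memory for this tensor is held twice.
pinned = weight.pin_memory()

# Allocating the buffer page-locked up front avoids the transient duplicate,
# and skipping pinning entirely (what the new flag opts into) avoids it as well.
direct = torch.empty(1024, 1024, dtype=torch.float16, pin_memory=True)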

@@ -230,6 +230,8 @@ if TYPE_CHECKING:
     VLLM_USE_V2_MODEL_RUNNER: bool = False
     VLLM_LOG_MODEL_INSPECTION: bool = False
     VLLM_DEBUG_MFU_METRICS: bool = False
+    VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY: bool = False
+    VLLM_WEIGHT_OFFLOADING_DISABLE_UVA: bool = False
     VLLM_DISABLE_LOG_LOGO: bool = False
     VLLM_LORA_DISABLE_PDL: bool = False
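
The two hunks follow vLLM's env-var convention: the TYPE_CHECKING block above gives static checkers a typed default, while the dict of lambdas in the next hunk defers reading os.environ until the flag is first accessed. A rough sketch of that pattern (an assumption about how vllm/envs.py wires it up, not an excerpt from it):

import os
from typing import TYPE_CHECKING, Any, Callable

if TYPE_CHECKING:
    VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY: bool = False

environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY": lambda: bool(
        int(os.getenv("VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY", "0"))
    ),
}

def __getattr__(name: str) -> Any:
    # PEP 562 module-level __getattr__: the lambda runs on access, so the flag
    # reflects the environment at read time rather than at import time.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")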
@@ -1542,6 +1544,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DEBUG_MFU_METRICS": lambda: bool(
int(os.getenv("VLLM_DEBUG_MFU_METRICS", "0"))
),
# Disable using pytorch's pin memory for CPU offloading.
"VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY": lambda: bool(
int(os.getenv("VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY", "0"))
),
# Disable using UVA (Unified Virtual Addressing) for CPU offloading.
"VLLM_WEIGHT_OFFLOADING_DISABLE_UVA": lambda: bool(
int(os.getenv("VLLM_WEIGHT_OFFLOADING_DISABLE_UVA", "0"))
),
# Disable logging of vLLM logo at server startup time.
"VLLM_DISABLE_LOG_LOGO": lambda: bool(int(os.getenv("VLLM_DISABLE_LOG_LOGO", "0"))),
# Disable PDL for LoRA, as enabling PDL with LoRA on SM100 causes
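
A hypothetical way to opt into the new behavior (the flag names come from this diff; how the offloading backend reacts is described only by the commit title, so treat the effect as an assumption):

import os

# Must be set before vLLM first reads the flag (e.g. before engine start).
os.environ["VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY"] = "1"
# os.environ["VLLM_WEIGHT_OFFLOADING_DISABLE_UVA"] = "1"  # analogous opt-out for UVA

# Leaving a flag unset, or setting it to "0", keeps the default behavior,
# matching the "0" fallbacks in the diff above.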