[Core] Freeze gc during cuda graph capture to speed up init (#21146)

Signed-off-by: Codex <codex@openai.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Michael Goin
2025-07-23 20:20:14 -04:00
committed by GitHub
parent 82ec66f514
commit f3137cdd81
2 changed files with 23 additions and 1 deletions

View File

@@ -140,6 +140,7 @@ if TYPE_CHECKING:
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
VLLM_USE_CUDNN_PREFILL: bool = False
VLLM_ENABLE_CUDAGRAPH_GC: bool = False
VLLM_LOOPBACK_IP: str = ""
@@ -968,6 +969,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_TRTLLM_DECODE_ATTENTION":
lambda: os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", None),
# Controls garbage collection during CUDA graph capture.
# If set to 0 (default), GC is frozen during capture to speed it up.
# If set to 1 (or any other non-zero integer), GC may run during capture.
"VLLM_ENABLE_CUDAGRAPH_GC":
lambda: bool(int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))),
# Used to force-set the loopback IP address
"VLLM_LOOPBACK_IP":
lambda: os.getenv("VLLM_LOOPBACK_IP", ""),