[UX][Startup] Account for CUDA graphs during memory profiling (#30515)

Author: Matthew Bonanni
Date: 2026-03-07 16:49:23 -05:00
Committed by: GitHub
Parent: 85f50eb41f
Commit: ebb9cc5f2b
6 changed files with 360 additions and 61 deletions

vllm/envs.py

@@ -244,6 +244,7 @@ if TYPE_CHECKING:
     VLLM_CUDA_COMPATIBILITY_PATH: str | None = None
     VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: bool = False
     VLLM_ELASTIC_EP_DRAIN_REQUESTS: bool = False
+    VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: bool = False


 def get_default_cache_root():
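
The stub added in this first hunk only informs type checkers; at runtime, vllm/envs.py resolves each variable lazily through the lambda table shown in the next hunk, exposed via a module-level __getattr__ (PEP 562). A minimal standalone sketch of that pattern, using a generic SOME_FLAG in place of the real variable name:

import os
from collections.abc import Callable
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Static declaration: type checkers see envs.SOME_FLAG as a bool,
    # even though the runtime value is computed on demand.
    SOME_FLAG: bool = False

# Each entry maps a variable name to a thunk that parses it on access.
environment_variables: dict[str, Callable[[], Any]] = {
    "SOME_FLAG": lambda: bool(int(os.getenv("SOME_FLAG", "0"))),
}

def __getattr__(name: str) -> Any:
    # PEP 562: invoked for names not found in the module namespace,
    # so the env var is read and parsed only on first access.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")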
@@ -1628,6 +1629,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ELASTIC_EP_DRAIN_REQUESTS": lambda: bool(
         int(os.getenv("VLLM_ELASTIC_EP_DRAIN_REQUESTS", "0"))
     ),
+    # If set to 1, enable CUDA graph memory estimation during memory profiling.
+    # This profiles CUDA graph memory usage to provide more accurate KV cache
+    # memory allocation. Disabled by default to preserve existing behavior.
+    "VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS": lambda: bool(
+        int(os.getenv("VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS", "0"))
+    ),
 }
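
For context on what the new flag estimates: capturing a CUDA graph reserves device memory for the graph and its private allocation pool, and that memory would otherwise be counted toward the budget the profiler hands to the KV cache. Below is a rough, standalone sketch of how such a capture cost can be measured in PyTorch; the helper name and structure are illustrative assumptions, not code from this commit:

import torch

def cudagraph_capture_cost_bytes(fn, *args) -> int:
    """Bytes of device memory reserved by capturing fn(*args) as a CUDA graph.

    Illustrative sketch only; vLLM's actual profiling logic lives elsewhere.
    """
    # Warm up on a side stream first: capture requires settled kernels and
    # allocator state (standard PyTorch CUDA-graphs practice).
    side = torch.cuda.Stream()
    side.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side):
        for _ in range(3):
            fn(*args)
    torch.cuda.current_stream().wait_stream(side)

    torch.cuda.synchronize()
    before = torch.cuda.memory_reserved()

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):  # allocations go to the graph's private pool
        fn(*args)

    torch.cuda.synchronize()
    return torch.cuda.memory_reserved() - before

if torch.cuda.is_available():
    a = torch.randn(4096, 4096, device="cuda")
    b = torch.randn(4096, 4096, device="cuda")
    cost = cudagraph_capture_cost_bytes(torch.matmul, a, b)
    print(f"graph capture reserved ~{cost / 2**20:.1f} MiB")

Per the comment in the diff, setting VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 opts into this estimation for more accurate KV cache sizing, while the default of 0 preserves the previous profiling behavior.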