[UX][Startup] Account for CUDA graphs during memory profiling (#30515)
@@ -244,6 +244,7 @@ if TYPE_CHECKING:
     VLLM_CUDA_COMPATIBILITY_PATH: str | None = None
     VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: bool = False
     VLLM_ELASTIC_EP_DRAIN_REQUESTS: bool = False
+    VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: bool = False
 
 
 def get_default_cache_root():
@@ -1628,6 +1629,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ELASTIC_EP_DRAIN_REQUESTS": lambda: bool(
         int(os.getenv("VLLM_ELASTIC_EP_DRAIN_REQUESTS", "0"))
     ),
+    # If set to 1, enable CUDA graph memory estimation during memory profiling.
+    # This profiles CUDA graph memory usage to provide more accurate KV cache
+    # memory allocation. Disabled by default to preserve existing behavior.
+    "VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS": lambda: bool(
+        int(os.getenv("VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS", "0"))
+    ),
 }
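
The new entry follows the file's lazy env-var pattern: the flag is declared under TYPE_CHECKING for static checkers, while the runtime value comes from a zero-argument callable that is evaluated each time the setting is read. The following is a minimal, self-contained sketch of that behavior, not the actual vllm.envs module; read_flag is a hypothetical helper standing in for the module's attribute lookup and is not part of this diff.

    import os
    from typing import Any, Callable

    # Mirrors the dict-of-callables pattern from the hunk above.
    environment_variables: dict[str, Callable[[], Any]] = {
        # "1" -> True; "0" or unset -> False (the default preserves old behavior).
        "VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS": lambda: bool(
            int(os.getenv("VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS", "0"))
        ),
    }

    def read_flag(name: str) -> Any:
        # Sketch-only helper: resolve a flag at access time, so a change to
        # the environment is picked up on the next read.
        return environment_variables[name]()

    os.environ["VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS"] = "1"
    print(read_flag("VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS"))  # True

In practice the flag would be exported in the environment before starting vLLM, so the memory profiler can account for CUDA graph usage when sizing the KV cache.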