[Cleanup] Refactor profiling env vars into a CLI config (#29912)

Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Benjamin Chislett
Date: 2025-12-09 13:29:33 -05:00
Committed by: GitHub
Parent: d471b2aff0
Commit: e858bfe051
22 changed files with 433 additions and 252 deletions

vllm/config/__init__.py

@@ -24,6 +24,7 @@ from vllm.config.multimodal import MultiModalConfig
 from vllm.config.observability import ObservabilityConfig
 from vllm.config.parallel import EPLBConfig, ParallelConfig
 from vllm.config.pooler import PoolerConfig
+from vllm.config.profiler import ProfilerConfig
 from vllm.config.scheduler import SchedulerConfig
 from vllm.config.speculative import SpeculativeConfig
 from vllm.config.speech_to_text import SpeechToTextConfig
@@ -89,6 +90,8 @@ __all__ = [
     "SpeechToTextConfig",
     # From vllm.config.structured_outputs
     "StructuredOutputsConfig",
+    # From vllm.config.profiler
+    "ProfilerConfig",
     # From vllm.config.utils
     "ConfigType",
     "SupportsMetricsInfo",

vllm/config/profiler.py (new file)

@@ -0,0 +1,199 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
from typing import Any, Literal

from pydantic import Field, model_validator
from pydantic.dataclasses import dataclass
from typing_extensions import Self

import vllm.envs as envs
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash

logger = init_logger(__name__)

ProfilerKind = Literal["torch", "cuda"]


@config
@dataclass
class ProfilerConfig:
    """Dataclass which contains profiler config for the engine."""

    profiler: ProfilerKind | None = None
    """Which profiler to use. Defaults to None. Options are:\n
    - 'torch': Use the PyTorch profiler.\n
    - 'cuda': Use the CUDA profiler."""

    torch_profiler_dir: str = ""
    """Directory to save torch profiler traces. Both AsyncLLM's CPU traces
    and the workers' traces (CPU & GPU) will be saved under this directory.
    Note that it must be an absolute path."""

    torch_profiler_with_stack: bool = True
    """If `True`, enables stack tracing in the torch profiler. Enabled by
    default."""

    torch_profiler_with_flops: bool = False
    """If `True`, enables FLOPS counting in the torch profiler. Disabled by
    default."""

    torch_profiler_use_gzip: bool = True
    """If `True`, saves torch profiler traces in gzip format. Enabled by
    default."""

    torch_profiler_dump_cuda_time_total: bool = True
    """If `True`, dumps total CUDA time in torch profiler traces. Enabled by
    default."""

    torch_profiler_record_shapes: bool = False
    """If `True`, records tensor shapes in the torch profiler. Disabled by
    default."""

    torch_profiler_with_memory: bool = False
    """If `True`, enables memory profiling in the torch profiler. Disabled by
    default."""

    ignore_frontend: bool = False
    """If `True`, disables the front-end profiling of AsyncLLM when using the
    'torch' profiler. This is needed to reduce overhead when using the
    delay/limit options, since the front-end profiling does not track
    iterations and would capture the entire range."""

    delay_iterations: int = Field(default=0, ge=0)
    """Number of engine iterations to skip before starting profiling.
    Defaults to 0, meaning profiling starts immediately after receiving
    /start_profile."""

    max_iterations: int = Field(default=0, ge=0)
    """Maximum number of engine iterations to profile after profiling has
    started. Defaults to 0, meaning no limit."""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config, ensure that it
        is included in the factors list if it affects the computation graph.

        Provide a hash that uniquely identifies all the configs that affect
        the structure of the computation graph from input ids/embeddings to
        the final hidden states, excluding anything before input
        ids/embeddings and after the final hidden states.
        """
        # No factors to consider: this config does not affect the
        # computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(
            str(factors).encode(), usedforsecurity=False
        ).hexdigest()
        return hash_str

    def _get_from_env_if_set(
        self, field_name: str, env_var_name: str
    ) -> Any | None:
        """Get a field's value from its legacy env var if set, emitting a
        deprecation warning. Returns None when the env var is unset."""
        if envs.is_set(env_var_name):
            value = getattr(envs, env_var_name)
            logger.warning_once(
                "Using the %s environment variable is deprecated and will be "
                "removed in v0.14.0 or v1.0.0, whichever is soonest. Please "
                "use the --profiler-config.%s command line argument or the "
                "ProfilerConfig(%s=...) config field instead.",
                env_var_name,
                field_name,
                field_name,
            )
            return value
        return None

    def _set_from_env_if_set(
        self,
        field_name: str,
        env_var_name: str,
        to_bool: bool = True,
        to_int: bool = False,
    ) -> None:
        """Set a field from its legacy env var if set, with a deprecation
        warning."""
        value = self._get_from_env_if_set(field_name, env_var_name)
        if value is not None:
            if to_bool:
                # Legacy env vars encode booleans as "1"/"0".
                value = value == "1"
            if to_int:
                value = int(value)
            setattr(self, field_name, value)

    @model_validator(mode="after")
    def _validate_profiler_config(self) -> Self:
        maybe_use_cuda_profiler = self._get_from_env_if_set(
            "profiler", "VLLM_TORCH_CUDA_PROFILE"
        )
        if maybe_use_cuda_profiler is not None:
            self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None
        else:
            self._set_from_env_if_set(
                "torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False
            )
            if self.torch_profiler_dir:
                self.profiler = "torch"

        self._set_from_env_if_set(
            "torch_profiler_record_shapes",
            "VLLM_TORCH_PROFILER_RECORD_SHAPES",
        )
        self._set_from_env_if_set(
            "torch_profiler_with_memory",
            "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY",
        )
        self._set_from_env_if_set(
            "torch_profiler_with_stack",
            "VLLM_TORCH_PROFILER_WITH_STACK",
        )
        self._set_from_env_if_set(
            "torch_profiler_with_flops",
            "VLLM_TORCH_PROFILER_WITH_FLOPS",
        )
        self._set_from_env_if_set(
            "ignore_frontend",
            "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM",
        )
        self._set_from_env_if_set(
            "torch_profiler_use_gzip",
            "VLLM_TORCH_PROFILER_USE_GZIP",
        )
        self._set_from_env_if_set(
            "torch_profiler_dump_cuda_time_total",
            "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL",
        )
        self._set_from_env_if_set(
            "delay_iterations",
            "VLLM_PROFILER_DELAY_ITERS",
            to_bool=False,
            to_int=True,
        )
        self._set_from_env_if_set(
            "max_iterations",
            "VLLM_PROFILER_MAX_ITERS",
            to_bool=False,
            to_int=True,
        )

        has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0
        if (
            self.profiler == "torch"
            and has_delay_or_limit
            and not self.ignore_frontend
        ):
            logger.warning_once(
                "Using the 'torch' profiler with delay_iterations or "
                "max_iterations while ignore_frontend is False may result in "
                "high overhead."
            )

        profiler_dir = self.torch_profiler_dir
        if profiler_dir and self.profiler != "torch":
            raise ValueError(
                "torch_profiler_dir is only applicable when profiler is set "
                "to 'torch'"
            )
        if self.profiler == "torch" and not profiler_dir:
            raise ValueError(
                "torch_profiler_dir must be set when profiler is 'torch'"
            )
        if profiler_dir:
            # Leave Google Cloud Storage URIs (gs://bucket/...) untouched;
            # only local paths are expanded and made absolute.
            is_gs_path = (
                profiler_dir.startswith("gs://")
                and profiler_dir[5:]
                and profiler_dir[5] != "/"
            )
            if not is_gs_path:
                self.torch_profiler_dir = os.path.abspath(
                    os.path.expanduser(profiler_dir)
                )
        return self
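
The deprecation warning above spells out the migration path. A hedged sketch of the new style (the trace path is illustrative; on the CLI the same fields are spelled --profiler-config.<field>, per the warning message):

import os

from vllm.config import ProfilerConfig

# New style: set fields directly on the config.
cfg = ProfilerConfig(
    profiler="torch",
    torch_profiler_dir="~/vllm-traces",  # illustrative path
    torch_profiler_with_stack=True,
)

# The validator expands and absolutizes local paths; gs:// URIs are kept as-is.
assert os.path.isabs(cfg.torch_profiler_dir)

# Old style (deprecated): VLLM_TORCH_PROFILER_DIR and friends are still
# honored by the validator, but each one now emits the deprecation warning.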

vllm/config/vllm.py

@@ -39,6 +39,7 @@ from .lora import LoRAConfig
 from .model import ModelConfig
 from .observability import ObservabilityConfig
 from .parallel import ParallelConfig
+from .profiler import ProfilerConfig
 from .scheduler import SchedulerConfig
 from .speculative import SpeculativeConfig
 from .structured_outputs import StructuredOutputsConfig
@@ -218,6 +219,8 @@ class VllmConfig:
     You can specify the full compilation config like so:
     `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
     """
+    profiler_config: ProfilerConfig = Field(default_factory=ProfilerConfig)
+    """Profiling configuration."""
     kv_transfer_config: KVTransferConfig | None = None
     """The configurations for distributed KV cache transfer."""
     kv_events_config: KVEventsConfig | None = None
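
A minimal sketch (function name hypothetical) of the pattern this new field enables: worker code consults one config object instead of reading scattered VLLM_TORCH_PROFILER_* environment variables.

from vllm.config import VllmConfig


def should_start_torch_profiler(vllm_config: VllmConfig) -> bool:
    # All profiling knobs now live on vllm_config.profiler_config.
    pc = vllm_config.profiler_config
    return pc.profiler == "torch" and bool(pc.torch_profiler_dir)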
@@ -296,6 +299,8 @@ class VllmConfig:
             vllm_factors.append("None")
         if self.structured_outputs_config:
             vllm_factors.append(self.structured_outputs_config.compute_hash())
+        if self.profiler_config:
+            vllm_factors.append(self.profiler_config.compute_hash())
         else:
             vllm_factors.append("None")
         vllm_factors.append(self.observability_config.compute_hash())
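
Since ProfilerConfig.compute_hash hashes an empty factor list, profiling settings never perturb the overall compilation hash. A small sketch of the implied invariant (directory illustrative):

from vllm.config import ProfilerConfig

# Two configs with different profiler settings hash identically, so toggling
# profiling should not invalidate compiled artifacts.
plain = ProfilerConfig()
tracing = ProfilerConfig(profiler="torch", torch_profiler_dir="/tmp/vllm-traces")
assert plain.compute_hash() == tracing.compute_hash()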