vllm/vllm/compilation/monitor.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import contextlib
import time
from collections.abc import Generator

from vllm.config import CompilationMode, VllmConfig
from vllm.logger import init_logger

logger = init_logger(__name__)

# Shared global so backends.py can read the start time for Dynamo timing.
torch_compile_start_time: float = 0.0


@contextlib.contextmanager
def monitor_torch_compile(
    vllm_config: VllmConfig,
    message: str = "torch.compile took %.2f s in total",
) -> Generator[None, None, None]:
    """Context manager that times torch.compile and manages depyf debugging.

    On normal exit: logs the compile time and exits depyf.
    On exception: cleans up depyf without logging (compilation failed).
    """
    global torch_compile_start_time
    torch_compile_start_time = time.perf_counter()

    compilation_config = vllm_config.compilation_config
    depyf_cm = None
    path = vllm_config.compile_debug_dump_path()
    if compilation_config.mode == CompilationMode.VLLM_COMPILE and path:
        import depyf

        path.mkdir(parents=True, exist_ok=True)
        logger.debug("Dumping depyf output to %s", path)
        depyf_cm = depyf.prepare_debug(path.as_posix())
        depyf_cm.__enter__()

    try:
        yield
    except Exception:
        raise
    else:
        total_compile_time = time.perf_counter() - torch_compile_start_time
        if compilation_config.mode == CompilationMode.VLLM_COMPILE:
            logger.info_once(message, total_compile_time, scope="local")
    finally:
        if depyf_cm is not None:
            try:
                depyf_cm.__exit__(None, None, None)
            except Exception:
                logger.warning("Exception during depyf cleanup.", exc_info=True)


@contextlib.contextmanager
def monitor_profiling_run() -> Generator[None, None, None]:
    """Context manager that times the initial profiling run.

    Asserts that no backend compilation occurs during the profiling run
    (all compilation should have completed before this point).
    """
    from vllm.compilation.counter import compilation_counter

    backend_compilations_before = compilation_counter.num_backend_compilations
    start = time.perf_counter()
    yield
    elapsed = time.perf_counter() - start
    assert (
        compilation_counter.num_backend_compilations == backend_compilations_before
    ), (
        "backend compilation occurred during the initial profiling run; "
        "all compilation should be complete before the profiling run starts."
    )
    logger.info_once(
        "Initial profiling/warmup run took %.2f s",
        elapsed,
        scope="local",
    )


cudagraph_capturing_enabled: bool = True


def validate_cudagraph_capturing_enabled() -> None:
    # used to monitor whether a cudagraph capturing is legal at runtime.
    # should be called before any cudagraph capturing.
    # if an illegal cudagraph capturing happens, raise an error.
    global cudagraph_capturing_enabled
    if not cudagraph_capturing_enabled:
        raise RuntimeError(
            "CUDA graph capturing detected at an inappropriate "
            "time. This operation is currently disabled."
        )


def set_cudagraph_capturing_enabled(enabled: bool) -> None:
    global cudagraph_capturing_enabled
    cudagraph_capturing_enabled = enabled