diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 50d0df589..09fd1f750 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -249,7 +249,7 @@ class CompilerManager: if graph_index == 0: # before compiling the first graph, record the start time global compilation_start_time - compilation_start_time = time.time() + compilation_start_time = time.perf_counter() compilation_counter.num_backend_compilations += 1 @@ -261,8 +261,7 @@ class CompilerManager: if graph_index == num_graphs - 1: # after loading the last graph for this shape, record the time. # there can be multiple graphs due to piecewise compilation. - now = time.time() - elapsed = now - compilation_start_time + elapsed = time.perf_counter() - compilation_start_time compilation_config.compilation_time += elapsed logger.info_once( "Directly load the compiled graph(s) for compile range %s " @@ -362,8 +361,7 @@ class CompilerManager: # after compiling the last graph, record the end time if graph_index == num_graphs - 1: - now = time.time() - elapsed = now - compilation_start_time + elapsed = time.perf_counter() - compilation_start_time compilation_config.compilation_time += elapsed logger.info_once( "Compiling a graph for compile range %s takes %.2f s", @@ -974,7 +972,7 @@ class VllmBackend: compilation_counter.num_graphs_seen += 1 from .monitor import torch_compile_start_time - dynamo_time = time.time() - torch_compile_start_time + dynamo_time = time.perf_counter() - torch_compile_start_time logger.info_once( "Dynamo bytecode transform time: %.2f s", dynamo_time, scope="local" ) diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index 2bad5f0a1..43b9ae508 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -14,7 +14,7 @@ torch_compile_start_time: float = 0.0 def start_monitoring_torch_compile(vllm_config: VllmConfig) -> None: global torch_compile_start_time - torch_compile_start_time = time.time() + torch_compile_start_time = time.perf_counter() compilation_config: CompilationConfig = vllm_config.compilation_config path = vllm_config.compile_debug_dump_path() @@ -30,10 +30,11 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig) -> None: def end_monitoring_torch_compile(vllm_config: VllmConfig) -> None: compilation_config: CompilationConfig = vllm_config.compilation_config + total_compile_time: float = time.perf_counter() - torch_compile_start_time if compilation_config.mode == CompilationMode.VLLM_COMPILE: logger.info_once( "torch.compile takes %.2f s in total", - compilation_config.compilation_time, + total_compile_time, scope="local", ) global context_manager diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 4f6ae2505..f9eb24589 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -5,6 +5,7 @@ import dataclasses import io import json import pickle +import time from collections.abc import Callable from pickle import Pickler from typing import Any @@ -164,7 +165,16 @@ class PiecewiseBackend: if self.is_last_graph and not self.to_be_compiled_ranges: # no specific sizes to compile # save the hash of the inductor graph for the next run + time_before_saving = time.perf_counter() self.vllm_backend.compiler_manager.save_to_file() + elapsed = time.perf_counter() - time_before_saving + if elapsed > 1: + logger.info_once( + "Saved compiler manager cache in %.2f seconds.", + elapsed, + scope="local", + ) + end_monitoring_torch_compile(self.vllm_config) # Call the completion callback (e.g., to save AOT compiled function) if self.on_compilation_complete is not None: