diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index ce2da3cf2..353149aa8 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -263,11 +263,12 @@ class CompilerManager:
             now = time.time()
             elapsed = now - compilation_start_time
             compilation_config.compilation_time += elapsed
-            logger.info(
+            logger.info_once(
                 "Directly load the compiled graph(s) for compile range %s "
                 "from the cache, took %.3f s",
                 str(compile_range),
                 elapsed,
+                scope="local",
             )
             return compiled_graph
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index 19e85ff62..db3275e08 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -91,14 +91,16 @@ def is_deep_gemm_e8m0_used() -> bool:
     _lazy_init()

     if _fp8_gemm_nt_impl is None:
-        logger.info_once("DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found")
+        logger.info_once(
+            "DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found", scope="local"
+        )
         return False

     if envs.VLLM_USE_DEEP_GEMM_E8M0:
-        logger.info_once("DeepGEMM E8M0 enabled on current platform.")
+        logger.info_once("DeepGEMM E8M0 enabled on current platform.", scope="local")
         return True

-    logger.info_once("DeepGEMM E8M0 disabled on current configuration.")
+    logger.info_once("DeepGEMM E8M0 disabled on current configuration.", scope="local")
     return False
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index f1eb4b2bc..969627170 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -119,9 +119,6 @@ class Worker(WorkerBase):

         self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER

-        if self.use_v2_model_runner:
-            logger.info_once("Using V2 Model Runner", scope="global")
-
     def sleep(self, level: int = 1) -> None:
         from vllm.device_allocator.cumem import CuMemAllocator

@@ -240,6 +237,9 @@ class Worker(WorkerBase):
             current_platform.dist_backend,
         )

+        if self.use_v2_model_runner:
+            logger.info_once("Using V2 Model Runner", scope="local")
+
         # Set random seed.
         set_random_seed(self.model_config.seed)