[Misc] Move print_*_once from utils to logger (#11298)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Co-authored-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
2025-01-09 12:48:12 +08:00
parent 730e9592e9
commit d848800e88
21 changed files with 129 additions and 72 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -32,8 +32,7 @@ from vllm.transformers_utils.config import (
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3
 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
-                        get_cpu_memory, print_warning_once, random_uuid,
-                        resolve_obj_by_qualname)
+                        get_cpu_memory, random_uuid, resolve_obj_by_qualname)

 if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup
@@ -314,7 +313,7 @@ class ModelConfig:
                sliding_window_len_min = get_min_sliding_window(
                    self.hf_text_config.sliding_window)

-                print_warning_once(
+                logger.warning_once(
                    f"{self.hf_text_config.model_type} has interleaved "
                    "attention, which is currently not supported by the "
                    "XFORMERS backend. Disabling sliding window and capping "
@@ -2758,7 +2757,7 @@ class CompilationConfig(BaseModel):

        def model_post_init(self, __context: Any) -> None:
            if not self.enable_reshape and self.enable_fusion:
-                print_warning_once(
+                logger.warning_once(
                    "Fusion enabled but reshape elimination disabled."
                    "RMSNorm + quant (fp8) fusion might not work")

@@ -3151,7 +3150,7 @@ class VllmConfig:
            self.scheduler_config.chunked_prefill_enabled and \
            self.model_config.dtype == torch.float32 and \
            current_platform.get_device_capability() == (7, 5):
-            print_warning_once(
+            logger.warning_once(
                "Turing devices tensor cores do not support float32 matmul. "
                "To workaround this limitation, vLLM will set 'ieee' input "
                "precision for chunked prefill triton kernels.")