[Log] Reduce duplicate log (#37313)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
Wentao Ye
2026-03-18 10:57:44 -04:00
committed by GitHub
parent de1a86b7de
commit c373b5c00d
8 changed files with 20 additions and 10 deletions

View File

@@ -371,13 +371,15 @@ class CompilerManager:
logger.info_once(
"Cache the graph of compile range %s for later use",
str(compile_range),
scope="local",
)
logger.debug(
logger.debug_once(
"Store the %s-th graph for compile range %s from %s via handle %s",
graph_index,
str(compile_range),
self.compiler.name,
handle,
scope="local",
)
# after compiling the last graph, record the end time

View File

@@ -228,9 +228,10 @@ class SchedulerConfig:
self.encoder_cache_size = self.max_num_batched_tokens
if self.enable_chunked_prefill:
logger.info(
logger.info_once(
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
self.max_num_batched_tokens,
scope="local",
)
if self.max_num_partial_prefills > 1:

View File

@@ -227,7 +227,9 @@ class MMEncoderAttention(CustomOp):
if self.attn_backend == AttentionBackendEnum.FLASHINFER:
_get_flashinfer_workspace_buffer()
logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
logger.info_once(
f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
)
@classmethod
def enabled(cls) -> bool:

View File

@@ -192,14 +192,15 @@ class ChunkGatedDeltaRule(CustomOp):
use_flashinfer = supports_flashinfer
if use_flashinfer:
logger.info_once("Using FlashInfer GDN prefill kernel")
logger.info_once("Using FlashInfer GDN prefill kernel", scope="local")
logger.info_once(
"FlashInfer GDN prefill kernel is JIT-compiled; first run may "
"take a while to compile. Set `--gdn-prefill-backend triton` to "
"avoid JIT compile time."
"avoid JIT compile time.",
scope="local",
)
else:
logger.info_once("Using Triton/FLA GDN prefill kernel")
logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local")
self._forward_method = (
self.forward_cuda if use_flashinfer else self.forward_native

View File

@@ -387,7 +387,8 @@ class CudaPlatformBase(Platform):
)
if is_backend_supported:
logger.info_once(
f"Using backend {vit_attn_backend} for vit attention"
f"Using backend {vit_attn_backend} for vit attention",
scope="local",
)
return vit_attn_backend
except ImportError:

View File

@@ -998,12 +998,13 @@ def set_multiprocessing_worker_envs():
"OMP_NUM_THREADS" not in os.environ
and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads
):
logger.warning(
logger.warning_once(
"Reducing Torch parallelism from %d threads to %d to avoid "
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
"external environment to tune this value as needed.",
current_parallelism,
default_omp_num_threads,
scope="local",
)
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
torch.set_num_threads(default_omp_num_threads)

View File

@@ -28,7 +28,8 @@ def _get_device_and_group(parallel_config: ParallelConfig):
# this optimization if we run into this case.
if parallel_config.disable_nccl_for_dp_synchronization:
logger.info_once(
"Using CPU all reduce to synchronize DP padding between ranks."
"Using CPU all reduce to synchronize DP padding between ranks.",
scope="local",
)
device = "cpu"
group = get_dp_group().cpu_group

View File

@@ -5510,13 +5510,14 @@ class GPUModelRunner(
dummy_modality
]
logger.info(
logger.info_once(
"Encoder cache will be initialized with a "
"budget of %s tokens, and profiled with "
"%s %s items of the maximum feature size.",
encoder_budget,
max_mm_items_per_batch,
dummy_modality,
scope="local",
)
# Create dummy batch of multimodal inputs.