[Log] Reduce duplicate log (#37313)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
Wentao Ye
2026-03-18 10:57:44 -04:00
committed by GitHub
parent de1a86b7de
commit c373b5c00d
8 changed files with 20 additions and 10 deletions

View File

@@ -371,13 +371,15 @@ class CompilerManager:
logger.info_once(
"Cache the graph of compile range %s for later use",
str(compile_range),
scope="local",
)
logger.debug(
logger.debug_once(
"Store the %s-th graph for compile range %s from %s via handle %s",
graph_index,
str(compile_range),
self.compiler.name,
handle,
scope="local",
)
# after compiling the last graph, record the end time

View File

@@ -228,9 +228,10 @@ class SchedulerConfig:
self.encoder_cache_size = self.max_num_batched_tokens
if self.enable_chunked_prefill:
logger.info(
logger.info_once(
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
self.max_num_batched_tokens,
scope="local",
)
if self.max_num_partial_prefills > 1:

View File

@@ -227,7 +227,9 @@ class MMEncoderAttention(CustomOp):
if self.attn_backend == AttentionBackendEnum.FLASHINFER:
_get_flashinfer_workspace_buffer()
logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
logger.info_once(
f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
)
@classmethod
def enabled(cls) -> bool:

View File

@@ -192,14 +192,15 @@ class ChunkGatedDeltaRule(CustomOp):
use_flashinfer = supports_flashinfer
if use_flashinfer:
logger.info_once("Using FlashInfer GDN prefill kernel")
logger.info_once("Using FlashInfer GDN prefill kernel", scope="local")
logger.info_once(
"FlashInfer GDN prefill kernel is JIT-compiled; first run may "
"take a while to compile. Set `--gdn-prefill-backend triton` to "
"avoid JIT compile time."
"avoid JIT compile time.",
scope="local",
)
else:
logger.info_once("Using Triton/FLA GDN prefill kernel")
logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local")
self._forward_method = (
self.forward_cuda if use_flashinfer else self.forward_native

View File

@@ -387,7 +387,8 @@ class CudaPlatformBase(Platform):
)
if is_backend_supported:
logger.info_once(
f"Using backend {vit_attn_backend} for vit attention"
f"Using backend {vit_attn_backend} for vit attention",
scope="local",
)
return vit_attn_backend
except ImportError:

View File

@@ -998,12 +998,13 @@ def set_multiprocessing_worker_envs():
"OMP_NUM_THREADS" not in os.environ
and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads
):
logger.warning(
logger.warning_once(
"Reducing Torch parallelism from %d threads to %d to avoid "
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
"external environment to tune this value as needed.",
current_parallelism,
default_omp_num_threads,
scope="local",
)
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
torch.set_num_threads(default_omp_num_threads)

View File

@@ -28,7 +28,8 @@ def _get_device_and_group(parallel_config: ParallelConfig):
# this optimization if we run into this case.
if parallel_config.disable_nccl_for_dp_synchronization:
logger.info_once(
"Using CPU all reduce to synchronize DP padding between ranks."
"Using CPU all reduce to synchronize DP padding between ranks.",
scope="local",
)
device = "cpu"
group = get_dp_group().cpu_group

View File

@@ -5510,13 +5510,14 @@ class GPUModelRunner(
dummy_modality
]
logger.info(
logger.info_once(
"Encoder cache will be initialized with a "
"budget of %s tokens, and profiled with "
"%s %s items of the maximum feature size.",
encoder_budget,
max_mm_items_per_batch,
dummy_modality,
scope="local",
)
# Create dummy batch of multimodal inputs.