[Log] Reduce duplicate log (#37313)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
Wentao Ye
2026-03-18 10:57:44 -04:00
committed by GitHub
parent de1a86b7de
commit c373b5c00d
8 changed files with 20 additions and 10 deletions

View File

@@ -371,13 +371,15 @@ class CompilerManager:
logger.info_once( logger.info_once(
"Cache the graph of compile range %s for later use", "Cache the graph of compile range %s for later use",
str(compile_range), str(compile_range),
scope="local",
) )
logger.debug( logger.debug_once(
"Store the %s-th graph for compile range%s from %s via handle %s", "Store the %s-th graph for compile range%s from %s via handle %s",
graph_index, graph_index,
str(compile_range), str(compile_range),
self.compiler.name, self.compiler.name,
handle, handle,
scope="local",
) )
# after compiling the last graph, record the end time # after compiling the last graph, record the end time

View File

@@ -228,9 +228,10 @@ class SchedulerConfig:
self.encoder_cache_size = self.max_num_batched_tokens self.encoder_cache_size = self.max_num_batched_tokens
if self.enable_chunked_prefill: if self.enable_chunked_prefill:
logger.info( logger.info_once(
"Chunked prefill is enabled with max_num_batched_tokens=%d.", "Chunked prefill is enabled with max_num_batched_tokens=%d.",
self.max_num_batched_tokens, self.max_num_batched_tokens,
scope="local",
) )
if self.max_num_partial_prefills > 1: if self.max_num_partial_prefills > 1:

View File

@@ -227,7 +227,9 @@ class MMEncoderAttention(CustomOp):
if self.attn_backend == AttentionBackendEnum.FLASHINFER: if self.attn_backend == AttentionBackendEnum.FLASHINFER:
_get_flashinfer_workspace_buffer() _get_flashinfer_workspace_buffer()
logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.") logger.info_once(
f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
)
@classmethod @classmethod
def enabled(cls) -> bool: def enabled(cls) -> bool:

View File

@@ -192,14 +192,15 @@ class ChunkGatedDeltaRule(CustomOp):
use_flashinfer = supports_flashinfer use_flashinfer = supports_flashinfer
if use_flashinfer: if use_flashinfer:
logger.info_once("Using FlashInfer GDN prefill kernel") logger.info_once("Using FlashInfer GDN prefill kernel", scope="local")
logger.info_once( logger.info_once(
"FlashInfer GDN prefill kernel is JIT-compiled; first run may " "FlashInfer GDN prefill kernel is JIT-compiled; first run may "
"take a while to compile. Set `--gdn-prefill-backend triton` to " "take a while to compile. Set `--gdn-prefill-backend triton` to "
"avoid JIT compile time." "avoid JIT compile time.",
scope="local",
) )
else: else:
logger.info_once("Using Triton/FLA GDN prefill kernel") logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local")
self._forward_method = ( self._forward_method = (
self.forward_cuda if use_flashinfer else self.forward_native self.forward_cuda if use_flashinfer else self.forward_native

View File

@@ -387,7 +387,8 @@ class CudaPlatformBase(Platform):
) )
if is_backend_supported: if is_backend_supported:
logger.info_once( logger.info_once(
f"Using backend {vit_attn_backend} for vit attention" f"Using backend {vit_attn_backend} for vit attention",
scope="local",
) )
return vit_attn_backend return vit_attn_backend
except ImportError: except ImportError:

View File

@@ -998,12 +998,13 @@ def set_multiprocessing_worker_envs():
"OMP_NUM_THREADS" not in os.environ "OMP_NUM_THREADS" not in os.environ
and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads
): ):
logger.warning( logger.warning_once(
"Reducing Torch parallelism from %d threads to %d to avoid " "Reducing Torch parallelism from %d threads to %d to avoid "
"unnecessary CPU contention. Set OMP_NUM_THREADS in the " "unnecessary CPU contention. Set OMP_NUM_THREADS in the "
"external environment to tune this value as needed.", "external environment to tune this value as needed.",
current_parallelism, current_parallelism,
default_omp_num_threads, default_omp_num_threads,
scope="local",
) )
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
torch.set_num_threads(default_omp_num_threads) torch.set_num_threads(default_omp_num_threads)

View File

@@ -28,7 +28,8 @@ def _get_device_and_group(parallel_config: ParallelConfig):
# this optimization if we run into this case. # this optimization if we run into this case.
if parallel_config.disable_nccl_for_dp_synchronization: if parallel_config.disable_nccl_for_dp_synchronization:
logger.info_once( logger.info_once(
"Using CPU all reduce to synchronize DP padding between ranks." "Using CPU all reduce to synchronize DP padding between ranks.",
scope="local",
) )
device = "cpu" device = "cpu"
group = get_dp_group().cpu_group group = get_dp_group().cpu_group

View File

@@ -5510,13 +5510,14 @@ class GPUModelRunner(
dummy_modality dummy_modality
] ]
logger.info( logger.info_once(
"Encoder cache will be initialized with a " "Encoder cache will be initialized with a "
"budget of %s tokens, and profiled with " "budget of %s tokens, and profiled with "
"%s %s items of the maximum feature size.", "%s %s items of the maximum feature size.",
encoder_budget, encoder_budget,
max_mm_items_per_batch, max_mm_items_per_batch,
dummy_modality, dummy_modality,
scope="local",
) )
# Create dummy batch of multimodal inputs. # Create dummy batch of multimodal inputs.