[Log] Reduce duplicate log (#37313)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
@@ -371,13 +371,15 @@ class CompilerManager:
|
||||
logger.info_once(
|
||||
"Cache the graph of compile range %s for later use",
|
||||
str(compile_range),
|
||||
scope="local",
|
||||
)
|
||||
logger.debug(
|
||||
logger.debug_once(
|
||||
"Store the %s-th graph for compile range%s from %s via handle %s",
|
||||
graph_index,
|
||||
str(compile_range),
|
||||
self.compiler.name,
|
||||
handle,
|
||||
scope="local",
|
||||
)
|
||||
|
||||
# after compiling the last graph, record the end time
|
||||
|
||||
@@ -228,9 +228,10 @@ class SchedulerConfig:
|
||||
self.encoder_cache_size = self.max_num_batched_tokens
|
||||
|
||||
if self.enable_chunked_prefill:
|
||||
logger.info(
|
||||
logger.info_once(
|
||||
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
|
||||
self.max_num_batched_tokens,
|
||||
scope="local",
|
||||
)
|
||||
|
||||
if self.max_num_partial_prefills > 1:
|
||||
|
||||
@@ -227,7 +227,9 @@ class MMEncoderAttention(CustomOp):
|
||||
if self.attn_backend == AttentionBackendEnum.FLASHINFER:
|
||||
_get_flashinfer_workspace_buffer()
|
||||
|
||||
logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
|
||||
logger.info_once(
|
||||
f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def enabled(cls) -> bool:
|
||||
|
||||
@@ -192,14 +192,15 @@ class ChunkGatedDeltaRule(CustomOp):
|
||||
use_flashinfer = supports_flashinfer
|
||||
|
||||
if use_flashinfer:
|
||||
logger.info_once("Using FlashInfer GDN prefill kernel")
|
||||
logger.info_once("Using FlashInfer GDN prefill kernel", scope="local")
|
||||
logger.info_once(
|
||||
"FlashInfer GDN prefill kernel is JIT-compiled; first run may "
|
||||
"take a while to compile. Set `--gdn-prefill-backend triton` to "
|
||||
"avoid JIT compile time."
|
||||
"avoid JIT compile time.",
|
||||
scope="local",
|
||||
)
|
||||
else:
|
||||
logger.info_once("Using Triton/FLA GDN prefill kernel")
|
||||
logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local")
|
||||
|
||||
self._forward_method = (
|
||||
self.forward_cuda if use_flashinfer else self.forward_native
|
||||
|
||||
@@ -387,7 +387,8 @@ class CudaPlatformBase(Platform):
|
||||
)
|
||||
if is_backend_supported:
|
||||
logger.info_once(
|
||||
f"Using backend {vit_attn_backend} for vit attention"
|
||||
f"Using backend {vit_attn_backend} for vit attention",
|
||||
scope="local",
|
||||
)
|
||||
return vit_attn_backend
|
||||
except ImportError:
|
||||
|
||||
@@ -998,12 +998,13 @@ def set_multiprocessing_worker_envs():
|
||||
"OMP_NUM_THREADS" not in os.environ
|
||||
and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads
|
||||
):
|
||||
logger.warning(
|
||||
logger.warning_once(
|
||||
"Reducing Torch parallelism from %d threads to %d to avoid "
|
||||
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
|
||||
"external environment to tune this value as needed.",
|
||||
current_parallelism,
|
||||
default_omp_num_threads,
|
||||
scope="local",
|
||||
)
|
||||
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
|
||||
torch.set_num_threads(default_omp_num_threads)
|
||||
|
||||
@@ -28,7 +28,8 @@ def _get_device_and_group(parallel_config: ParallelConfig):
|
||||
# this optimization if we run into this case.
|
||||
if parallel_config.disable_nccl_for_dp_synchronization:
|
||||
logger.info_once(
|
||||
"Using CPU all reduce to synchronize DP padding between ranks."
|
||||
"Using CPU all reduce to synchronize DP padding between ranks.",
|
||||
scope="local",
|
||||
)
|
||||
device = "cpu"
|
||||
group = get_dp_group().cpu_group
|
||||
|
||||
@@ -5510,13 +5510,14 @@ class GPUModelRunner(
|
||||
dummy_modality
|
||||
]
|
||||
|
||||
logger.info(
|
||||
logger.info_once(
|
||||
"Encoder cache will be initialized with a "
|
||||
"budget of %s tokens, and profiled with "
|
||||
"%s %s items of the maximum feature size.",
|
||||
encoder_budget,
|
||||
max_mm_items_per_batch,
|
||||
dummy_modality,
|
||||
scope="local",
|
||||
)
|
||||
|
||||
# Create dummy batch of multimodal inputs.
|
||||
|
||||
Reference in New Issue
Block a user