[Log] Reduce duplicate log (#37313)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
@@ -371,13 +371,15 @@ class CompilerManager:
|
|||||||
logger.info_once(
|
logger.info_once(
|
||||||
"Cache the graph of compile range %s for later use",
|
"Cache the graph of compile range %s for later use",
|
||||||
str(compile_range),
|
str(compile_range),
|
||||||
|
scope="local",
|
||||||
)
|
)
|
||||||
logger.debug(
|
logger.debug_once(
|
||||||
"Store the %s-th graph for compile range%s from %s via handle %s",
|
"Store the %s-th graph for compile range%s from %s via handle %s",
|
||||||
graph_index,
|
graph_index,
|
||||||
str(compile_range),
|
str(compile_range),
|
||||||
self.compiler.name,
|
self.compiler.name,
|
||||||
handle,
|
handle,
|
||||||
|
scope="local",
|
||||||
)
|
)
|
||||||
|
|
||||||
# after compiling the last graph, record the end time
|
# after compiling the last graph, record the end time
|
||||||
|
|||||||
@@ -228,9 +228,10 @@ class SchedulerConfig:
|
|||||||
self.encoder_cache_size = self.max_num_batched_tokens
|
self.encoder_cache_size = self.max_num_batched_tokens
|
||||||
|
|
||||||
if self.enable_chunked_prefill:
|
if self.enable_chunked_prefill:
|
||||||
logger.info(
|
logger.info_once(
|
||||||
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
|
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
|
||||||
self.max_num_batched_tokens,
|
self.max_num_batched_tokens,
|
||||||
|
scope="local",
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.max_num_partial_prefills > 1:
|
if self.max_num_partial_prefills > 1:
|
||||||
|
|||||||
@@ -227,7 +227,9 @@ class MMEncoderAttention(CustomOp):
|
|||||||
if self.attn_backend == AttentionBackendEnum.FLASHINFER:
|
if self.attn_backend == AttentionBackendEnum.FLASHINFER:
|
||||||
_get_flashinfer_workspace_buffer()
|
_get_flashinfer_workspace_buffer()
|
||||||
|
|
||||||
logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
|
logger.info_once(
|
||||||
|
f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def enabled(cls) -> bool:
|
def enabled(cls) -> bool:
|
||||||
|
|||||||
@@ -192,14 +192,15 @@ class ChunkGatedDeltaRule(CustomOp):
|
|||||||
use_flashinfer = supports_flashinfer
|
use_flashinfer = supports_flashinfer
|
||||||
|
|
||||||
if use_flashinfer:
|
if use_flashinfer:
|
||||||
logger.info_once("Using FlashInfer GDN prefill kernel")
|
logger.info_once("Using FlashInfer GDN prefill kernel", scope="local")
|
||||||
logger.info_once(
|
logger.info_once(
|
||||||
"FlashInfer GDN prefill kernel is JIT-compiled; first run may "
|
"FlashInfer GDN prefill kernel is JIT-compiled; first run may "
|
||||||
"take a while to compile. Set `--gdn-prefill-backend triton` to "
|
"take a while to compile. Set `--gdn-prefill-backend triton` to "
|
||||||
"avoid JIT compile time."
|
"avoid JIT compile time.",
|
||||||
|
scope="local",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logger.info_once("Using Triton/FLA GDN prefill kernel")
|
logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local")
|
||||||
|
|
||||||
self._forward_method = (
|
self._forward_method = (
|
||||||
self.forward_cuda if use_flashinfer else self.forward_native
|
self.forward_cuda if use_flashinfer else self.forward_native
|
||||||
|
|||||||
@@ -387,7 +387,8 @@ class CudaPlatformBase(Platform):
|
|||||||
)
|
)
|
||||||
if is_backend_supported:
|
if is_backend_supported:
|
||||||
logger.info_once(
|
logger.info_once(
|
||||||
f"Using backend {vit_attn_backend} for vit attention"
|
f"Using backend {vit_attn_backend} for vit attention",
|
||||||
|
scope="local",
|
||||||
)
|
)
|
||||||
return vit_attn_backend
|
return vit_attn_backend
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|||||||
@@ -998,12 +998,13 @@ def set_multiprocessing_worker_envs():
|
|||||||
"OMP_NUM_THREADS" not in os.environ
|
"OMP_NUM_THREADS" not in os.environ
|
||||||
and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads
|
and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads
|
||||||
):
|
):
|
||||||
logger.warning(
|
logger.warning_once(
|
||||||
"Reducing Torch parallelism from %d threads to %d to avoid "
|
"Reducing Torch parallelism from %d threads to %d to avoid "
|
||||||
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
|
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
|
||||||
"external environment to tune this value as needed.",
|
"external environment to tune this value as needed.",
|
||||||
current_parallelism,
|
current_parallelism,
|
||||||
default_omp_num_threads,
|
default_omp_num_threads,
|
||||||
|
scope="local",
|
||||||
)
|
)
|
||||||
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
|
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
|
||||||
torch.set_num_threads(default_omp_num_threads)
|
torch.set_num_threads(default_omp_num_threads)
|
||||||
|
|||||||
@@ -28,7 +28,8 @@ def _get_device_and_group(parallel_config: ParallelConfig):
|
|||||||
# this optimization if we run into this case.
|
# this optimization if we run into this case.
|
||||||
if parallel_config.disable_nccl_for_dp_synchronization:
|
if parallel_config.disable_nccl_for_dp_synchronization:
|
||||||
logger.info_once(
|
logger.info_once(
|
||||||
"Using CPU all reduce to synchronize DP padding between ranks."
|
"Using CPU all reduce to synchronize DP padding between ranks.",
|
||||||
|
scope="local",
|
||||||
)
|
)
|
||||||
device = "cpu"
|
device = "cpu"
|
||||||
group = get_dp_group().cpu_group
|
group = get_dp_group().cpu_group
|
||||||
|
|||||||
@@ -5510,13 +5510,14 @@ class GPUModelRunner(
|
|||||||
dummy_modality
|
dummy_modality
|
||||||
]
|
]
|
||||||
|
|
||||||
logger.info(
|
logger.info_once(
|
||||||
"Encoder cache will be initialized with a "
|
"Encoder cache will be initialized with a "
|
||||||
"budget of %s tokens, and profiled with "
|
"budget of %s tokens, and profiled with "
|
||||||
"%s %s items of the maximum feature size.",
|
"%s %s items of the maximum feature size.",
|
||||||
encoder_budget,
|
encoder_budget,
|
||||||
max_mm_items_per_batch,
|
max_mm_items_per_batch,
|
||||||
dummy_modality,
|
dummy_modality,
|
||||||
|
scope="local",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create dummy batch of multimodal inputs.
|
# Create dummy batch of multimodal inputs.
|
||||||
|
|||||||
Reference in New Issue
Block a user