[Misc] improve warning/assert messages (#32226)
Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com>
This commit is contained in:
@@ -546,7 +546,7 @@ class InductorAdaptor(CompilerInterface):
 hash_str, example_inputs, True, False
 )
 assert inductor_compiled_graph is not None, (
-"Inductor cache lookup failed. Please remove"
+"Inductor cache lookup failed. Please remove "
 f"the cache directory and try again." # noqa
 )
 elif torch.__version__ >= "2.6":
@@ -557,7 +557,7 @@ class InductorAdaptor(CompilerInterface):
 hash_str, example_inputs, True, None, constants
 )
 assert inductor_compiled_graph is not None, (
-"Inductor cache lookup failed. Please remove"
+"Inductor cache lookup failed. Please remove "
 f"the cache directory and try again." # noqa
 )

@@ -949,8 +949,8 @@ class CompilationConfig:
 )
 if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
 logger.warning_once(
-"Piecewise compilation with empty splitting_ops do not"
-"contains piecewise cudagraph. Setting cudagraph_"
+"Piecewise compilation with empty splitting_ops does not "
+"contain piecewise cudagraph. Setting cudagraph_"
 "mode to NONE. Hint: If you are using attention "
 "backends that support cudagraph, consider manually "
 "setting cudagraph_mode to FULL or FULL_DECODE_ONLY "
@@ -959,8 +959,8 @@ class CompilationConfig:
 self.cudagraph_mode = CUDAGraphMode.NONE
 elif self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
 logger.warning_once(
-"Piecewise compilation with empty splitting_ops do "
-"not contains piecewise cudagraph. Setting "
+"Piecewise compilation with empty splitting_ops does "
+"not contain piecewise cudagraph. Setting "
 "cudagraph_mode to FULL."
 )
 self.cudagraph_mode = CUDAGraphMode.FULL
@@ -1494,7 +1494,7 @@ class ModelConfig:

 if self.runner_type != "pooling" and head_dtype != self.dtype:
 logger.warning_once(
-"`head_dtype` currently only supports pooling models."
+"`head_dtype` currently only supports pooling models, "
 "fallback to model dtype [%s].",
 self.dtype,
 )
@@ -672,9 +672,9 @@ class VllmConfig:
 and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
 ):
 logger.warning(
-"Inductor compilation was disabled by user settings,"
-"Optimizations settings that are only active during"
-"Inductor compilation will be ignored."
+"Inductor compilation was disabled by user settings, "
+"optimizations settings that are only active during "
+"inductor compilation will be ignored."
 )

 def has_blocked_weights():
@@ -790,7 +790,7 @@ class VllmConfig:

 logger.warning_once(
 "--kv-sharing-fast-prefill requires changes on model side for "
-"correctness and to realize prefill savings. "
+"correctness and to realize prefill savings."
 )
 # TODO: Move after https://github.com/vllm-project/vllm/pull/26847 lands
 self._set_compile_ranges()
@@ -813,7 +813,7 @@ class VllmConfig:
 and not self.cache_config.enable_prefix_caching
 ):
 logger.warning(
-"KV cache events are on, but prefix caching is not enabled."
+"KV cache events are on, but prefix caching is not enabled. "
 "Use --enable-prefix-caching to enable."
 )
 if (
@@ -822,9 +822,9 @@ class VllmConfig:
 and not self.kv_events_config.enable_kv_cache_events
 ):
 logger.warning(
-"KV cache events are disabled,"
-"but the scheduler is configured to publish them."
-"Modify KVEventsConfig.enable_kv_cache_events"
+"KV cache events are disabled, "
+"but the scheduler is configured to publish them. "
+"Modify KVEventsConfig.enable_kv_cache_events "
 "to True to enable."
 )
 current_platform.check_and_update_config(self)
@@ -893,7 +893,7 @@ class VllmConfig:
 else "pipeline parallelism"
 )
 logger.warning_once(
-"Sequence parallelism not supported with"
+"Sequence parallelism not supported with "
 "native rms_norm when using %s, "
 "this will likely lead to an error.",
 regime,
@@ -910,7 +910,7 @@ class VllmConfig:
 logger.warning_once(
 "No piecewise cudagraph for executing cascade attention."
 " Will fall back to eager execution if a batch runs "
-"into cascade attentions"
+"into cascade attentions."
 )

 if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
@@ -170,7 +170,7 @@ def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None:

 config_path = Path(f"{user_defined_config_folder}/{config_fname}")
 if not config_path.exists():
-logger.warning_once(f"No LoRA kernel configs founded in {config_path}")
+logger.warning_once(f"No LoRA kernel configs found in {config_path}")
 return None

 # Load json
@@ -67,7 +67,7 @@ def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
 # 3. fallback for unsupported combinations
 if device_capability.major == 10 and fa_version == 3:
 logger.warning_once(
-"Cannot use FA version 3 on Blackwell platform "
+"Cannot use FA version 3 on Blackwell platform, "
 "defaulting to FA version 2."
 )
 fa_version = 2
Reference in New Issue
Block a user