diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index b7cf3614e..bb478fceb 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -546,7 +546,7 @@ class InductorAdaptor(CompilerInterface):
                 hash_str, example_inputs, True, False
             )
             assert inductor_compiled_graph is not None, (
-                "Inductor cache lookup failed. Please remove"
+                "Inductor cache lookup failed. Please remove "
                 f"the cache directory and try again."  # noqa
             )
         elif torch.__version__ >= "2.6":
@@ -557,7 +557,7 @@ class InductorAdaptor(CompilerInterface):
                 hash_str, example_inputs, True, None, constants
             )
             assert inductor_compiled_graph is not None, (
-                "Inductor cache lookup failed. Please remove"
+                "Inductor cache lookup failed. Please remove "
                 f"the cache directory and try again."  # noqa
             )
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 7bffa53bd..d9176773f 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -949,8 +949,8 @@ class CompilationConfig:
             )
             if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
                 logger.warning_once(
-                    "Piecewise compilation with empty splitting_ops do not"
-                    "contains piecewise cudagraph. Setting cudagraph_"
+                    "Piecewise compilation with empty splitting_ops does not "
+                    "contain piecewise cudagraph. Setting cudagraph_"
                     "mode to NONE. Hint: If you are using attention "
                     "backends that support cudagraph, consider manually "
                     "setting cudagraph_mode to FULL or FULL_DECODE_ONLY "
@@ -959,8 +959,8 @@ class CompilationConfig:
                 self.cudagraph_mode = CUDAGraphMode.NONE
             elif self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
                 logger.warning_once(
-                    "Piecewise compilation with empty splitting_ops do "
-                    "not contains piecewise cudagraph. Setting "
+                    "Piecewise compilation with empty splitting_ops does "
+                    "not contain piecewise cudagraph. Setting "
                     "cudagraph_mode to FULL."
                 )
                 self.cudagraph_mode = CUDAGraphMode.FULL
diff --git a/vllm/config/model.py b/vllm/config/model.py
index da3a4618c..f46918029 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1494,7 +1494,7 @@ class ModelConfig:
         if self.runner_type != "pooling" and head_dtype != self.dtype:
             logger.warning_once(
-                "`head_dtype` currently only supports pooling models."
+                "`head_dtype` currently only supports pooling models, "
                 "fallback to model dtype [%s].",
                 self.dtype,
             )
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 88c2e100a..a84acd8e6 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -672,9 +672,9 @@ class VllmConfig:
             and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
         ):
             logger.warning(
-                "Inductor compilation was disabled by user settings,"
-                "Optimizations settings that are only active during"
-                "Inductor compilation will be ignored."
+                "Inductor compilation was disabled by user settings, "
+                "optimizations settings that are only active during "
+                "inductor compilation will be ignored."
             )
         def has_blocked_weights():
@@ -790,7 +790,7 @@ class VllmConfig:
             logger.warning_once(
                 "--kv-sharing-fast-prefill requires changes on model side for "
-                "correctness and to realize prefill savings. "
+                "correctness and to realize prefill savings."
             )
         # TODO: Move after https://github.com/vllm-project/vllm/pull/26847 lands
         self._set_compile_ranges()
@@ -813,7 +813,7 @@ class VllmConfig:
             and not self.cache_config.enable_prefix_caching
         ):
             logger.warning(
-                "KV cache events are on, but prefix caching is not enabled."
+                "KV cache events are on, but prefix caching is not enabled. "
                 "Use --enable-prefix-caching to enable."
             )
         if (
@@ -822,9 +822,9 @@ class VllmConfig:
             and not self.kv_events_config.enable_kv_cache_events
         ):
             logger.warning(
-                "KV cache events are disabled,"
-                "but the scheduler is configured to publish them."
-                "Modify KVEventsConfig.enable_kv_cache_events"
+                "KV cache events are disabled, "
+                "but the scheduler is configured to publish them. "
+                "Modify KVEventsConfig.enable_kv_cache_events "
                 "to True to enable."
             )
         current_platform.check_and_update_config(self)
@@ -893,7 +893,7 @@ class VllmConfig:
                 else "pipeline parallelism"
             )
             logger.warning_once(
-                "Sequence parallelism not supported with"
+                "Sequence parallelism not supported with "
                 "native rms_norm when using %s, "
                 "this will likely lead to an error.",
                 regime,
             )
@@ -910,7 +910,7 @@ class VllmConfig:
             logger.warning_once(
                 "No piecewise cudagraph for executing cascade attention."
                 " Will fall back to eager execution if a batch runs "
-                "into cascade attentions"
+                "into cascade attentions."
             )
         if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py
index 66703a36a..51535f32c 100644
--- a/vllm/lora/ops/triton_ops/utils.py
+++ b/vllm/lora/ops/triton_ops/utils.py
@@ -170,7 +170,7 @@ def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None:
         config_path = Path(f"{user_defined_config_folder}/{config_fname}")
         if not config_path.exists():
-            logger.warning_once(f"No LoRA kernel configs founded in {config_path}")
+            logger.warning_once(f"No LoRA kernel configs found in {config_path}")
             return None
         # Load json
diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py
index eb902afd0..130d85efb 100644
--- a/vllm/v1/attention/backends/fa_utils.py
+++ b/vllm/v1/attention/backends/fa_utils.py
@@ -67,7 +67,7 @@ def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
         # 3. fallback for unsupported combinations
         if device_capability.major == 10 and fa_version == 3:
             logger.warning_once(
-                "Cannot use FA version 3 on Blackwell platform "
+                "Cannot use FA version 3 on Blackwell platform, "
                 "defaulting to FA version 2."
            )
            fa_version = 2