diff --git a/vllm/model_executor/models/kimi_k25.py b/vllm/model_executor/models/kimi_k25.py
index d1e7b3582..a9b85f073 100644
--- a/vllm/model_executor/models/kimi_k25.py
+++ b/vllm/model_executor/models/kimi_k25.py
@@ -369,7 +369,7 @@ class KimiK25ForConditionalGeneration(
         target_dtype = next(self.vision_tower.parameters()).dtype
         pixel_values = pixel_values.to(target_dtype)
         assert isinstance(grid_thws, torch.Tensor), (
-            f"expect grid_thws to be a tensor, get {type(grid_thws)}"
+            f"expect grid_thws to be a tensor, got {type(grid_thws)}"
         )
         # In some cases (e.g. with merger), grid_thws has an extra middle dimension
         grid_thws = grid_thws.reshape(-1, grid_thws.shape[-1])
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index 23fb7d9e9..16874c177 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -749,7 +749,10 @@ class FlexAttentionMetadataBuilder(AttentionMetadataBuilder[FlexAttentionMetadat
         prefix_kv_lens = None
         suffix_kv_lens = None
         if use_cascade:
-            raise NotImplementedError("Not yet my friend")
+            raise NotImplementedError(
+                "Cascade prefix attention is not yet implemented "
+                "for FlexAttention backend"
+            )

         block_size = self.kv_cache_spec.block_size
         max_possible_seq_len = self.model_config.max_model_len
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 0f41993fc..c64355ff5 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -253,7 +253,7 @@ def make_local_attention_virtual_batches(
     # seqlens_q_local = [2, 2, 1, 4, 4, 1, 4, 1]
     #
     # First Get batched arange. (E.g., [2, 4, 2] -> [0, 1, 0, 1, 2, 3, 0, 1])
-    # (TODO: max a utility to share this code with _prepare_inputs)
+    # (TODO: make a utility to share this code with _prepare_inputs)
     # arange step 1. [2, 4, 2] -> [2, 6, 8]
     cu_num_blocks = np.cumsum(local_blocks)
     virtual_batches = cu_num_blocks[-1]
diff --git a/vllm/v1/attention/selector.py b/vllm/v1/attention/selector.py
index 40cc10278..c37b709fe 100644
--- a/vllm/v1/attention/selector.py
+++ b/vllm/v1/attention/selector.py
@@ -149,8 +149,8 @@ def _cached_get_mamba_attn_backend(
         selected_backend = MambaAttentionBackendEnum[backend_name]
     except KeyError as e:
         raise ValueError(
-            f"Invalid mamba attention backend type: '{backend_name}'. Valid "
-            f"backends are: {list(MambaAttentionBackendEnum.__members__.keys())}"
+            f"Invalid mamba attention backend type: '{mamba_type}'. Valid "
+            f"types are: {list(MAMBA_TYPE_TO_BACKEND_MAP.keys())}"
         ) from e

     mamba_attn_backend = selected_backend.get_class()