Reapply [Attention] Refactor check_and_update_config (#35122)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-03-09 10:17:14 -04:00
parent 5578f2a4d3
commit 77a73458e3
32 changed files with 311 additions and 279 deletions
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -217,10 +217,9 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
                mamba_page_size, kernel_block_alignment_size * attn_page_size_1_token
            )

-        # override attention block size if either (a) the
-        # user has not set it or (b) the user has set it
-        # too small.
-        if cache_config.block_size is None or cache_config.block_size < attn_block_size:
+        # override attention block size if it is too small,
+        # even if the user has explicitly set it
+        if cache_config.block_size < attn_block_size:
            cache_config.block_size = attn_block_size
            logger.info(
                "Setting attention block size to %d tokens "
--- a/vllm/model_executor/models/whisper_causal.py
+++ b/vllm/model_executor/models/whisper_causal.py
@@ -290,16 +290,13 @@ class WhisperCausalAttentionWithBlockPooling(Attention):

        if cache_config is not None:
            kv_cache_dtype = cache_config.cache_dtype
-            block_size = cache_config.block_size
        else:
            kv_cache_dtype = "auto"
-            block_size = 16

        underlying_attn_backend = get_attn_backend(
            head_size,
            dtype,
            kv_cache_dtype,
-            block_size,
            attn_type=attn_type,
        )
        attn_backend = create_whisper_attention_backend_with_block_pooling(