[Hybrid] Add mamba_block_size to Engine Args (#27289)

Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
2025-10-28 14:54:24 +02:00
parent 259504e147
commit 05181cc57f
3 changed files with 25 additions and 8 deletions
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -291,9 +291,8 @@ class MambaModelConfig(VerifyAndUpdateConfig):
        model_config = vllm_config.model_config
        cache_config = vllm_config.cache_config

-        # Set mamba block size to max_model_len (this may get
-        # override by prefix caching logic later)
-        cache_config.mamba_block_size = model_config.max_model_len
+        if cache_config.mamba_block_size is None:
+            cache_config.mamba_block_size = model_config.max_model_len

        if cache_config.enable_prefix_caching:
            if model_config.supports_mamba_prefix_caching:
@@ -333,6 +332,8 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
        if not envs.VLLM_USE_V1:
            return

+        # Save the user input before it gets modified by MambaModelConfig
+        mamba_block_size = vllm_config.cache_config.mamba_block_size
        # Enable FULL_AND_PIECEWISE by default
        MambaModelConfig.verify_and_update_config(vllm_config)

@@ -386,7 +387,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
            # With prefix caching, select attention block size to
            # optimize for mamba kernel performance

-            # mamba SSD kernel uses a chunk_size, e.g. 256
+            # Mamba2 SSD kernel uses a chunk_size, e.g. 256
            # Align the block to the kernel: use lowest multiple of chunk_size
            # of attention tokens that would fit mamba_page_size:
            # e.g. for mamba page size = 788kB
@@ -404,7 +405,8 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
            def lcm(a, b):
                return a * b // gcd(a, b)

-            base_chunk_size = model_config.get_mamba_chunk_size()
+            base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size()
+
            attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token)

            chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)