[Hybrid Allocator] Support KV cache groups with different block_size (#29143)

Signed-off-by: Yifan Qiao <yifanqiao@berkeley.edu> Co-authored-by: Chen Zhang <zhangch99@outlook.com>
2025-11-25 07:30:57 -08:00
parent e502098643
commit 48ddb02b79
11 changed files with 472 additions and 87 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1816,9 +1816,11 @@ class EngineArgs:
        if model_config.runner_type != "pooling":
            default_chunked_prefill = True

-            # Disable prefix caching default for hybrid models
-            # since the feature is still experimental.
-            default_prefix_caching = not model_config.is_hybrid
+            # Disable prefix caching by default for hybrid models and mamba-only
+            # models since the feature is still experimental.
+            default_prefix_caching = not (
+                model_config.is_hybrid or model_config.is_attention_free
+            )
        else:
            assert model_config.pooler_config is not None