[v1][attention] Support Hybrid Allocator + FlashInfer (#21412)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
@@ -70,8 +70,8 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
     full_cudagraph_supported: ClassVar[bool] = False
 
     @abstractmethod
-    def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig,
-                 device: torch.device):
+    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
+                 vllm_config: VllmConfig, device: torch.device):
         self.kv_cache_spec = kv_cache_spec
 
     @abstractmethod
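The constructor now receives the names of the layers a builder is responsible for, so a builder created for one KV-cache group only sees its own layers. A minimal sketch of a subclass picking up the new signature; the class name, the stored `layer_names` attribute, and the import paths are assumptions based on the vLLM v1 layout, not part of this diff:

import torch

from vllm.config import VllmConfig
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
from vllm.v1.kv_cache_interface import AttentionSpec


class MyBackendMetadataBuilder(AttentionMetadataBuilder):
    """Hypothetical builder adapting to the new constructor signature."""

    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                 vllm_config: VllmConfig, device: torch.device):
        self.kv_cache_spec = kv_cache_spec
        # New: the layers this builder covers; with the hybrid allocator
        # there can be one builder per KV-cache group.
        self.layer_names = layer_names
        self.vllm_config = vllm_config
        self.device = device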
@@ -164,14 +164,14 @@ class PerLayerParameters:
 
 
 def get_per_layer_parameters(
-        vllm_config: VllmConfig,
+        vllm_config: VllmConfig, layer_names: list[str],
         cls_: type['AttentionImpl']) -> dict[str, PerLayerParameters]:
     """
-    Scan all attention layers and determine some hyperparameters
+    Scan layers in `layer_names` and determine some hyperparameters
     to use during `plan`.
     """
 
-    layers = get_layers_from_vllm_config(vllm_config, Attention)
+    layers = get_layers_from_vllm_config(vllm_config, Attention, layer_names)
     per_layer_params: dict[str, PerLayerParameters] = {}
 
     for key, layer in layers.items():
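The extra `layer_names` argument lets the FlashInfer backend scan only the layers of the KV-cache group it is building metadata for, which matters once the hybrid allocator splits sliding-window and full-attention layers into separate groups. A self-contained sketch of the name-based filtering this enables; the helper below is a stand-in for illustration only, not vLLM's `get_layers_from_vllm_config`:

# Stand-in for the filtering behaviour: only layers whose names appear
# in `layer_names` are scanned for per-layer hyperparameters.
def filter_layers(all_layers: dict[str, object],
                  layer_names: list[str]) -> dict[str, object]:
    wanted = set(layer_names)
    return {name: layer for name, layer in all_layers.items()
            if name in wanted}


# Hypothetical layer names for a hybrid model.
all_layers = {"model.layers.0.self_attn.attn": "full-attention layer",
              "model.layers.1.self_attn.attn": "sliding-window layer"}
print(filter_layers(all_layers, ["model.layers.0.self_attn.attn"]))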
@@ -208,6 +208,10 @@ def infer_global_hyperparameters(
     param_sets = list(per_layer_params.values())
     global_params = param_sets[0]
     for params in param_sets:
+        if params.window_left != global_params.window_left:
+            raise ValueError(
+                "Window left is not the same for all layers. One potential fix "
+                "is to set disable_sliding_window=True")
         assert params == global_params, (
             "FlashInfer backend currently only supports models in which all "
             "layers share the same values for the following hyperparameters: "
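The added `window_left` check turns the generic assertion into a targeted error for the common failure mode: a model mixing sliding-window and full-attention layers yields differing `window_left` values, and the suggested workaround is `disable_sliding_window=True`. A self-contained sketch of the check, using a simplified stand-in for `PerLayerParameters`:

from dataclasses import dataclass


@dataclass
class Params:
    """Simplified stand-in for vLLM's PerLayerParameters."""
    window_left: int
    logits_soft_cap: float
    sm_scale: float


def infer_global(per_layer_params: dict[str, Params]) -> Params:
    param_sets = list(per_layer_params.values())
    global_params = param_sets[0]
    for params in param_sets:
        # Mirrors the check added in the hunk above.
        if params.window_left != global_params.window_left:
            raise ValueError(
                "Window left is not the same for all layers. One potential "
                "fix is to set disable_sliding_window=True")
        assert params == global_params, (
            "all layers must share the same hyperparameters")
    return global_params


# A hybrid model mixing full-attention (window_left = -1) and
# sliding-window layers now raises the targeted error instead of
# failing the bare assertion.
layers = {
    "layer.0": Params(window_left=-1, logits_soft_cap=0.0, sm_scale=0.1),
    "layer.1": Params(window_left=1023, logits_soft_cap=0.0, sm_scale=0.1),
}
try:
    infer_global(layers)
except ValueError as e:
    print(e)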