[Hybrid Allocator] Support Pipeline Parallel (#23974)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-14 15:55:17 -07:00
parent 90f3f7d73e
commit 8e5cdcda4e
7 changed files with 472 additions and 235 deletions
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -5,7 +5,7 @@
 import os
 from collections import defaultdict, deque
 from collections.abc import Iterable, Sequence
-from dataclasses import astuple, dataclass
+from dataclasses import dataclass
 from typing import Any, Callable, NewType, Optional, Union

 from vllm import envs
@@ -811,59 +811,21 @@ def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int:
    return page_sizes.pop()


-def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
-                                      kv_cache_spec: dict[str, KVCacheSpec],
-                                      available_memory: int) -> KVCacheConfig:
+def _get_kv_cache_groups_uniform_type(
+        kv_cache_specs: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]:
    """
    Generates the KV cache configuration for a model with one type of KV cache.
    Divide the available memory equally among all layers.

    Args:
-        vllm_config: The global VllmConfig
-        kv_cache_spec: The kv cache spec of each attention layer in the model
-        available_memory: Memory available for KV cache in bytes.
+        kv_cache_specs: The kv cache spec of each attention layer in the model

    Returns:
-        The generated KVCacheConfig
+        The generated KVCacheGroupSpecs
    """

-    page_size = get_uniform_page_size(kv_cache_spec)
-    num_blocks = get_num_blocks(vllm_config, len(kv_cache_spec),
-                                available_memory, page_size)
-
-    per_layer_size = page_size * num_blocks
-    # All layers have the same KV cache spec, so we create one kv cache group
-    # for all layers.
-    grouped_layer_names = [list(kv_cache_spec.keys())]
-
-    # Each layer uses a separate Tensor to store its KV cache.
-    kv_cache_tensors = [
-        KVCacheTensor(size=per_layer_size, shared_by=[layer_name])
-        for layer_name in kv_cache_spec
-    ]
-
-    kv_cache_config = KVCacheConfig(
-        num_blocks=num_blocks,
-        kv_cache_tensors=kv_cache_tensors,
-        kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec,
-                                                    grouped_layer_names),
-    )
-
-    num_tokens = num_blocks * vllm_config.cache_config.block_size
-    if vllm_config.parallel_config.decode_context_parallel_size > 1:
-        num_tokens *= vllm_config.parallel_config.decode_context_parallel_size
-        logger.info(
-            "Multiplying the GPU KV cache size by the dcp_world_size %d.",
-            vllm_config.parallel_config.decode_context_parallel_size)
-
-    num_tokens_str = f"{num_tokens:,}"
-    logger.info("GPU KV cache size: %s tokens", num_tokens_str)
-    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
-    max_concurrency = get_max_concurrency_for_kv_cache_config(
-        vllm_config, kv_cache_config)
-    logger.info("Maximum concurrency for %s tokens per request: %.2fx",
-                max_model_len_str, max_concurrency)
-    return kv_cache_config
+    return create_kv_cache_group_specs(kv_cache_specs,
+                                       [list(kv_cache_specs.keys())])


 def is_kv_cache_page_size_uniform(
@@ -888,11 +850,10 @@ def is_kv_cache_type_attention_free(
    return not kv_cache_spec


-def _get_kv_cache_config_uniform_page_size(
-        vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
-        available_memory: int) -> KVCacheConfig:
+def _get_kv_cache_groups_uniform_page_size(
+        kv_cache_spec: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]:
    """
-    Generates the KV cache configuration for hybrid models with multiple 
+    Generates the KV cache groups for hybrid models with multiple 
    attention types but still with a uniform page size (physical memory per 
    block per layer) for all layers.

@@ -949,11 +910,9 @@ def _get_kv_cache_config_uniform_page_size(
    memory per block is the same for all groups.

    Args:
-        vllm_config: The global VllmConfig
        kv_cache_spec: The KVCacheSpec of each attention layer in the model
-        available_memory: Memory available for KV cache in bytes.
    Returns:
-        The generated KVCacheConfig
+        The generated KVCacheGroupSpecs
    """
    # Group all layers by kv_cache_spec.
    # E.g., 2 full attention layers and 3 sliding window attention layers,
@@ -966,7 +925,7 @@ def _get_kv_cache_config_uniform_page_size(
    # group identical. Add padding to the last group of each type if necessary.
    # E.g., (full.0, full.1), (sw.0, sw.1, sw.2)
    # split to 3 groups with 2 layers each:
-    # (full.0, full.1), (sw.0, sw.1), (sw.2, padding).
+    # (full.0, full.1), (sw.0, sw.2), (sw.1, padding).
    # FIXME(Chen): At the moment of writing this code (2025-06-02), all
    # open-source hybrid model follows a n:1 pattern between different attention
    # types (e.g., Gemma3 5:1 between sw and full, LLaMA4 3:1 between local and
@@ -984,19 +943,60 @@ def _get_kv_cache_config_uniform_page_size(
                num_padding_layers,
                num_padding_layers / len(layers) * 100,
            )
-        for i in range(0, len(layers), group_size):
-            grouped_layers.append(layers[i:i + group_size])
-    kv_cache_groups = create_kv_cache_group_specs(kv_cache_spec,
-                                                  grouped_layers)
+        num_groups = cdiv(len(layers), group_size)
+        # In PP case, say if we have
+        # - stage 0: full.0, sw.0, sw.1
+        # - stage 1: full.1, sw.2, sw.3
+        # We should have 3 groups: (full.0, full.1), (sw.0, sw.2), (sw.1, sw.3)
+        # It can't be (full.0, full.1), (sw.0, sw.1), (sw.2, sw.3) because
+        # the 3 groups in stage 0 will be (full.0), (sw.0, sw.1), (empty group)
+        # and it will be padded to (full.0, padding), (sw.0, sw.1),
+        # (padding, padding) to ensure the number of layers in each group is
+        # the same and will cause memory waste.
+        # To avoid this, we assign layers[i::num_groups] to the i-th group
+        # instead of layers[i * group_size: (i + 1) * group_size]
+        for i in range(num_groups):
+            grouped_layers.append(layers[i::num_groups])
+    return create_kv_cache_group_specs(kv_cache_spec, grouped_layers)
+
+
+def get_kv_cache_config_from_groups(vllm_config: VllmConfig,
+                                    kv_cache_groups: list[KVCacheGroupSpec],
+                                    kv_cache_specs: dict[str, KVCacheSpec],
+                                    available_memory: int) -> KVCacheConfig:
+    """
+    Generate the KV cache configuration from the KV cache groups and spec
+    of each layer.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_groups: The KV cache groups
+        kv_cache_specs: The KV cache spec of each attention layer in the model
+        available_memory: Memory available for KV cache in bytes
+    Returns:
+        The generated KVCacheConfig
+    """
+    if len(kv_cache_groups) == 0:
+        # Attention free models do not have KV cache.
+        # Return num_blocks=1 as BlockPool always needs a null_block.
+        return KVCacheConfig(
+            num_blocks=1,
+            kv_cache_tensors=[],
+            kv_cache_groups=kv_cache_groups,
+        )

    # Determine how model runners should initialize the KV cache tensors.
    # We will have group_size memory pools, each is shared by one layer from
    # each group. As layers of different groups have different block table,
    # they will use different parts of the shared Tensor.
-    # The memory layout in the example will be:
-    # full.0, sw.0, sw.2: share a Tensor with size=available_memory//2
-    # full.1, sw.1: share another Tensor with size=available_memory//2
-    page_size = get_uniform_page_size(kv_cache_spec)
+    # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2),
+    # (sw.1, padding) will be: (group_size = 2)
+    # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2
+    # full.1, sw.2: share another Tensor with size=available_memory//2
+    group_size = max(len(group.layer_names) for group in kv_cache_groups)
+
+    page_size = get_uniform_page_size(kv_cache_specs)
+    assert group_size > 0, "group_size must be greater than 0"
    num_blocks = get_num_blocks(vllm_config, group_size, available_memory,
                                page_size)
    per_memory_pool_size = page_size * num_blocks
@@ -1004,8 +1004,8 @@ def _get_kv_cache_config_uniform_page_size(
    for i in range(group_size):
        shared_by = []
        for j in range(len(kv_cache_groups)):
-            if i < len(grouped_layers[j]):
-                shared_by.append(grouped_layers[j][i])
+            if i < len(kv_cache_groups[j].layer_names):
+                shared_by.append(kv_cache_groups[j].layer_names[i])
        kv_cache_tensors.append(
            KVCacheTensor(size=per_memory_pool_size, shared_by=shared_by))

@@ -1019,7 +1019,12 @@ def _get_kv_cache_config_uniform_page_size(
        [group.kv_cache_spec.block_size for group in kv_cache_groups])

    # Print the KV cache size and maximum concurrency.
-    num_tokens = num_blocks // len(grouped_layers) * min_block_size
+    num_tokens = num_blocks // len(kv_cache_groups) * min_block_size
+    if vllm_config.parallel_config.decode_context_parallel_size > 1:
+        num_tokens *= vllm_config.parallel_config.decode_context_parallel_size
+        logger.info(
+            "Multiplying the GPU KV cache size by the dcp_world_size %d.",
+            vllm_config.parallel_config.decode_context_parallel_size)
    num_tokens_str = f"{num_tokens:,}"
    logger.info("GPU KV cache size: %s tokens", num_tokens_str)
    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
@@ -1030,10 +1035,6 @@ def _get_kv_cache_config_uniform_page_size(
    return kv_cache_config


-def _get_kv_cache_config_attention_free() -> KVCacheConfig:
-    return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])
-
-
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
    """
    This function tries to convert the KV cache specs to one type if the model
@@ -1087,72 +1088,112 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
                         "convert the KV cache specs to one unified type.")


-def get_kv_cache_config(
-    vllm_config: VllmConfig,
-    kv_cache_spec: dict[str, KVCacheSpec],
-    available_memory: int,
-) -> KVCacheConfig:
+def get_kv_cache_groups(
+        vllm_config: VllmConfig,
+        kv_cache_spec: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]:
    """
-    Generates the KV cache configuration for a model.
+    Split the layers in the model into groups with the same KV cache spec.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_spec: The kv cache spec of each attention layer in the model
-        available_memory: Memory available for KV cache in bytes.

    Returns:
-        The generated KVCacheConfigs
+        The generated KVCacheGroups
    """
-    check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
    if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
        unify_hybrid_kv_cache_specs(kv_cache_spec)

    if is_kv_cache_type_attention_free(kv_cache_spec):
-        # This returns a kv_cache config with 0 kv_cache groups and 1 block
-        # to allow for the KVCache manager to handle attention free models.
-        return _get_kv_cache_config_attention_free()
+        # This returns an empty list to allow for the KVCacheManager to handle
+        # attention free models.
+        return []
    elif is_kv_cache_type_uniform(kv_cache_spec):
        # KV cache of all layers are the same, which is true for
        # most models. Allocate the same amount of memory for
        # each layer.
-        return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec,
-                                                 available_memory)
+        return _get_kv_cache_groups_uniform_type(kv_cache_spec)
    elif is_kv_cache_page_size_uniform(kv_cache_spec):
        # Model contains multiple attention types, but KV cache of all layers
        # have the same physical memory per block per layer. Split the layers
        # into groups with the same number of layers, and thus same total page
        # size.
-        return _get_kv_cache_config_uniform_page_size(vllm_config,
-                                                      kv_cache_spec,
-                                                      available_memory)
+        return _get_kv_cache_groups_uniform_page_size(kv_cache_spec)

    raise NotImplementedError


-def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]):
+def get_kv_cache_configs(vllm_config: VllmConfig,
+                         kv_cache_specs: list[dict[str, KVCacheSpec]],
+                         available_memory: list[int]) -> list[KVCacheConfig]:
    """
-    Make the KV cache configurations for each worker consistent, so that all
-    workers can be controlled by the same KVCacheManager.
-    This function verifies that the layer group of each worker are the same,
-    and changes the num_blocks of each worker to the smallest among all workers.
+    Generates the KV cache configurations for a model. 
+    Since we use a shared centralized controller for all workers, we need the
+    `kv_cache_config` to be consistent across all workers to make sure
+    the KV cache allocation can be applied to all workers. However, different
+    workers may have different memory available, and different type of layers
+    (when pipeline parallel is enabled). To handle the difference between
+    workers, the current implementation is:
+    1. Merge the KV cache specs of all workers to get the KVCacheSpecs for
+       the whole model.
+    2. Generate the KV cache groups based on the layer ratio of the whole model.
+    3. Generate the KV cache configs for each worker based on the KV cache
+       grouping strategy. (This is reasonable because the layer ratio of
+       different PP stages are similar.)
+    4. Change the num_blocks of each worker to the smallest among all workers.

    Args:
-        kv_cache_configs: The KV cache configurations for each worker. Will be
-            in-place modified to make them consistent.
+        vllm_config: The global VllmConfig
+        kv_cache_specs: List of dict[layer_name, KVCacheSpec] for each worker.
+        available_memory: Memory available for KV cache in bytes for each 
+        worker.
+
+    Returns:
+        The generated KVCacheConfigs for each worker.
    """

-    # Sort the kv cache groups by their KV cache spec.
-    # This can avoid the inconsistency caused by the order of groups.
-    for kv_cache_config in kv_cache_configs:
-        kv_cache_config.kv_cache_groups.sort(key=lambda x: (type(
-            x.kv_cache_spec).__name__, astuple(x.kv_cache_spec)))
+    # Check if the available memory is enough for each worker.
+    for kv_cache_spec_one_worker, available_memory_one_worker in zip(
+            kv_cache_specs, available_memory):
+        check_enough_kv_cache_memory(vllm_config, kv_cache_spec_one_worker,
+                                     available_memory_one_worker)

-    # Verify that the groups of each rank are the same.
-    for kv_cache_config in kv_cache_configs[1:]:
-        for group_rank_0, group_rank_i in zip(
-                kv_cache_configs[0].kv_cache_groups,
-                kv_cache_config.kv_cache_groups):
-            assert group_rank_0.kv_cache_spec == group_rank_i.kv_cache_spec
+    # Merge the KV cache specs of all workers. Different PP stages may have
+    # different layer names, and different TP ranks of the same PP stage should
+    # have the same KV cache spec.
+    merged_kv_cache_specs: dict[str, KVCacheSpec] = {}
+    for kv_cache_spec_one_worker in kv_cache_specs:
+        for layer_name, layer_spec in kv_cache_spec_one_worker.items():
+            if layer_name not in merged_kv_cache_specs:
+                merged_kv_cache_specs[layer_name] = layer_spec
+            else:
+                assert merged_kv_cache_specs[layer_name] == layer_spec, (
+                    "The KV cache specs for the same layer are different "
+                    "across workers. This is not supported yet.")
+    global_kv_cache_groups = get_kv_cache_groups(vllm_config,
+                                                 merged_kv_cache_specs)
+
+    kv_cache_configs: list[KVCacheConfig] = []
+    for kv_cache_spec_one_worker, available_memory_one_worker in zip(
+            kv_cache_specs, available_memory):
+        kv_cache_groups_one_worker: list[KVCacheGroupSpec] = []
+        for group in global_kv_cache_groups:
+            group_layer_names_one_worker = [
+                layer_name for layer_name in group.layer_names
+                if layer_name in kv_cache_spec_one_worker
+            ]
+            kv_cache_groups_one_worker.append(
+                KVCacheGroupSpec(group_layer_names_one_worker,
+                                 group.kv_cache_spec))
+        assert sum(
+            len(group.layer_names) for group in
+            kv_cache_groups_one_worker) == len(kv_cache_spec_one_worker), (
+                "Some layers are not assigned to any group.")
+        kv_cache_configs.append(
+            get_kv_cache_config_from_groups(vllm_config,
+                                            kv_cache_groups_one_worker,
+                                            kv_cache_spec_one_worker,
+                                            available_memory_one_worker))

    # Change the num_blocks of each rank to the smallest among all ranks. We
    # do not need to shrink the tensor size because it is valid to only use the
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -29,10 +29,9 @@ from vllm.transformers_utils.config import (
    maybe_register_config_serialize_by_value)
 from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket,
                        resolve_obj_by_qualname, set_process_title)
-from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_config,
+from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_configs,
                                         get_request_block_hasher,
-                                         init_none_hash,
-                                         unify_kv_cache_configs)
+                                         init_none_hash)
 from vllm.v1.core.sched.interface import SchedulerInterface
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
@@ -191,18 +190,9 @@ class EngineCore:
            available_gpu_memory = [0] * len(kv_cache_specs)

        assert len(kv_cache_specs) == len(available_gpu_memory)
-        # Get the kv cache tensor size
-        kv_cache_configs = [
-            get_kv_cache_config(vllm_config, kv_cache_spec_one_worker,
-                                available_gpu_memory_one_worker)
-            for kv_cache_spec_one_worker, available_gpu_memory_one_worker in
-            zip(kv_cache_specs, available_gpu_memory)
-        ]

-        # Since we use a shared centralized controller, we need the
-        # `kv_cache_config` to be consistent across all workers to make sure
-        # all the memory operators can be applied to all workers.
-        unify_kv_cache_configs(kv_cache_configs)
+        kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs,
+                                                available_gpu_memory)

        # All workers have the same kv_cache_config except layer names, so use
        # an arbitrary one to initialize the scheduler.