[Hybrid Allocator] Support Pipeline Parallel (#23974)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
This commit is contained in:
@@ -5,7 +5,7 @@
|
||||
import os
|
||||
from collections import defaultdict, deque
|
||||
from collections.abc import Iterable, Sequence
|
||||
from dataclasses import astuple, dataclass
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, NewType, Optional, Union
|
||||
|
||||
from vllm import envs
|
||||
@@ -811,59 +811,21 @@ def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int:
|
||||
return page_sizes.pop()
|
||||
|
||||
|
||||
def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
|
||||
kv_cache_spec: dict[str, KVCacheSpec],
|
||||
available_memory: int) -> KVCacheConfig:
|
||||
def _get_kv_cache_groups_uniform_type(
|
||||
kv_cache_specs: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]:
|
||||
"""
|
||||
Generates the KV cache configuration for a model with one type of KV cache.
|
||||
Divide the available memory equally among all layers.
|
||||
|
||||
Args:
|
||||
vllm_config: The global VllmConfig
|
||||
kv_cache_spec: The kv cache spec of each attention layer in the model
|
||||
available_memory: Memory available for KV cache in bytes.
|
||||
kv_cache_specs: The kv cache spec of each attention layer in the model
|
||||
|
||||
Returns:
|
||||
The generated KVCacheConfig
|
||||
The generated KVCacheGroupSpecs
|
||||
"""
|
||||
|
||||
page_size = get_uniform_page_size(kv_cache_spec)
|
||||
num_blocks = get_num_blocks(vllm_config, len(kv_cache_spec),
|
||||
available_memory, page_size)
|
||||
|
||||
per_layer_size = page_size * num_blocks
|
||||
# All layers have the same KV cache spec, so we create one kv cache group
|
||||
# for all layers.
|
||||
grouped_layer_names = [list(kv_cache_spec.keys())]
|
||||
|
||||
# Each layer uses a separate Tensor to store its KV cache.
|
||||
kv_cache_tensors = [
|
||||
KVCacheTensor(size=per_layer_size, shared_by=[layer_name])
|
||||
for layer_name in kv_cache_spec
|
||||
]
|
||||
|
||||
kv_cache_config = KVCacheConfig(
|
||||
num_blocks=num_blocks,
|
||||
kv_cache_tensors=kv_cache_tensors,
|
||||
kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec,
|
||||
grouped_layer_names),
|
||||
)
|
||||
|
||||
num_tokens = num_blocks * vllm_config.cache_config.block_size
|
||||
if vllm_config.parallel_config.decode_context_parallel_size > 1:
|
||||
num_tokens *= vllm_config.parallel_config.decode_context_parallel_size
|
||||
logger.info(
|
||||
"Multiplying the GPU KV cache size by the dcp_world_size %d.",
|
||||
vllm_config.parallel_config.decode_context_parallel_size)
|
||||
|
||||
num_tokens_str = f"{num_tokens:,}"
|
||||
logger.info("GPU KV cache size: %s tokens", num_tokens_str)
|
||||
max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
|
||||
max_concurrency = get_max_concurrency_for_kv_cache_config(
|
||||
vllm_config, kv_cache_config)
|
||||
logger.info("Maximum concurrency for %s tokens per request: %.2fx",
|
||||
max_model_len_str, max_concurrency)
|
||||
return kv_cache_config
|
||||
return create_kv_cache_group_specs(kv_cache_specs,
|
||||
[list(kv_cache_specs.keys())])
|
||||
|
||||
|
||||
def is_kv_cache_page_size_uniform(
|
||||
@@ -888,11 +850,10 @@ def is_kv_cache_type_attention_free(
|
||||
return not kv_cache_spec
|
||||
|
||||
|
||||
def _get_kv_cache_config_uniform_page_size(
|
||||
vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
|
||||
available_memory: int) -> KVCacheConfig:
|
||||
def _get_kv_cache_groups_uniform_page_size(
|
||||
kv_cache_spec: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]:
|
||||
"""
|
||||
Generates the KV cache configuration for hybrid models with multiple
|
||||
Generates the KV cache groups for hybrid models with multiple
|
||||
attention types but still with a uniform page size (physical memory per
|
||||
block per layer) for all layers.
|
||||
|
||||
@@ -949,11 +910,9 @@ def _get_kv_cache_config_uniform_page_size(
|
||||
memory per block is the same for all groups.
|
||||
|
||||
Args:
|
||||
vllm_config: The global VllmConfig
|
||||
kv_cache_spec: The KVCacheSpec of each attention layer in the model
|
||||
available_memory: Memory available for KV cache in bytes.
|
||||
Returns:
|
||||
The generated KVCacheConfig
|
||||
The generated KVCacheGroupSpecs
|
||||
"""
|
||||
# Group all layers by kv_cache_spec.
|
||||
# E.g., 2 full attention layers and 3 sliding window attention layers,
|
||||
@@ -966,7 +925,7 @@ def _get_kv_cache_config_uniform_page_size(
|
||||
# group identical. Add padding to the last group of each type if necessary.
|
||||
# E.g., (full.0, full.1), (sw.0, sw.1, sw.2)
|
||||
# split to 3 groups with 2 layers each:
|
||||
# (full.0, full.1), (sw.0, sw.1), (sw.2, padding).
|
||||
# (full.0, full.1), (sw.0, sw.2), (sw.1, padding).
|
||||
# FIXME(Chen): At the moment of writing this code (2025-06-02), all
|
||||
# open-source hybrid model follows a n:1 pattern between different attention
|
||||
# types (e.g., Gemma3 5:1 between sw and full, LLaMA4 3:1 between local and
|
||||
@@ -984,19 +943,60 @@ def _get_kv_cache_config_uniform_page_size(
|
||||
num_padding_layers,
|
||||
num_padding_layers / len(layers) * 100,
|
||||
)
|
||||
for i in range(0, len(layers), group_size):
|
||||
grouped_layers.append(layers[i:i + group_size])
|
||||
kv_cache_groups = create_kv_cache_group_specs(kv_cache_spec,
|
||||
grouped_layers)
|
||||
num_groups = cdiv(len(layers), group_size)
|
||||
# In PP case, say if we have
|
||||
# - stage 0: full.0, sw.0, sw.1
|
||||
# - stage 1: full.1, sw.2, sw.3
|
||||
# We should have 3 groups: (full.0, full.1), (sw.0, sw.2), (sw.1, sw.3)
|
||||
# It can't be (full.0, full.1), (sw.0, sw.1), (sw.2, sw.3) because
|
||||
# the 3 groups in stage 0 will be (full.0), (sw.0, sw.1), (empty group)
|
||||
# and it will be padded to (full.0, padding), (sw.0, sw.1),
|
||||
# (padding, padding) to ensure the number of layers in each group is
|
||||
# the same and will cause memory waste.
|
||||
# To avoid this, we assign layers[i::num_groups] to the i-th group
|
||||
# instead of layers[i * group_size: (i + 1) * group_size]
|
||||
for i in range(num_groups):
|
||||
grouped_layers.append(layers[i::num_groups])
|
||||
return create_kv_cache_group_specs(kv_cache_spec, grouped_layers)
|
||||
|
||||
|
||||
def get_kv_cache_config_from_groups(vllm_config: VllmConfig,
|
||||
kv_cache_groups: list[KVCacheGroupSpec],
|
||||
kv_cache_specs: dict[str, KVCacheSpec],
|
||||
available_memory: int) -> KVCacheConfig:
|
||||
"""
|
||||
Generate the KV cache configuration from the KV cache groups and spec
|
||||
of each layer.
|
||||
|
||||
Args:
|
||||
vllm_config: The global VllmConfig
|
||||
kv_cache_groups: The KV cache groups
|
||||
kv_cache_specs: The KV cache spec of each attention layer in the model
|
||||
available_memory: Memory available for KV cache in bytes
|
||||
Returns:
|
||||
The generated KVCacheConfig
|
||||
"""
|
||||
if len(kv_cache_groups) == 0:
|
||||
# Attention free models do not have KV cache.
|
||||
# Return num_blocks=1 as BlockPool always needs a null_block.
|
||||
return KVCacheConfig(
|
||||
num_blocks=1,
|
||||
kv_cache_tensors=[],
|
||||
kv_cache_groups=kv_cache_groups,
|
||||
)
|
||||
|
||||
# Determine how model runners should initialize the KV cache tensors.
|
||||
# We will have group_size memory pools, each is shared by one layer from
|
||||
# each group. As layers of different groups have different block table,
|
||||
# they will use different parts of the shared Tensor.
|
||||
# The memory layout in the example will be:
|
||||
# full.0, sw.0, sw.2: share a Tensor with size=available_memory//2
|
||||
# full.1, sw.1: share another Tensor with size=available_memory//2
|
||||
page_size = get_uniform_page_size(kv_cache_spec)
|
||||
# The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2),
|
||||
# (sw.1, padding) will be: (group_size = 2)
|
||||
# full.0, sw.0, sw.1: share a Tensor with size=available_memory//2
|
||||
# full.1, sw.2: share another Tensor with size=available_memory//2
|
||||
group_size = max(len(group.layer_names) for group in kv_cache_groups)
|
||||
|
||||
page_size = get_uniform_page_size(kv_cache_specs)
|
||||
assert group_size > 0, "group_size must be greater than 0"
|
||||
num_blocks = get_num_blocks(vllm_config, group_size, available_memory,
|
||||
page_size)
|
||||
per_memory_pool_size = page_size * num_blocks
|
||||
@@ -1004,8 +1004,8 @@ def _get_kv_cache_config_uniform_page_size(
|
||||
for i in range(group_size):
|
||||
shared_by = []
|
||||
for j in range(len(kv_cache_groups)):
|
||||
if i < len(grouped_layers[j]):
|
||||
shared_by.append(grouped_layers[j][i])
|
||||
if i < len(kv_cache_groups[j].layer_names):
|
||||
shared_by.append(kv_cache_groups[j].layer_names[i])
|
||||
kv_cache_tensors.append(
|
||||
KVCacheTensor(size=per_memory_pool_size, shared_by=shared_by))
|
||||
|
||||
@@ -1019,7 +1019,12 @@ def _get_kv_cache_config_uniform_page_size(
|
||||
[group.kv_cache_spec.block_size for group in kv_cache_groups])
|
||||
|
||||
# Print the KV cache size and maximum concurrency.
|
||||
num_tokens = num_blocks // len(grouped_layers) * min_block_size
|
||||
num_tokens = num_blocks // len(kv_cache_groups) * min_block_size
|
||||
if vllm_config.parallel_config.decode_context_parallel_size > 1:
|
||||
num_tokens *= vllm_config.parallel_config.decode_context_parallel_size
|
||||
logger.info(
|
||||
"Multiplying the GPU KV cache size by the dcp_world_size %d.",
|
||||
vllm_config.parallel_config.decode_context_parallel_size)
|
||||
num_tokens_str = f"{num_tokens:,}"
|
||||
logger.info("GPU KV cache size: %s tokens", num_tokens_str)
|
||||
max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
|
||||
@@ -1030,10 +1035,6 @@ def _get_kv_cache_config_uniform_page_size(
|
||||
return kv_cache_config
|
||||
|
||||
|
||||
def _get_kv_cache_config_attention_free() -> KVCacheConfig:
|
||||
return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])
|
||||
|
||||
|
||||
def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
|
||||
"""
|
||||
This function tries to convert the KV cache specs to one type if the model
|
||||
@@ -1087,72 +1088,112 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
|
||||
"convert the KV cache specs to one unified type.")
|
||||
|
||||
|
||||
def get_kv_cache_config(
|
||||
vllm_config: VllmConfig,
|
||||
kv_cache_spec: dict[str, KVCacheSpec],
|
||||
available_memory: int,
|
||||
) -> KVCacheConfig:
|
||||
def get_kv_cache_groups(
|
||||
vllm_config: VllmConfig,
|
||||
kv_cache_spec: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]:
|
||||
"""
|
||||
Generates the KV cache configuration for a model.
|
||||
Split the layers in the model into groups with the same KV cache spec.
|
||||
|
||||
Args:
|
||||
vllm_config: The global VllmConfig
|
||||
kv_cache_spec: The kv cache spec of each attention layer in the model
|
||||
available_memory: Memory available for KV cache in bytes.
|
||||
|
||||
Returns:
|
||||
The generated KVCacheConfigs
|
||||
The generated KVCacheGroups
|
||||
"""
|
||||
check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
|
||||
if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
|
||||
unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
|
||||
if is_kv_cache_type_attention_free(kv_cache_spec):
|
||||
# This returns a kv_cache config with 0 kv_cache groups and 1 block
|
||||
# to allow for the KVCache manager to handle attention free models.
|
||||
return _get_kv_cache_config_attention_free()
|
||||
# This returns an empty list to allow for the KVCacheManager to handle
|
||||
# attention free models.
|
||||
return []
|
||||
elif is_kv_cache_type_uniform(kv_cache_spec):
|
||||
# KV cache of all layers are the same, which is true for
|
||||
# most models. Allocate the same amount of memory for
|
||||
# each layer.
|
||||
return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec,
|
||||
available_memory)
|
||||
return _get_kv_cache_groups_uniform_type(kv_cache_spec)
|
||||
elif is_kv_cache_page_size_uniform(kv_cache_spec):
|
||||
# Model contains multiple attention types, but KV cache of all layers
|
||||
# have the same physical memory per block per layer. Split the layers
|
||||
# into groups with the same number of layers, and thus same total page
|
||||
# size.
|
||||
return _get_kv_cache_config_uniform_page_size(vllm_config,
|
||||
kv_cache_spec,
|
||||
available_memory)
|
||||
return _get_kv_cache_groups_uniform_page_size(kv_cache_spec)
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]):
|
||||
def get_kv_cache_configs(vllm_config: VllmConfig,
|
||||
kv_cache_specs: list[dict[str, KVCacheSpec]],
|
||||
available_memory: list[int]) -> list[KVCacheConfig]:
|
||||
"""
|
||||
Make the KV cache configurations for each worker consistent, so that all
|
||||
workers can be controlled by the same KVCacheManager.
|
||||
This function verifies that the layer group of each worker are the same,
|
||||
and changes the num_blocks of each worker to the smallest among all workers.
|
||||
Generates the KV cache configurations for a model.
|
||||
Since we use a shared centralized controller for all workers, we need the
|
||||
`kv_cache_config` to be consistent across all workers to make sure
|
||||
the KV cache allocation can be applied to all workers. However, different
|
||||
workers may have different memory available, and different type of layers
|
||||
(when pipeline parallel is enabled). To handle the difference between
|
||||
workers, the current implementation is:
|
||||
1. Merge the KV cache specs of all workers to get the KVCacheSpecs for
|
||||
the whole model.
|
||||
2. Generate the KV cache groups based on the layer ratio of the whole model.
|
||||
3. Generate the KV cache configs for each worker based on the KV cache
|
||||
grouping strategy. (This is reasonable because the layer ratio of
|
||||
different PP stages are similar.)
|
||||
4. Change the num_blocks of each worker to the smallest among all workers.
|
||||
|
||||
Args:
|
||||
kv_cache_configs: The KV cache configurations for each worker. Will be
|
||||
in-place modified to make them consistent.
|
||||
vllm_config: The global VllmConfig
|
||||
kv_cache_specs: List of dict[layer_name, KVCacheSpec] for each worker.
|
||||
available_memory: Memory available for KV cache in bytes for each
|
||||
worker.
|
||||
|
||||
Returns:
|
||||
The generated KVCacheConfigs for each worker.
|
||||
"""
|
||||
|
||||
# Sort the kv cache groups by their KV cache spec.
|
||||
# This can avoid the inconsistency caused by the order of groups.
|
||||
for kv_cache_config in kv_cache_configs:
|
||||
kv_cache_config.kv_cache_groups.sort(key=lambda x: (type(
|
||||
x.kv_cache_spec).__name__, astuple(x.kv_cache_spec)))
|
||||
# Check if the available memory is enough for each worker.
|
||||
for kv_cache_spec_one_worker, available_memory_one_worker in zip(
|
||||
kv_cache_specs, available_memory):
|
||||
check_enough_kv_cache_memory(vllm_config, kv_cache_spec_one_worker,
|
||||
available_memory_one_worker)
|
||||
|
||||
# Verify that the groups of each rank are the same.
|
||||
for kv_cache_config in kv_cache_configs[1:]:
|
||||
for group_rank_0, group_rank_i in zip(
|
||||
kv_cache_configs[0].kv_cache_groups,
|
||||
kv_cache_config.kv_cache_groups):
|
||||
assert group_rank_0.kv_cache_spec == group_rank_i.kv_cache_spec
|
||||
# Merge the KV cache specs of all workers. Different PP stages may have
|
||||
# different layer names, and different TP ranks of the same PP stage should
|
||||
# have the same KV cache spec.
|
||||
merged_kv_cache_specs: dict[str, KVCacheSpec] = {}
|
||||
for kv_cache_spec_one_worker in kv_cache_specs:
|
||||
for layer_name, layer_spec in kv_cache_spec_one_worker.items():
|
||||
if layer_name not in merged_kv_cache_specs:
|
||||
merged_kv_cache_specs[layer_name] = layer_spec
|
||||
else:
|
||||
assert merged_kv_cache_specs[layer_name] == layer_spec, (
|
||||
"The KV cache specs for the same layer are different "
|
||||
"across workers. This is not supported yet.")
|
||||
global_kv_cache_groups = get_kv_cache_groups(vllm_config,
|
||||
merged_kv_cache_specs)
|
||||
|
||||
kv_cache_configs: list[KVCacheConfig] = []
|
||||
for kv_cache_spec_one_worker, available_memory_one_worker in zip(
|
||||
kv_cache_specs, available_memory):
|
||||
kv_cache_groups_one_worker: list[KVCacheGroupSpec] = []
|
||||
for group in global_kv_cache_groups:
|
||||
group_layer_names_one_worker = [
|
||||
layer_name for layer_name in group.layer_names
|
||||
if layer_name in kv_cache_spec_one_worker
|
||||
]
|
||||
kv_cache_groups_one_worker.append(
|
||||
KVCacheGroupSpec(group_layer_names_one_worker,
|
||||
group.kv_cache_spec))
|
||||
assert sum(
|
||||
len(group.layer_names) for group in
|
||||
kv_cache_groups_one_worker) == len(kv_cache_spec_one_worker), (
|
||||
"Some layers are not assigned to any group.")
|
||||
kv_cache_configs.append(
|
||||
get_kv_cache_config_from_groups(vllm_config,
|
||||
kv_cache_groups_one_worker,
|
||||
kv_cache_spec_one_worker,
|
||||
available_memory_one_worker))
|
||||
|
||||
# Change the num_blocks of each rank to the smallest among all ranks. We
|
||||
# do not need to shrink the tensor size because it is valid to only use the
|
||||
|
||||
@@ -29,10 +29,9 @@ from vllm.transformers_utils.config import (
|
||||
maybe_register_config_serialize_by_value)
|
||||
from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket,
|
||||
resolve_obj_by_qualname, set_process_title)
|
||||
from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_config,
|
||||
from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_configs,
|
||||
get_request_block_hasher,
|
||||
init_none_hash,
|
||||
unify_kv_cache_configs)
|
||||
init_none_hash)
|
||||
from vllm.v1.core.sched.interface import SchedulerInterface
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
|
||||
@@ -191,18 +190,9 @@ class EngineCore:
|
||||
available_gpu_memory = [0] * len(kv_cache_specs)
|
||||
|
||||
assert len(kv_cache_specs) == len(available_gpu_memory)
|
||||
# Get the kv cache tensor size
|
||||
kv_cache_configs = [
|
||||
get_kv_cache_config(vllm_config, kv_cache_spec_one_worker,
|
||||
available_gpu_memory_one_worker)
|
||||
for kv_cache_spec_one_worker, available_gpu_memory_one_worker in
|
||||
zip(kv_cache_specs, available_gpu_memory)
|
||||
]
|
||||
|
||||
# Since we use a shared centralized controller, we need the
|
||||
# `kv_cache_config` to be consistent across all workers to make sure
|
||||
# all the memory operators can be applied to all workers.
|
||||
unify_kv_cache_configs(kv_cache_configs)
|
||||
kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs,
|
||||
available_gpu_memory)
|
||||
|
||||
# All workers have the same kv_cache_config except layer names, so use
|
||||
# an arbitrary one to initialize the scheduler.
|
||||
|
||||
Reference in New Issue
Block a user