[Feature] Prefill Context Parallel (PCP) basic support (#28718)
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com> Signed-off-by: FENP <yuanyongjie.yyj@antgroup.com> Signed-off-by: LookAround <lixushi@huawei.com> Signed-off-by: Jingchun Gao <gaojingchun1@huawei.com> Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com> Co-authored-by: FENP <yuanyongjie.yyj@antgroup.com> Co-authored-by: LookAround <lixushi@huawei.com> Co-authored-by: Jingchun Gao <gaojingchun1@huawei.com> Co-authored-by: zhenwenqi2024 <zhenwenqi_2022@qq.com> Co-authored-by: Jingchun Gao <63247409+gjc0824@users.noreply.github.com>
This commit is contained in:
@@ -71,6 +71,8 @@ class ParallelConfig:
|
||||
"""Number of pipeline parallel groups."""
|
||||
tensor_parallel_size: int = 1
|
||||
"""Number of tensor parallel groups."""
|
||||
prefill_context_parallel_size: int = 1
|
||||
"""Number of prefill context parallel groups."""
|
||||
data_parallel_size: int = 1
|
||||
"""Number of data parallel groups. MoE layers will be sharded according to
|
||||
the product of the tensor parallel size and data parallel size."""
|
||||
@@ -239,14 +241,25 @@ class ParallelConfig:
|
||||
needs to be divisible by dcp_size."""
|
||||
|
||||
dcp_kv_cache_interleave_size: int = 1
|
||||
"""Interleave size of kv_cache storage while using dcp or cp > 1,
|
||||
store interleave_size tokens on (d)cp i,
|
||||
then store next interleave_size tokens on (d)cp i+1.
|
||||
Interleave_size=1: token-level align, token i is stored on rank i % (d)cp_size.
|
||||
Interleave_size=block_size: block-level align, first fill the block on first rank,
|
||||
token is stored on rank i+1 block j after rank i block j is full.
|
||||
Block_size should be greater than or equal to dcp_kv_cache_interleave_size.
|
||||
Block_size should be divisible by dcp_kv_cache_interleave_size.
|
||||
"""
|
||||
Interleave size of kv_cache storage while using DCP.
|
||||
dcp_kv_cache_interleave_size has been replaced by cp_kv_cache_interleave_size,
|
||||
and will be deprecated when PCP is fully supported.
|
||||
|
||||
"""
|
||||
cp_kv_cache_interleave_size: int = 1
|
||||
"""Interleave size of kv_cache storage while using DCP or PCP.
|
||||
For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
|
||||
and `total_cp_world_size = pcp_world_size * dcp_world_size`.
|
||||
store interleave_size tokens on total_cp_rank i,
|
||||
then store next interleave_size tokens on total_cp_rank i+1.
|
||||
Interleave_size=1: token-level alignment, where token `i` is stored on
|
||||
total_cp_rank `i % total_cp_world_size`.
|
||||
Interleave_size=block_size: block-level alignment, where tokens are
|
||||
first populated to the preceding ranks. Tokens are then stored
|
||||
in (rank i+1, block j) only after (rank i, block j) is fully occupied.
|
||||
Block_size should be greater than or equal to cp_kv_cache_interleave_size.
|
||||
Block_size should be divisible by cp_kv_cache_interleave_size.
|
||||
"""
|
||||
|
||||
_api_process_count: int = Field(default=1, gt=0)
|
||||
@@ -311,6 +324,11 @@ class ParallelConfig:
|
||||
"num_redundant_experts."
|
||||
)
|
||||
|
||||
if self.prefill_context_parallel_size > 1:
|
||||
raise ValueError(
|
||||
"Prefill context parallelism is not fully supported. "
|
||||
"Please set prefill_context_parallel_size to 1."
|
||||
)
|
||||
return self
|
||||
|
||||
@property
|
||||
@@ -529,7 +547,11 @@ class ParallelConfig:
|
||||
)
|
||||
|
||||
# Continue with the rest of the initialization
|
||||
self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size
|
||||
self.world_size = (
|
||||
self.pipeline_parallel_size
|
||||
* self.tensor_parallel_size
|
||||
* self.prefill_context_parallel_size
|
||||
)
|
||||
|
||||
if self.distributed_executor_backend == "external_launcher":
|
||||
logger.info("Using external launcher for distributed inference.")
|
||||
|
||||
@@ -481,6 +481,14 @@ class VllmConfig:
|
||||
"Overriding cudagraph_mode to PIECEWISE."
|
||||
)
|
||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
# prefill context parallel do not support full cudagraphs
|
||||
elif self.parallel_config.prefill_context_parallel_size > 1:
|
||||
logger.warning_once(
|
||||
"Prefill context parallel (PCP) is enabled, which is "
|
||||
"incompatible with full CUDA graphs. "
|
||||
"Overriding cudagraph_mode to PIECEWISE."
|
||||
)
|
||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
elif self.model_config is not None:
|
||||
if self.model_config.pooler_config is not None:
|
||||
logger.warning_once(
|
||||
@@ -610,22 +618,34 @@ class VllmConfig:
|
||||
|
||||
# If DCP, ensure the block size is right.
|
||||
if self.parallel_config.decode_context_parallel_size > 1:
|
||||
if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
|
||||
self.parallel_config.cp_kv_cache_interleave_size
|
||||
!= self.parallel_config.dcp_kv_cache_interleave_size
|
||||
):
|
||||
self.parallel_config.cp_kv_cache_interleave_size = (
|
||||
self.parallel_config.dcp_kv_cache_interleave_size
|
||||
)
|
||||
logger.warning_once(
|
||||
"cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
|
||||
"_interleave_size. And dcp-kv-cache-interleave-size will be "
|
||||
"deprecated when PCP is fully supported."
|
||||
)
|
||||
assert (
|
||||
self.parallel_config.dcp_kv_cache_interleave_size
|
||||
self.parallel_config.cp_kv_cache_interleave_size
|
||||
<= self.cache_config.block_size
|
||||
and self.cache_config.block_size
|
||||
% self.parallel_config.dcp_kv_cache_interleave_size
|
||||
% self.parallel_config.cp_kv_cache_interleave_size
|
||||
== 0
|
||||
), (
|
||||
f"Block_size({self.cache_config.block_size}) should be greater "
|
||||
"than or equal to and divisible by dcp_kv_cache_interleave_size "
|
||||
f"({self.parallel_config.dcp_kv_cache_interleave_size})."
|
||||
"than or equal to and divisible by cp_kv_cache_interleave_size "
|
||||
f"({self.parallel_config.cp_kv_cache_interleave_size})."
|
||||
)
|
||||
|
||||
assert (
|
||||
self.parallel_config.dcp_kv_cache_interleave_size == 1
|
||||
self.parallel_config.cp_kv_cache_interleave_size == 1
|
||||
or self.speculative_config is None
|
||||
), "MTP with dcp_kv_cache_interleave_size > 1 is not supported now."
|
||||
), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."
|
||||
|
||||
# Do this after all the updates to compilation_config.mode
|
||||
if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
|
||||
|
||||
Reference in New Issue
Block a user