[Feature] Prefill Context Parallel (PCP) basic support (#28718)
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
Signed-off-by: FENP <yuanyongjie.yyj@antgroup.com>
Signed-off-by: LookAround <lixushi@huawei.com>
Signed-off-by: Jingchun Gao <gaojingchun1@huawei.com>
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Co-authored-by: FENP <yuanyongjie.yyj@antgroup.com>
Co-authored-by: LookAround <lixushi@huawei.com>
Co-authored-by: Jingchun Gao <gaojingchun1@huawei.com>
Co-authored-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Co-authored-by: Jingchun Gao <63247409+gjc0824@users.noreply.github.com>
This commit is contained in:
@@ -389,8 +389,10 @@ class EngineArgs:
|
||||
nnodes: int = ParallelConfig.nnodes
|
||||
node_rank: int = ParallelConfig.node_rank
|
||||
tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
|
||||
prefill_context_parallel_size: int = ParallelConfig.prefill_context_parallel_size
|
||||
decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size
|
||||
dcp_kv_cache_interleave_size: int = ParallelConfig.dcp_kv_cache_interleave_size
|
||||
cp_kv_cache_interleave_size: int = ParallelConfig.cp_kv_cache_interleave_size
|
||||
data_parallel_size: int = ParallelConfig.data_parallel_size
|
||||
data_parallel_rank: int | None = None
|
||||
data_parallel_start_rank: int | None = None
|
||||
@@ -770,6 +772,15 @@ class EngineArgs:
|
||||
"--dcp-kv-cache-interleave-size",
|
||||
**parallel_kwargs["dcp_kv_cache_interleave_size"],
|
||||
)
|
||||
parallel_group.add_argument(
|
||||
"--cp-kv-cache-interleave-size",
|
||||
**parallel_kwargs["cp_kv_cache_interleave_size"],
|
||||
)
|
||||
parallel_group.add_argument(
|
||||
"--prefill-context-parallel-size",
|
||||
"-pcp",
|
||||
**parallel_kwargs["prefill_context_parallel_size"],
|
||||
)
|
||||
parallel_group.add_argument(
|
||||
"--data-parallel-size", "-dp", **parallel_kwargs["data_parallel_size"]
|
||||
)
|
||||
@@ -1600,6 +1611,7 @@ class EngineArgs:
|
||||
parallel_config = ParallelConfig(
|
||||
pipeline_parallel_size=self.pipeline_parallel_size,
|
||||
tensor_parallel_size=self.tensor_parallel_size,
|
||||
prefill_context_parallel_size=self.prefill_context_parallel_size,
|
||||
data_parallel_size=self.data_parallel_size,
|
||||
data_parallel_rank=self.data_parallel_rank or 0,
|
||||
data_parallel_external_lb=data_parallel_external_lb,
|
||||
@@ -1631,6 +1643,7 @@ class EngineArgs:
|
||||
worker_extension_cls=self.worker_extension_cls,
|
||||
decode_context_parallel_size=self.decode_context_parallel_size,
|
||||
dcp_kv_cache_interleave_size=self.dcp_kv_cache_interleave_size,
|
||||
cp_kv_cache_interleave_size=self.cp_kv_cache_interleave_size,
|
||||
_api_process_count=self._api_process_count,
|
||||
_api_process_rank=self._api_process_rank,
|
||||
)
|
||||
@@ -1952,6 +1965,15 @@ class EngineArgs:
|
||||
default_prefix_caching,
|
||||
) = self.get_chunked_prefill_prefix_caching_defaults(model_config)
|
||||
|
||||
if self.prefill_context_parallel_size > 1:
|
||||
default_chunked_prefill = False
|
||||
default_prefix_caching = False
|
||||
logger.warning(
|
||||
"--prefill-context-parallel-size > 1 is not compatible with "
|
||||
"chunked prefill and prefix caching now. Chunked prefill "
|
||||
"and prefix caching have been disabled by default."
|
||||
)
|
||||
|
||||
if self.enable_chunked_prefill is None:
|
||||
self.enable_chunked_prefill = default_chunked_prefill
|
||||
|
||||
|
||||
Reference in New Issue
Block a user