[Feature] Prefill Context Parallel (PCP) basic support (#28718)

Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
Signed-off-by: FENP <yuanyongjie.yyj@antgroup.com>
Signed-off-by: LookAround <lixushi@huawei.com>
Signed-off-by: Jingchun Gao <gaojingchun1@huawei.com>
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Co-authored-by: FENP <yuanyongjie.yyj@antgroup.com>
Co-authored-by: LookAround <lixushi@huawei.com>
Co-authored-by: Jingchun Gao <gaojingchun1@huawei.com>
Co-authored-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Co-authored-by: Jingchun Gao <63247409+gjc0824@users.noreply.github.com>
2025-11-20 04:52:44 +08:00
parent 02f5903b84
commit 2fd893b4ce
27 changed files with 399 additions and 114 deletions
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -31,7 +31,7 @@ class ParallelSetup(NamedTuple):
    tp_size: int
    pp_size: int
    dcp_size: int
-    dcp_kv_cache_interleave_size: int
+    cp_kv_cache_interleave_size: int
    eager_mode: bool
    chunked_prefill: bool

@@ -55,7 +55,7 @@ class CPTestSettings:
        tp_base: int = 4,
        pp_base: int = 1,
        dcp_base: int = 1,
-        dcp_kv_cache_interleave_size: int = 1,
+        cp_kv_cache_interleave_size: int = 1,
        multi_node_only: bool = False,
        runner: RunnerOption = "auto",
        load_format: str | None = None,
@@ -71,7 +71,7 @@ class CPTestSettings:
                                tp_size=tp_base,
                                pp_size=pp_multiplier * pp_base,
                                dcp_size=int(dcp_multiplier * tp_base),
-                                dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size,
+                                cp_kv_cache_interleave_size=cp_kv_cache_interleave_size,
                                eager_mode=eager_mode_val,
                                chunked_prefill=chunked_prefill_val,
                            )
@@ -116,7 +116,7 @@ def _compare_cp_with_tp(
        tp_size,
        pp_size,
        dcp_size,
-        dcp_kv_cache_interleave_size,
+        cp_kv_cache_interleave_size,
        eager_mode,
        chunked_prefill,
    ) = parallel_setup
@@ -197,7 +197,7 @@ def _compare_cp_with_tp(
        "--decode-context-parallel-size",
        str(dcp_size),
        "--dcp-kv-cache-interleave-size",
-        str(dcp_kv_cache_interleave_size),
+        str(cp_kv_cache_interleave_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]
@@ -227,7 +227,7 @@ CP_TEXT_GENERATION_MODELS = {
    "deepseek-ai/DeepSeek-V2-Lite-Chat": [
        CPTestSettings.detailed(),
        CPTestSettings.detailed(tp_base=2),
-        CPTestSettings.detailed(tp_base=2, dcp_kv_cache_interleave_size=64),
+        CPTestSettings.detailed(tp_base=2, cp_kv_cache_interleave_size=64),
    ],
    "bigcode/gpt_bigcode-santacoder": [
        CPTestSettings.detailed(),