[CPU] Enable shared-memory based pipeline parallel for CPU backend (#21289)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
This commit is contained in:
Li, Jiang
2025-07-22 00:07:08 +08:00
committed by GitHub
parent 6dda13c86b
commit a15a50fc17
8 changed files with 165 additions and 59 deletions

View File

@@ -1639,13 +1639,14 @@ class EngineArgs:
  # cpu specific default values.
  if current_platform.is_cpu():
+     world_size = self.pipeline_parallel_size * self.tensor_parallel_size
      default_max_num_batched_tokens = {
-         UsageContext.LLM_CLASS: 4096,
-         UsageContext.OPENAI_API_SERVER: 2048,
+         UsageContext.LLM_CLASS: 4096 * world_size,
+         UsageContext.OPENAI_API_SERVER: 2048 * world_size,
      }
      default_max_num_seqs = {
-         UsageContext.LLM_CLASS: 128,
-         UsageContext.OPENAI_API_SERVER: 32,
+         UsageContext.LLM_CLASS: 256 * world_size,
+         UsageContext.OPENAI_API_SERVER: 128 * world_size,
      }
use_context_value = usage_context.value if usage_context else None