[bugfix] Fix auto thread-binding when world_size > 1 in CPU backend and refactor code (#21032)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-19 20:13:55 +08:00
parent b3d82108e7
commit e3a0e43d7f
7 changed files with 144 additions and 150 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -44,7 +44,7 @@ if TYPE_CHECKING:
    VLLM_PP_LAYER_PARTITION: Optional[str] = None
    VLLM_CPU_KVCACHE_SPACE: int = 0
    VLLM_CPU_OMP_THREADS_BIND: str = ""
-    VLLM_CPU_NUM_OF_RESERVED_CPU: int = 0
+    VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None
    VLLM_CPU_MOE_PREPACK: bool = True
    VLLM_CPU_SGL_KERNEL: bool = False
    VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
@@ -442,7 +442,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # (CPU backend only) CPU cores not used by OMP threads .
    # Those CPU cores will not be used by OMP threads of a rank.
    "VLLM_CPU_NUM_OF_RESERVED_CPU":
-    lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")),
+    lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0"))
+    if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ else None,

    # (CPU backend only) whether to use prepack for MoE layer. This will be
    # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might