[bugfix] Fix auto thread-binding when world_size > 1 in CPU backend and refactor code (#21032)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
This commit is contained in:
@@ -44,7 +44,7 @@ if TYPE_CHECKING:
|
||||
VLLM_PP_LAYER_PARTITION: Optional[str] = None
|
||||
VLLM_CPU_KVCACHE_SPACE: int = 0
|
||||
VLLM_CPU_OMP_THREADS_BIND: str = ""
|
||||
VLLM_CPU_NUM_OF_RESERVED_CPU: int = 0
|
||||
VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None
|
||||
VLLM_CPU_MOE_PREPACK: bool = True
|
||||
VLLM_CPU_SGL_KERNEL: bool = False
|
||||
VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
|
||||
@@ -442,7 +442,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# (CPU backend only) CPU cores not used by OMP threads .
|
||||
# Those CPU cores will not be used by OMP threads of a rank.
|
||||
"VLLM_CPU_NUM_OF_RESERVED_CPU":
|
||||
lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")),
|
||||
lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0"))
|
||||
if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ else None,
|
||||
|
||||
# (CPU backend only) whether to use prepack for MoE layer. This will be
|
||||
# passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might
|
||||
|
||||
Reference in New Issue
Block a user