[cpu][perf] Fix low CPU utilization with VLLM_CPU_OMP_THREADS_BIND on AArch64 (#27415)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
@@ -301,8 +302,8 @@ class CpuPlatform(Platform):
|
||||
os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "0"
|
||||
|
||||
# Intel OpenMP setting
|
||||
ld_prealod_str = os.getenv("LD_PRELOAD", "")
|
||||
if "libiomp5.so" in ld_prealod_str:
|
||||
ld_preload_str = os.getenv("LD_PRELOAD", "")
|
||||
if "libiomp5.so" in ld_preload_str:
|
||||
# The time(milliseconds) that a thread should wait after
|
||||
# completing the execution of a parallel region, before sleeping.
|
||||
os.environ["KMP_BLOCKTIME"] = "1"
|
||||
@@ -313,6 +314,31 @@ class CpuPlatform(Platform):
|
||||
os.environ["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
|
||||
os.environ["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"
|
||||
|
||||
if (
|
||||
platform.system() == "Linux"
|
||||
and Platform.get_cpu_architecture() == CpuArchEnum.ARM
|
||||
and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
|
||||
):
|
||||
# We need to LD_PRELOAD PyTorch's libgomp, otherwise only
|
||||
# one core will be properly utilized when we thread-bind
|
||||
# See: https://github.com/vllm-project/vllm/issues/27369
|
||||
# TODO: Remove once:
|
||||
# https://github.com/pytorch/pytorch/issues/166087 is fixed
|
||||
|
||||
# We need to find the location of PyTorch's libgomp
|
||||
torch_pkg = os.path.dirname(torch.__file__)
|
||||
site_root = os.path.dirname(torch_pkg)
|
||||
torch_libs = os.path.join(site_root, "torch.libs")
|
||||
pytorch_libgomp_so_candidates = glob.glob(
|
||||
os.path.join(torch_libs, "libgomp-*.so*")
|
||||
)
|
||||
if pytorch_libgomp_so_candidates:
|
||||
pytorch_libgomp_so = pytorch_libgomp_so_candidates[0]
|
||||
if ld_preload_str:
|
||||
ld_preload_str += ":"
|
||||
ld_preload_str += pytorch_libgomp_so
|
||||
os.environ["LD_PRELOAD"] = ld_preload_str
|
||||
|
||||
# To hint IPEX uses shared memory based AllReduce
|
||||
os.environ["LOCAL_WORLD_SIZE"] = str(
|
||||
vllm_config.parallel_config.tensor_parallel_size
|
||||
)