[cpu][perf] Fix low CPU utilization with VLLM_CPU_OMP_THREADS_BIND on AArch64 (#27415)

Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
This commit is contained in:
Fadi Arafeh
2025-10-27 11:14:55 +00:00
committed by GitHub
parent a4fc21895e
commit a663f6ae64
3 changed files with 83 additions and 6 deletions

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob
import json
import os
import platform
@@ -301,8 +302,8 @@ class CpuPlatform(Platform):
os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "0"
# Intel OpenMP setting
ld_prealod_str = os.getenv("LD_PRELOAD", "")
if "libiomp5.so" in ld_prealod_str:
ld_preload_str = os.getenv("LD_PRELOAD", "")
if "libiomp5.so" in ld_preload_str:
# The time (in milliseconds) that a thread should wait after
# completing the execution of a parallel region before sleeping.
os.environ["KMP_BLOCKTIME"] = "1"
@@ -313,6 +314,31 @@ class CpuPlatform(Platform):
os.environ["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
os.environ["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"
if (
platform.system() == "Linux"
and Platform.get_cpu_architecture() == CpuArchEnum.ARM
and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
):
# We need to LD_PRELOAD PyTorch's libgomp; otherwise only
# one core will be properly utilized when we thread-bind.
# See: https://github.com/vllm-project/vllm/issues/27369
# TODO: Remove once
# https://github.com/pytorch/pytorch/issues/166087 is fixed.
# Locate PyTorch's libgomp: it is looked up in the "torch.libs"
# directory that sits next to the installed torch package.
torch_pkg = os.path.dirname(torch.__file__)
site_root = os.path.dirname(torch_pkg)
torch_libs = os.path.join(site_root, "torch.libs")
pytorch_libgomp_so_candidates = glob.glob(
os.path.join(torch_libs, "libgomp-*.so*")
)
if pytorch_libgomp_so_candidates:
pytorch_libgomp_so = pytorch_libgomp_so_candidates[0]
if ld_preload_str:
ld_preload_str += ":"
ld_preload_str += pytorch_libgomp_so
os.environ["LD_PRELOAD"] = ld_preload_str
# Hint IPEX to use shared-memory-based AllReduce
os.environ["LOCAL_WORLD_SIZE"] = str(
vllm_config.parallel_config.tensor_parallel_size