[cpu][perf] Fix low CPU utilization with VLLM_CPU_OMP_THREADS_BIND on AArch64 (#27415)

Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2025-10-27 11:14:55 +00:00
parent a4fc21895e
commit a663f6ae64
3 changed files with 83 additions and 6 deletions
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import glob
 import json
 import os
 import platform
@@ -301,8 +302,8 @@ class CpuPlatform(Platform):
        os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "0"

        # Intel OpenMP setting
-        ld_prealod_str = os.getenv("LD_PRELOAD", "")
-        if "libiomp5.so" in ld_prealod_str:
+        ld_preload_str = os.getenv("LD_PRELOAD", "")
+        if "libiomp5.so" in ld_preload_str:
            # The time(milliseconds) that a thread should wait after
            # completing the execution of a parallel region, before sleeping.
            os.environ["KMP_BLOCKTIME"] = "1"
@@ -313,6 +314,31 @@ class CpuPlatform(Platform):
            os.environ["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
            os.environ["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"

+        if (
+            platform.system() == "Linux"
+            and Platform.get_cpu_architecture() == CpuArchEnum.ARM
+            and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
+        ):
+            # We need to LD_PRELOAD PyTorch's libgomp; otherwise only one
+            # core is properly utilized when OpenMP threads are bound.
+            # See: https://github.com/vllm-project/vllm/issues/27369
+            # TODO: Remove once
+            # https://github.com/pytorch/pytorch/issues/166087 is fixed.
+
+            # Locate PyTorch's libgomp in `torch.libs`, next to the torch package
+            torch_pkg = os.path.dirname(torch.__file__)
+            site_root = os.path.dirname(torch_pkg)
+            torch_libs = os.path.join(site_root, "torch.libs")
+            pytorch_libgomp_so_candidates = glob.glob(
+                os.path.join(torch_libs, "libgomp-*.so*")
+            )
+            if pytorch_libgomp_so_candidates:
+                pytorch_libgomp_so = pytorch_libgomp_so_candidates[0]
+                if ld_preload_str:
+                    ld_preload_str += ":"
+                ld_preload_str += pytorch_libgomp_so
+                os.environ["LD_PRELOAD"] = ld_preload_str
+
        # To hint IPEX uses shared memory based AllReduce
        os.environ["LOCAL_WORLD_SIZE"] = str(
            vllm_config.parallel_config.tensor_parallel_size