[CPU][UX][Perf] Enable tcmalloc by default (#37607)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
This commit is contained in:
65
setup.py
65
setup.py
@@ -82,6 +82,66 @@ def is_freethreaded():
|
|||||||
return bool(sysconfig.get_config_var("Py_GIL_DISABLED"))
|
return bool(sysconfig.get_config_var("Py_GIL_DISABLED"))
|
||||||
|
|
||||||
|
|
||||||
|
def should_bundle_tcmalloc() -> bool:
    """Decide whether tcmalloc should be bundled into the package being built.

    Returns:
        True only when all three conditions hold: the build targets the
        ``"cpu"`` device (``VLLM_TARGET_DEVICE``), the build host platform
        string starts with ``"linux"``, and the machine type reported by
        ``platform.machine()`` is ``"aarch64"`` or ``"x86_64"``.
    """
    import platform

    # Checks run in the same order as a short-circuiting `and` chain, so the
    # later (host-probing) checks are skipped as soon as one fails.
    if VLLM_TARGET_DEVICE != "cpu":
        return False
    if not sys.platform.startswith("linux"):
        return False
    return platform.machine() in ("aarch64", "x86_64")
|
||||||
|
|
||||||
|
|
||||||
|
def find_tcmalloc() -> Path | None:
|
||||||
|
try:
|
||||||
|
# get all shared libs the dynamic loader knows about
|
||||||
|
output = subprocess.check_output(
|
||||||
|
["ldconfig", "-p"],
|
||||||
|
text=True,
|
||||||
|
stderr=subprocess.DEVNULL,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# search for libtcmalloc and libtcmalloc_minimal
|
||||||
|
for library_pattern in (
|
||||||
|
r"\blibtcmalloc_minimal\.so\.(\d+)\b",
|
||||||
|
r"\blibtcmalloc\.so\.(\d+)\b",
|
||||||
|
):
|
||||||
|
candidates: list[tuple[int, Path]] = []
|
||||||
|
for line in output.splitlines():
|
||||||
|
match = re.search(library_pattern, line)
|
||||||
|
if match is None or "=>" not in line:
|
||||||
|
continue
|
||||||
|
candidate = Path(line.split("=>")[1].strip())
|
||||||
|
if candidate.exists():
|
||||||
|
candidates.append((int(match.group(1)), candidate))
|
||||||
|
|
||||||
|
if candidates:
|
||||||
|
# if multiple candidates are found, pick the one with the highest
|
||||||
|
# version number
|
||||||
|
return max(candidates, key=lambda item: item[0])[1]
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def bundle_tcmalloc(build_lib: str) -> None:
    """Copy the tcmalloc library found on this host into ``<build_lib>/vllm/libs``.

    Args:
        build_lib: Root of the build output tree; the library is placed in
            its ``vllm/libs`` subdirectory, which is created if missing.

    When ``find_tcmalloc()`` returns ``None`` nothing is copied and a warning
    with an install hint is logged instead.
    """
    source_library = find_tcmalloc()

    if source_library is not None:
        target_dir = os.path.join(build_lib, "vllm", "libs")
        os.makedirs(target_dir, exist_ok=True)

        # Keep the file name from the located path for the bundled copy.
        target_path = os.path.join(target_dir, source_library.name)
        shutil.copy2(source_library, target_path)
        logger.info("Bundled tcmalloc into wheel: %s", target_path)
        return

    logger.warning(
        "Failed to locate tcmalloc. For best performance, "
        "please install tcmalloc (e.g. `sudo apt-get "
        "install -y --no-install-recommends libtcmalloc-minimal4`)"
    )
|
||||||
|
|
||||||
|
|
||||||
class CMakeExtension(Extension):
|
class CMakeExtension(Extension):
|
||||||
def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
|
def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
|
||||||
super().__init__(name, sources=[], py_limited_api=not is_freethreaded(), **kwa)
|
super().__init__(name, sources=[], py_limited_api=not is_freethreaded(), **kwa)
|
||||||
@@ -285,6 +345,10 @@ class cmake_build_ext(build_ext):
|
|||||||
# First, run the standard build_ext command to compile the extensions
|
# First, run the standard build_ext command to compile the extensions
|
||||||
super().run()
|
super().run()
|
||||||
|
|
||||||
|
# bundle tcmalloc into CPU wheels for best OOB perf
|
||||||
|
if should_bundle_tcmalloc():
|
||||||
|
bundle_tcmalloc(self.build_lib)
|
||||||
|
|
||||||
# copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
|
# copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
|
||||||
# directory so that they can be included in the editable build
|
# directory so that they can be included in the editable build
|
||||||
import glob
|
import glob
|
||||||
@@ -944,6 +1008,7 @@ if _build_custom_ops():
|
|||||||
package_data = {
|
package_data = {
|
||||||
"vllm": [
|
"vllm": [
|
||||||
"py.typed",
|
"py.typed",
|
||||||
|
"libs/*.so*",
|
||||||
"model_executor/layers/fused_moe/configs/*.json",
|
"model_executor/layers/fused_moe/configs/*.json",
|
||||||
"model_executor/layers/quantization/utils/configs/*.json",
|
"model_executor/layers/quantization/utils/configs/*.json",
|
||||||
"entrypoints/serve/instrumentator/static/*.js",
|
"entrypoints/serve/instrumentator/static/*.js",
|
||||||
|
|||||||
@@ -284,8 +284,9 @@ class CpuPlatform(Platform):
|
|||||||
# Avoid inductor generates num_thread() and breaks the thread binding
|
# Avoid inductor generates num_thread() and breaks the thread binding
|
||||||
os.environ["TORCHINDUCTOR_CPP_DYNAMIC_THREADS"] = "1"
|
os.environ["TORCHINDUCTOR_CPP_DYNAMIC_THREADS"] = "1"
|
||||||
|
|
||||||
# Intel OpenMP setting
|
|
||||||
ld_preload_str = os.getenv("LD_PRELOAD", "")
|
ld_preload_str = os.getenv("LD_PRELOAD", "")
|
||||||
|
|
||||||
|
# Intel OpenMP setting
|
||||||
if "libiomp5.so" in ld_preload_str:
|
if "libiomp5.so" in ld_preload_str:
|
||||||
# The time(milliseconds) that a thread should wait after
|
# The time(milliseconds) that a thread should wait after
|
||||||
# completing the execution of a parallel region, before sleeping.
|
# completing the execution of a parallel region, before sleeping.
|
||||||
@@ -297,10 +298,35 @@ class CpuPlatform(Platform):
|
|||||||
os.environ["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
|
os.environ["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
|
||||||
os.environ["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"
|
os.environ["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"
|
||||||
|
|
||||||
|
cpu_architecture = Platform.get_cpu_architecture()
|
||||||
|
|
||||||
|
# LD_PRELOAD libtcmalloc, bundled under vllm/libs to reduce
|
||||||
|
# memory allocation overhead
|
||||||
if (
|
if (
|
||||||
platform.system() == "Linux"
|
platform.system() == "Linux"
|
||||||
and Platform.get_cpu_architecture()
|
and cpu_architecture in (CpuArchEnum.ARM, CpuArchEnum.X86)
|
||||||
in (CpuArchEnum.ARM, CpuArchEnum.POWERPC)
|
and "libtcmalloc" not in ld_preload_str
|
||||||
|
):
|
||||||
|
vllm_pkg = os.path.dirname(os.path.dirname(__file__))
|
||||||
|
tcmalloc_so = None
|
||||||
|
for pattern in ("libtcmalloc_minimal*.so*", "libtcmalloc.so*"):
|
||||||
|
tcmalloc_so_candidates = glob.glob(
|
||||||
|
os.path.join(vllm_pkg, "libs", pattern)
|
||||||
|
)
|
||||||
|
if tcmalloc_so_candidates:
|
||||||
|
tcmalloc_so = tcmalloc_so_candidates[0]
|
||||||
|
break
|
||||||
|
|
||||||
|
if tcmalloc_so is not None:
|
||||||
|
if ld_preload_str:
|
||||||
|
ld_preload_str = f"{tcmalloc_so}:{ld_preload_str}"
|
||||||
|
else:
|
||||||
|
ld_preload_str = tcmalloc_so
|
||||||
|
os.environ["LD_PRELOAD"] = ld_preload_str
|
||||||
|
|
||||||
|
if (
|
||||||
|
platform.system() == "Linux"
|
||||||
|
and cpu_architecture in (CpuArchEnum.ARM, CpuArchEnum.POWERPC)
|
||||||
and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
|
and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
|
||||||
):
|
):
|
||||||
# We need to LD_PRELOAD PyTorch's libgomp, otherwise only
|
# We need to LD_PRELOAD PyTorch's libgomp, otherwise only
|
||||||
|
|||||||
Reference in New Issue
Block a user