diff --git a/setup.py b/setup.py
index 2f251a6a2..749977029 100644
--- a/setup.py
+++ b/setup.py
@@ -82,6 +82,66 @@ def is_freethreaded():
     return bool(sysconfig.get_config_var("Py_GIL_DISABLED"))
 
 
+def should_bundle_tcmalloc() -> bool:
+    """Tell whether this build is a Linux aarch64/x86_64 CPU-target build."""
+    import platform
+
+    # cheap target/OS checks first; only then ask for the machine type
+    if VLLM_TARGET_DEVICE != "cpu" or not sys.platform.startswith("linux"):
+        return False
+    return platform.machine() in ("aarch64", "x86_64")
+
+
+def find_tcmalloc() -> Path | None:
+    """Locate a system tcmalloc shared library via ``ldconfig -p``.
+
+    ``libtcmalloc_minimal`` is tried before ``libtcmalloc``; the existing file
+    with the highest numeric version wins. Returns ``None`` when none is found.
+    """
+    try:
+        # dump the dynamic loader's cache of known shared libraries
+        cache_listing = subprocess.check_output(
+            ["ldconfig", "-p"], text=True, stderr=subprocess.DEVNULL
+        )
+    except Exception:
+        return None
+
+    def existing_versions(soname_regex: str) -> list[tuple[int, Path]]:
+        found: list[tuple[int, Path]] = []
+        for entry in cache_listing.splitlines():
+            hit = re.search(soname_regex, entry)
+            if hit is not None and "=>" in entry:
+                resolved = Path(entry.split("=>")[1].strip())
+                if resolved.exists():
+                    found.append((int(hit.group(1)), resolved))
+        return found
+
+    for flavour in (
+        r"\blibtcmalloc_minimal\.so\.(\d+)\b",
+        r"\blibtcmalloc\.so\.(\d+)\b",
+    ):
+        if versions := existing_versions(flavour):
+            return max(versions, key=lambda pair: pair[0])[1]
+    return None
+
+
+def bundle_tcmalloc(build_lib: str) -> None:
+    """Copy the system tcmalloc into ``<build_lib>/vllm/libs``; warn if absent."""
+    source_lib = find_tcmalloc()
+    if source_lib is not None:
+        target_dir = os.path.join(build_lib, "vllm", "libs")
+        os.makedirs(target_dir, exist_ok=True)
+        target = os.path.join(target_dir, source_lib.name)
+        shutil.copy2(source_lib, target)
+        logger.info("Bundled tcmalloc into wheel: %s", target)
+        return
+    logger.warning(
+        "Failed to locate tcmalloc. For best performance, "
+        "please install tcmalloc (e.g. `sudo apt-get "
+        "install -y --no-install-recommends libtcmalloc-minimal4`)"
+    )
+
+
 class CMakeExtension(Extension):
     def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
         super().__init__(name, sources=[], py_limited_api=not is_freethreaded(), **kwa)
@@ -285,6 +345,10 @@ class cmake_build_ext(build_ext):
         # First, run the standard build_ext command to compile the extensions
         super().run()
 
+        # bundle tcmalloc into CPU wheels for best out-of-the-box performance
+        if should_bundle_tcmalloc():
+            bundle_tcmalloc(self.build_lib)
+
         # copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
         # directory so that they can be included in the editable build
         import glob
@@ -944,6 +1008,7 @@ if _build_custom_ops():
 package_data = {
     "vllm": [
         "py.typed",
+        "libs/*.so*",
         "model_executor/layers/fused_moe/configs/*.json",
         "model_executor/layers/quantization/utils/configs/*.json",
         "entrypoints/serve/instrumentator/static/*.js",
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index f8fc3a38a..7fbad3e4c 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -284,8 +284,9 @@ class CpuPlatform(Platform):
         # Avoid inductor generates num_thread() and breaks the thread binding
         os.environ["TORCHINDUCTOR_CPP_DYNAMIC_THREADS"] = "1"
 
-        # Intel OpenMP setting
         ld_preload_str = os.getenv("LD_PRELOAD", "")
+
+        # Intel OpenMP setting
         if "libiomp5.so" in ld_preload_str:
             # The time(milliseconds) that a thread should wait after
             # completing the execution of a parallel region, before sleeping.
@@ -297,10 +298,35 @@ class CpuPlatform(Platform):
             os.environ["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
             os.environ["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"
 
+        cpu_architecture = Platform.get_cpu_architecture()
+
+        # LD_PRELOAD the libtcmalloc bundled under vllm/libs (if present) to
+        # reduce memory allocation overhead
         if (
             platform.system() == "Linux"
-            and Platform.get_cpu_architecture()
-            in (CpuArchEnum.ARM, CpuArchEnum.POWERPC)
+            and cpu_architecture in (CpuArchEnum.ARM, CpuArchEnum.X86)
+            and "libtcmalloc" not in ld_preload_str
+        ):
+            vllm_pkg = os.path.dirname(os.path.dirname(__file__))
+            tcmalloc_so = None
+            for pattern in ("libtcmalloc_minimal*.so*", "libtcmalloc.so*"):
+                tcmalloc_so_candidates = glob.glob(
+                    os.path.join(vllm_pkg, "libs", pattern)
+                )
+                if tcmalloc_so_candidates:
+                    tcmalloc_so = tcmalloc_so_candidates[0]
+                    break
+
+            if tcmalloc_so is not None:
+                if ld_preload_str:
+                    ld_preload_str = f"{tcmalloc_so}:{ld_preload_str}"
+                else:
+                    ld_preload_str = tcmalloc_so
+                os.environ["LD_PRELOAD"] = ld_preload_str
+
+        if (
+            platform.system() == "Linux"
+            and cpu_architecture in (CpuArchEnum.ARM, CpuArchEnum.POWERPC)
             and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
         ):
             # We need to LD_PRELOAD PyTorch's libgomp, otherwise only