In-Tree AMD Zen CPU Backend via zentorch [1/N] (#35970)
Signed-off-by: Lalithnarayan C <Lalithnarayan.C@amd.com> Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Co-authored-by: Chinmay-Kulkarni-AMD <Chinmay.Kulkarni@amd.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,7 @@
|
||||
#
|
||||
# Build targets:
|
||||
# vllm-openai (default): used for serving deployment
|
||||
# vllm-openai-zen: vLLM from source + zentorch from PyPI via vllm[zen]
|
||||
# vllm-test: used for CI tests
|
||||
# vllm-dev: used for development
|
||||
#
|
||||
@@ -222,3 +223,19 @@ LABEL ai.vllm.build.cpu-arm-bf16="${VLLM_CPU_ARM_BF16:-false}"
|
||||
LABEL ai.vllm.build.python-version="${PYTHON_VERSION:-3.12}"
|
||||
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
|
||||
|
||||
######################### ZEN CPU PYPI IMAGE #########################
# vllm-openai-zen: the serving image plus AMD Zen CPU optimizations,
# installed from PyPI via the vllm[zen] extra (see setup.py).
FROM vllm-openai AS vllm-openai-zen

ARG TARGETARCH

# This stage is only supported on linux/amd64; fail the build fast with a
# clear message otherwise. (Fixed: the message previously referenced a
# non-existent "vllm-openai-amd" target instead of this stage's name.)
RUN if [ "$TARGETARCH" != "amd64" ]; then \
        echo "ERROR: vllm-openai-zen only supports --platform=linux/amd64"; \
        exit 1; \
    fi

# Cache uv downloads across builds to speed up repeated image builds.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install "vllm[zen]"

ENTRYPOINT ["vllm", "serve"]
|
||||
|
||||
2
setup.py
2
setup.py
@@ -966,6 +966,8 @@ setup(
|
||||
ext_modules=ext_modules,
|
||||
install_requires=get_requirements(),
|
||||
extras_require={
|
||||
# AMD Zen CPU optimizations via zentorch
|
||||
"zen": ["zentorch"],
|
||||
"bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy", "plotly"],
|
||||
"tensorizer": ["tensorizer==2.10.1"],
|
||||
"fastsafetensors": ["fastsafetensors >= 0.2.2"],
|
||||
|
||||
68
tests/model_executor/test_cpu_unquantized_gemm_dispatch.py
Normal file
68
tests/model_executor/test_cpu_unquantized_gemm_dispatch.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for CPU unquantized GEMM dispatch behavior."""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers import utils
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def _mock_zentorch_linear_unary():
    """Register a mock zentorch_linear_unary op when zentorch is not installed.

    Allows the dispatch tests to run in CI without a real zentorch build.
    Skips registration when zentorch is already available.

    Yields:
        None. Any registration performed here is torn down after all tests
        in the module have run (module-scoped fixture).
    """
    # Real zentorch (or a prior registration) present: nothing to do.
    if hasattr(torch.ops.zentorch, "zentorch_linear_unary"):
        yield
        return

    # Define the op schema in a fresh "zentorch" library namespace.
    # NOTE(review): this schema presumably mirrors the real zentorch op's
    # signature — verify against the zentorch release being targeted.
    lib_def = torch.library.Library("zentorch", "DEF")
    lib_def.define(
        "zentorch_linear_unary("
        "Tensor input, "
        "Tensor weight, "
        "Tensor? bias, "
        "bool is_weight_prepacked=False"
        ") -> Tensor"
    )

    # CPU implementation: fall back to stock F.linear. The
    # is_weight_prepacked flag is accepted but ignored by the mock.
    lib_impl = torch.library.Library("zentorch", "IMPL", "CPU")
    lib_impl.impl(
        "zentorch_linear_unary",
        lambda input, weight, bias, is_weight_prepacked=False: (
            torch.nn.functional.linear(input, weight, bias)
        ),
    )

    yield

    # Tear down in reverse order of registration so the namespace is left
    # clean for any later tests in the same process.
    lib_impl._destroy()
    lib_def._destroy()
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("_mock_zentorch_linear_unary")
def test_dispatch_cpu_unquantized_gemm_uses_zentorch_on_zen(monkeypatch):
    """Dispatching on a Zen CPU routes through the zentorch linear op
    and produces the same result as stock F.linear."""
    # Force the platform check so the zentorch path is selected.
    monkeypatch.setattr(current_platform, "is_zen_cpu", lambda: True)

    linear = torch.nn.Linear(16, 8, bias=True)
    inputs = torch.randn(4, 16)
    # Reference output from the vanilla PyTorch linear kernel.
    reference = torch.nn.functional.linear(inputs, linear.weight, linear.bias)

    utils.dispatch_cpu_unquantized_gemm(linear, remove_weight=False)
    actual = linear.cpu_linear(inputs, linear.weight, linear.bias)

    torch.testing.assert_close(actual, reference)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("_mock_zentorch_linear_unary")
def test_dispatch_cpu_unquantized_gemm_zen_remove_weight(monkeypatch):
    """With remove_weight=True the original weight tensor is emptied
    after dispatch (the zentorch path holds its own copy)."""
    # Force the platform check so the zentorch path is selected.
    monkeypatch.setattr(current_platform, "is_zen_cpu", lambda: True)

    linear = torch.nn.Linear(16, 8, bias=True)
    utils.dispatch_cpu_unquantized_gemm(linear, remove_weight=True)

    assert linear.weight.numel() == 0
|
||||
37
tests/test_zen_cpu_platform_detection.py
Normal file
37
tests/test_zen_cpu_platform_detection.py
Normal file
@@ -0,0 +1,37 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from unittest.mock import mock_open, patch
|
||||
|
||||
from vllm.platforms import _is_amd_zen_cpu
|
||||
|
||||
|
||||
def test_is_amd_zen_cpu_detects_amd_with_avx512():
    """An AMD part advertising AVX-512 flags is classified as Zen."""
    fake_cpuinfo = "vendor_id: AuthenticAMD\nflags: avx avx2 avx512f avx512bw"
    exists_patch = patch("os.path.exists", return_value=True)
    open_patch = patch("builtins.open", mock_open(read_data=fake_cpuinfo))
    with exists_patch, open_patch:
        assert _is_amd_zen_cpu()
|
||||
|
||||
|
||||
def test_is_amd_zen_cpu_returns_false_for_amd_without_avx512():
    """The AMD vendor string alone is not enough: AVX-512 is required."""
    fake_cpuinfo = "vendor_id: AuthenticAMD\nflags: avx avx2"
    exists_patch = patch("os.path.exists", return_value=True)
    open_patch = patch("builtins.open", mock_open(read_data=fake_cpuinfo))
    with exists_patch, open_patch:
        assert not _is_amd_zen_cpu()
|
||||
|
||||
|
||||
def test_is_amd_zen_cpu_returns_false_for_intel_with_avx512():
    """AVX-512 on a non-AMD vendor must not be classified as Zen."""
    fake_cpuinfo = "vendor_id: GenuineIntel\nflags: avx avx2 avx512f"
    exists_patch = patch("os.path.exists", return_value=True)
    open_patch = patch("builtins.open", mock_open(read_data=fake_cpuinfo))
    with exists_patch, open_patch:
        assert not _is_amd_zen_cpu()
|
||||
|
||||
|
||||
def test_is_amd_zen_cpu_returns_false_when_cpuinfo_missing():
    """Hosts without /proc/cpuinfo (non-Linux) must fail detection closed."""
    missing_patch = patch("os.path.exists", return_value=False)
    with missing_patch:
        assert not _is_amd_zen_cpu()
|
||||
@@ -51,6 +51,7 @@ if TYPE_CHECKING:
|
||||
VLLM_CPU_OMP_THREADS_BIND: str = "auto"
|
||||
VLLM_CPU_NUM_OF_RESERVED_CPU: int | None = None
|
||||
VLLM_CPU_SGL_KERNEL: bool = False
|
||||
VLLM_ZENTORCH_WEIGHT_PREPACK: bool = True
|
||||
VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
|
||||
VLLM_XLA_CHECK_RECOMPILATION: bool = False
|
||||
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl", "shm"] = "auto"
|
||||
@@ -709,6 +710,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
else None,
|
||||
# (CPU backend only) whether to use SGL kernels, optimized for small batch.
|
||||
"VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))),
|
||||
# (Zen CPU backend) eagerly prepack weights into ZenDNN blocked layout
|
||||
# at model load time. Eliminates per-inference layout conversion overhead.
|
||||
"VLLM_ZENTORCH_WEIGHT_PREPACK": lambda: bool(
|
||||
int(os.getenv("VLLM_ZENTORCH_WEIGHT_PREPACK", "1"))
|
||||
),
|
||||
# If the env var is set, Ray Compiled Graph uses the specified
|
||||
# channel type to communicate between workers belonging to
|
||||
# different pipeline-parallel stages.
|
||||
@@ -1768,6 +1774,7 @@ def compile_factors() -> dict[str, object]:
|
||||
"VLLM_V1_OUTPUT_PROC_CHUNK_SIZE",
|
||||
"VLLM_CPU_KVCACHE_SPACE",
|
||||
"VLLM_CPU_MOE_PREPACK",
|
||||
"VLLM_ZENTORCH_WEIGHT_PREPACK",
|
||||
"VLLM_TEST_FORCE_LOAD_FORMAT",
|
||||
"VLLM_ENABLE_CUDA_COMPATIBILITY",
|
||||
"VLLM_CUDA_COMPATIBILITY_PATH",
|
||||
|
||||
@@ -231,6 +231,30 @@ def dispatch_cpu_unquantized_gemm(
|
||||
N, K = layer.weight.size()
|
||||
dtype = layer.weight.dtype
|
||||
|
||||
# Zen CPU path: zentorch_linear_unary with optional eager weight prepacking.
|
||||
if current_platform.is_zen_cpu() and hasattr(
|
||||
torch.ops.zentorch, "zentorch_linear_unary"
|
||||
):
|
||||
zen_weight = layer.weight.detach()
|
||||
is_prepacked = False
|
||||
|
||||
if envs.VLLM_ZENTORCH_WEIGHT_PREPACK and hasattr(
|
||||
torch.ops.zentorch, "zentorch_weight_prepack_for_linear"
|
||||
):
|
||||
zen_weight = torch.ops.zentorch.zentorch_weight_prepack_for_linear(
|
||||
zen_weight
|
||||
)
|
||||
is_prepacked = True
|
||||
|
||||
layer.cpu_linear = lambda x, weight, bias, _p=is_prepacked: (
|
||||
torch.ops.zentorch.zentorch_linear_unary(
|
||||
x, zen_weight, bias, is_weight_prepacked=_p
|
||||
)
|
||||
)
|
||||
if remove_weight:
|
||||
layer.weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
|
||||
return
|
||||
|
||||
if envs.VLLM_CPU_SGL_KERNEL and check_cpu_sgl_kernel(N, K, dtype):
|
||||
packed_weight = torch.ops._C.convert_weight_packed(layer.weight)
|
||||
if getattr(layer, "bias", None) is not None:
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import logging
|
||||
import os
|
||||
import traceback
|
||||
from itertools import chain
|
||||
from typing import TYPE_CHECKING
|
||||
@@ -150,6 +151,15 @@ def xpu_platform_plugin() -> str | None:
|
||||
return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None
|
||||
|
||||
|
||||
def _is_amd_zen_cpu() -> bool:
|
||||
"""Detect AMD CPU with AVX-512 via /proc/cpuinfo."""
|
||||
if not os.path.exists("/proc/cpuinfo"):
|
||||
return False
|
||||
with open("/proc/cpuinfo") as f:
|
||||
cpuinfo = f.read()
|
||||
return "AuthenticAMD" in cpuinfo and "avx512" in cpuinfo
|
||||
|
||||
|
||||
def cpu_platform_plugin() -> str | None:
|
||||
is_cpu = False
|
||||
logger.debug("Checking if CPU platform is available.")
|
||||
@@ -171,7 +181,24 @@ def cpu_platform_plugin() -> str | None:
|
||||
except Exception as e:
|
||||
logger.debug("CPU platform is not available because: %s", str(e))
|
||||
|
||||
return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None
|
||||
if not is_cpu:
|
||||
return None
|
||||
|
||||
if _is_amd_zen_cpu():
|
||||
try:
|
||||
import zentorch # noqa: F401
|
||||
|
||||
logger.debug(
|
||||
"AMD Zen CPU detected with zentorch installed, using ZenCpuPlatform."
|
||||
)
|
||||
return "vllm.platforms.zen_cpu.ZenCpuPlatform"
|
||||
except ImportError:
|
||||
logger.debug(
|
||||
"AMD Zen CPU detected but zentorch not installed, "
|
||||
"falling back to CpuPlatform."
|
||||
)
|
||||
|
||||
return "vllm.platforms.cpu.CpuPlatform"
|
||||
|
||||
|
||||
builtin_platform_plugins = {
|
||||
@@ -269,4 +296,11 @@ def __setattr__(name: str, value):
|
||||
raise AttributeError(f"No attribute named '{name}' exists in {__name__}.")
|
||||
|
||||
|
||||
__all__ = ["Platform", "PlatformEnum", "current_platform", "CpuArchEnum", "_init_trace"]
|
||||
__all__ = [
|
||||
"Platform",
|
||||
"PlatformEnum",
|
||||
"current_platform",
|
||||
"CpuArchEnum",
|
||||
"_init_trace",
|
||||
"_is_amd_zen_cpu",
|
||||
]
|
||||
|
||||
@@ -167,6 +167,9 @@ class Platform:
|
||||
def is_cpu(self) -> bool:
|
||||
return self._enum == PlatformEnum.CPU
|
||||
|
||||
def is_zen_cpu(self) -> bool:
    """Whether this is the AMD Zen-optimized CPU platform.

    Base implementation returns False; ZenCpuPlatform overrides to True.
    """
    return False
|
||||
|
||||
def is_out_of_tree(self) -> bool:
|
||||
return self._enum == PlatformEnum.OOT
|
||||
|
||||
|
||||
67
vllm/platforms/zen_cpu.py
Normal file
67
vllm/platforms/zen_cpu.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms.cpu import CpuPlatform
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
|
||||
|
||||
class ZenCpuPlatform(CpuPlatform):
    """CPU platform with AMD Zen (ZenDNN/zentorch) optimizations.

    Model-load time (dispatch_cpu_unquantized_gemm in layers/utils.py):
    - Routes linear ops to zentorch_linear_unary.
    - When VLLM_ZENTORCH_WEIGHT_PREPACK=1 (default), eagerly prepacks
      weights via zentorch_weight_prepack_for_linear.
    """

    # Reported as a plain CPU device; the Zen specialization is purely
    # behavioral (op routing + backports), not a distinct device type.
    device_name: str = "cpu"
    device_type: str = "cpu"

    def is_zen_cpu(self) -> bool:
        # is_cpu() also returns True for this platform (inherited from CpuPlatform).
        return True

    @classmethod
    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """Apply the standard CPU config updates, then the Zen-specific
        PyTorch backports."""
        super().check_and_update_config(vllm_config)
        cls._apply_pytorch_backports()

    @classmethod
    def _apply_pytorch_backports(cls):
        """Backport PyTorch mainline fixes missing in 2.10.

        PyTorch 2.10 has a bug in FxGraphCachePickler.dumps that doesn't
        catch ValueError, causing torch.compile cache misses. Remove this
        once we drop PyTorch 2.10 support. PT mainline already has this fix.
        """
        # Gate: only 2.10.x needs the patch — older versions are out of
        # scope and >=2.11 already carries the fix upstream.
        if not is_torch_equal_or_newer("2.10.0") or is_torch_equal_or_newer("2.11.0"):
            return

        cls._patch_fxgraphcache_pickle()

    @classmethod
    def _patch_fxgraphcache_pickle(cls):
        """Backport mainline ValueError fix to FxGraphCachePickler.dumps()."""
        # Imported lazily: torch._inductor is heavyweight and private API.
        from torch._inductor.codecache import BypassFxGraphCache, FxGraphCachePickler

        original_dumps = FxGraphCachePickler.dumps
        # Idempotence guard: calling this twice must not double-wrap.
        if hasattr(original_dumps, "_zen_patched"):
            return

        # NOTE(review): this assumes `dumps` is a plain function on the
        # class (so `original_dumps(self, obj)` forwards correctly). If it
        # is a classmethod descriptor in the targeted 2.10 build, the
        # forwarding would differ — confirm against the PyTorch source.
        def patched_dumps(self, obj):
            try:
                return original_dumps(self, obj)
            except ValueError as e:
                # Mainline behavior: an unpicklable cache key bypasses the
                # FX graph cache instead of propagating and failing compile.
                raise BypassFxGraphCache("Failed to pickle cache key") from e

        patched_dumps._zen_patched = True  # type: ignore[attr-defined]
        FxGraphCachePickler.dumps = patched_dumps
        logger.info("[zen_cpu] Patched FxGraphCachePickler.dumps (ValueError fix)")
|
||||
Reference in New Issue
Block a user