[UX] Improve UX of CPU backend (#36968)

Signed-off-by: jiang1.li <jiang1.li@intel.com> Signed-off-by: Li, Jiang <bigpyj64@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-14 09:27:29 +08:00
parent f680dc1b39
commit 092ace9e3a
10 changed files with 174 additions and 118 deletions
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -21,6 +21,20 @@ steps:
      pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
      pytest -x -v -s tests/kernels/test_onednn.py"

+- label: CPU-Compatibility Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - cmake/cpu_extension.cmake
+  - setup.py
+  - vllm/platforms/cpu.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
+
 - label: CPU-Language Generation and Pooling Model Tests
  depends_on: []
  soft_fail: true
--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -25,9 +25,7 @@ fi
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --build-arg VLLM_CPU_AVX512BF16=true \
-  --build-arg VLLM_CPU_AVX512VNNI=true \
-  --build-arg VLLM_CPU_AMXBF16=true \
+  --build-arg VLLM_CPU_X86=true \
  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
  --target vllm-test \
  --progress plain .
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -83,7 +83,7 @@ steps:
        agents:
          queue: cpu_queue_postmerge
        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
          - "mkdir artifacts"
          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
@@ -152,7 +152,7 @@ steps:
          queue: cpu_queue_postmerge
        commands:
          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
        env:
--- a/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+set -euox pipefail
+
+export VLLM_CPU_KVCACHE_SPACE=1 
+export VLLM_CPU_CI_ENV=1
+# Reduce sub-processes for acceleration
+export TORCH_COMPILE_DISABLE=1 
+export VLLM_ENABLE_V1_MULTIPROCESSING=0
+
+SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
+SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
+wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
+echo "${SDE_CHECKSUM}  ${SDE_ARCHIVE}" | sha256sum --check
+mkdir -p sde
+tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
+
+wait_for_pid_and_check_log() {
+    local pid="$1"
+    local log_file="$2"
+    local exit_status
+
+    if [ -z "$pid" ] || [ -z "$log_file" ]; then
+        echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
+        return 1
+    fi
+
+    echo "Waiting for process $pid to finish..."
+    
+    # Use the 'wait' command to pause the script until the specific PID exits.
+    # The 'wait' command's own exit status will be that of the waited-for process.
+    if wait "$pid"; then
+        exit_status=$?
+        echo "Process $pid finished with exit status $exit_status (Success)."
+    else
+        exit_status=$?
+        echo "Process $pid finished with exit status $exit_status (Failure)."
+    fi
+
+    if [ "$exit_status" -ne 0 ]; then
+        echo "Process exited with a non-zero status."
+        echo "--- Last few lines of log file: $log_file ---"
+        tail -n 50 "$log_file"
+        echo "---------------------------------------------"
+        return 1 # Indicate failure based on exit status
+    fi
+
+    echo "No errors detected in log file and process exited successfully."
+    return 0
+}
+
+# Test Sky Lake (AVX512F)
+./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
+PID_TEST_0=$!
+
+# Test Cascade Lake (AVX512F + VNNI)
+./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
+PID_TEST_1=$!
+
+# Test Cooper Lake (AVX512F + VNNI + BF16)
+./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
+PID_TEST_2=$!
+
+wait_for_pid_and_check_log $PID_TEST_0 test_0.log
+wait_for_pid_and_check_log $PID_TEST_1 test_1.log
+wait_for_pid_and_check_log $PID_TEST_2 test_2.log
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -102,11 +102,13 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
        "-mavx512f"
        "-mavx512vl"
        "-mavx512bw"
-        "-mavx512dq"
-        "-mavx512bf16"
-        "-mavx512vnni"
+        "-mavx512dq")
+    list(APPEND CXX_COMPILE_FLAGS_AVX512_AMX 
+        ${CXX_COMPILE_FLAGS_AVX512}
        "-mamx-bf16"
-        "-mamx-tile")
+        "-mamx-tile"
+        "-mavx512bf16"
+        "-mavx512vnni")
    list(APPEND CXX_COMPILE_FLAGS_AVX2
        "-mavx2")
 elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
@@ -314,7 +316,8 @@ endif()

 # TODO: Refactor this
 if (ENABLE_X86_ISA)
-    message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
+    message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) compile flags: ${CXX_COMPILE_FLAGS_AVX512_AMX}")
+    message(STATUS "CPU extension (AVX512F) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
    message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
 else()
    message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
@@ -366,13 +369,15 @@ if(USE_ONEDNN)
 endif()

 if (ENABLE_X86_ISA)
-    set(VLLM_EXT_SRC_AVX512
+    set(VLLM_EXT_SRC_SGL
        "csrc/cpu/sgl-kernels/gemm.cpp"
        "csrc/cpu/sgl-kernels/gemm_int8.cpp"
        "csrc/cpu/sgl-kernels/gemm_fp8.cpp"
        "csrc/cpu/sgl-kernels/moe.cpp"
        "csrc/cpu/sgl-kernels/moe_int8.cpp"
-        "csrc/cpu/sgl-kernels/moe_fp8.cpp"
+        "csrc/cpu/sgl-kernels/moe_fp8.cpp")
+
+    set(VLLM_EXT_SRC_AVX512
        "csrc/cpu/shm.cpp"
        "csrc/cpu/cpu_wna16.cpp"
        "csrc/cpu/cpu_fused_moe.cpp"
@@ -398,31 +403,48 @@ if (ENABLE_X86_ISA)
        "csrc/cpu/pos_encoding.cpp"
        "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") 

-    message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}")
+    message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) source files: ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}")
+    message(STATUS "CPU extension (AVX512F) source files: ${VLLM_EXT_SRC_AVX512}")
    message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")

+    set(_C_LIBS numa dnnl_ext)
+    set(_C_AVX512_LIBS numa dnnl_ext)
+    set(_C_AVX2_LIBS numa)
+
+    # AMX + AVX512F + AVX512BF16 + AVX512VNNI
    define_extension_target(
        _C
        DESTINATION vllm
        LANGUAGE CXX
+        SOURCES ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}
+        LIBRARIES ${_C_LIBS}
+        COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512_AMX}
+        USE_SABI 3
+        WITH_SOABI
+    )
+
+    # For AMX kernels
+    target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
+
+    # AVX512F 
+    define_extension_target(
+        _C_AVX512
+        DESTINATION vllm
+        LANGUAGE CXX
        SOURCES ${VLLM_EXT_SRC_AVX512}
-        LIBRARIES ${LIBS}
+        LIBRARIES ${_C_AVX512_LIBS}
        COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
        USE_SABI 3
        WITH_SOABI
    )

-    # For SGL kernels
-    target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
-    # For AMX kernels
-    target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
-
+    # AVX2 
    define_extension_target(
        _C_AVX2
        DESTINATION vllm
        LANGUAGE CXX
        SOURCES ${VLLM_EXT_SRC_AVX2}
-        LIBRARIES ${LIBS}
+        LIBRARIES ${_C_AVX2_LIBS}
        COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
        USE_SABI 3
        WITH_SOABI
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -14,12 +14,7 @@
 #
 # Build arguments:
 #   PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
-#   VLLM_CPU_DISABLE_AVX512=false (default)|true
-#   VLLM_CPU_AVX2=false (default)|true (for cross-compilation)
-#   VLLM_CPU_AVX512=false (default)|true (for cross-compilation)
-#   VLLM_CPU_AVX512BF16=false (default)|true (for cross-compilation)
-#   VLLM_CPU_AVX512VNNI=false (default)|true (for cross-compilation)
-#   VLLM_CPU_AMXBF16=false (default)|true (for cross-compilation)
+#   VLLM_CPU_X86=false (default)|true (for cross-compilation)
 #   VLLM_CPU_ARM_BF16=false (default)|true (for cross-compilation)
 #

@@ -36,7 +31,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y \
    && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
-    gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
+    gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof xz-utils \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
    && curl -LsSf https://astral.sh/uv/install.sh | sh

@@ -91,24 +86,9 @@ ARG max_jobs=32
 ENV MAX_JOBS=${max_jobs}

 ARG GIT_REPO_CHECK=0
-# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
-ARG VLLM_CPU_DISABLE_AVX512=0
-ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
-# Support for cross-compilation with AVX2 ISA: docker build --build-arg VLLM_CPU_AVX2="1" ...
-ARG VLLM_CPU_AVX2=0
-ENV VLLM_CPU_AVX2=${VLLM_CPU_AVX2}
-# Support for cross-compilation with AVX512 ISA: docker build --build-arg VLLM_CPU_AVX512="1" ...
-ARG VLLM_CPU_AVX512=0
-ENV VLLM_CPU_AVX512=${VLLM_CPU_AVX512}
-# Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ...
-ARG VLLM_CPU_AVX512BF16=0
-ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
-# Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ...
-ARG VLLM_CPU_AVX512VNNI=0
-ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
-# Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
-ARG VLLM_CPU_AMXBF16=1
-ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
+# Support for cross-compilation with x86 ISA including AVX2 and AVX512: docker build --build-arg VLLM_CPU_X86="true" ...
+ARG VLLM_CPU_X86=0
+ENV VLLM_CPU_X86=${VLLM_CPU_X86}
 # Support for cross-compilation with ARM BF16 ISA: docker build --build-arg VLLM_CPU_ARM_BF16="true" ...
 ARG VLLM_CPU_ARM_BF16=0
 ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16}
@@ -116,7 +96,7 @@ ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16}
 WORKDIR /vllm-workspace

 # Validate build arguments - prevent mixing incompatible ISA flags
-RUN if [ "$TARGETARCH" = "arm64" ] && { [ "$VLLM_CPU_AVX2" != "0" ] || [ "$VLLM_CPU_AVX512" != "0" ] || [ "$VLLM_CPU_AVX512BF16" != "0" ] || [ "$VLLM_CPU_AVX512VNNI" != "0" ]; }; then \
+RUN if [ "$TARGETARCH" = "arm64" ] && [ "$VLLM_CPU_X86" != "0" ]; then \
        echo "ERROR: Cannot use x86-specific ISA flags (AVX2, AVX512, etc.) when building for ARM64 (--platform=linux/arm64)"; \
        exit 1; \
    fi && \
@@ -174,7 +154,7 @@ WORKDIR /vllm-workspace

 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get install -y --no-install-recommends vim numactl xz-utils make clangd-14
+    apt-get install -y --no-install-recommends vim numactl make clangd-14

 RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd

@@ -232,22 +212,12 @@ LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm"

 # Build configuration labels
 ARG TARGETARCH
-ARG VLLM_CPU_DISABLE_AVX512
-ARG VLLM_CPU_AVX2
-ARG VLLM_CPU_AVX512
-ARG VLLM_CPU_AVX512BF16
-ARG VLLM_CPU_AVX512VNNI
-ARG VLLM_CPU_AMXBF16
+ARG VLLM_CPU_X86
 ARG VLLM_CPU_ARM_BF16
 ARG PYTHON_VERSION

 LABEL ai.vllm.build.target-arch="${TARGETARCH}"
-LABEL ai.vllm.build.cpu-disable-avx512="${VLLM_CPU_DISABLE_AVX512:-false}"
-LABEL ai.vllm.build.cpu-avx2="${VLLM_CPU_AVX2:-false}"
-LABEL ai.vllm.build.cpu-avx512="${VLLM_CPU_AVX512:-false}"
-LABEL ai.vllm.build.cpu-avx512bf16="${VLLM_CPU_AVX512BF16:-false}"
-LABEL ai.vllm.build.cpu-avx512vnni="${VLLM_CPU_AVX512VNNI:-false}"
-LABEL ai.vllm.build.cpu-amxbf16="${VLLM_CPU_AMXBF16:-false}"
+LABEL ai.vllm.build.cpu-x86="${VLLM_CPU_X86:-false}"
 LABEL ai.vllm.build.cpu-arm-bf16="${VLLM_CPU_ARM_BF16:-false}"
 LABEL ai.vllm.build.python-version="${PYTHON_VERSION:-3.12}"

--- a/docs/getting_started/installation/cpu.x86.inc.md
+++ b/docs/getting_started/installation/cpu.x86.inc.md
@@ -7,7 +7,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
 --8<-- [start:requirements]

 - OS: Linux
- CPU flags: `avx512f` (Recommended), `avx512_bf16` (Optional), `avx512_vnni` (Optional)
+- CPU flags: `avx512f` (Recommended), `avx2` (Limited features)

 !!! tip
    Use `lscpu` to check the CPU flags.
@@ -18,7 +18,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
 --8<-- [end:set-up-using-python]
 --8<-- [start:pre-built-wheels]

-Pre-built vLLM wheels for x86 with AVX512 are available since version 0.13.0. To install release wheels:
+Pre-built vLLM wheels for x86 with AVX512/AVX2 are available since version 0.17.0. To install release wheels:

 ```bash
 export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
@@ -108,13 +108,13 @@ VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
 If you want to develop vLLM, install it in editable mode instead.

 ```bash
-VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
+VLLM_TARGET_DEVICE=cpu python3 setup.py develop
 ```

 Optionally, build a portable wheel which you can then install elsewhere:

 ```bash
-VLLM_TARGET_DEVICE=cpu uv build --wheel
+VLLM_TARGET_DEVICE=cpu uv build --wheel --no-build-isolation
 ```

 ```bash
@@ -185,12 +185,9 @@ docker run \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -p 8000:8000 \
    --env "HF_TOKEN=<secret>" \
-vllm/vllm-openai-cpu:latest-x86_64 <args...>
+    vllm/vllm-openai-cpu:latest-x86_64 <args...>
 ```

-!!! warning
-    If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. See the build-image-from-source section below for build arguments to match your target CPU capabilities.
-
 --8<-- [end:pre-built-images]
 --8<-- [start:build-image-from-source]

@@ -198,50 +195,11 @@ vllm/vllm-openai-cpu:latest-x86_64 <args...>

 ```bash
 docker build -f docker/Dockerfile.cpu \
-        --build-arg VLLM_CPU_DISABLE_AVX512=<false (default)|true> \
-        --build-arg VLLM_CPU_AVX2=<false (default)|true> \
-        --build-arg VLLM_CPU_AVX512=<false (default)|true> \
-        --build-arg VLLM_CPU_AVX512BF16=<false (default)|true> \
-        --build-arg VLLM_CPU_AVX512VNNI=<false (default)|true> \
-        --build-arg VLLM_CPU_AMXBF16=<false|true (default)> \
+        --build-arg VLLM_CPU_X86=<false (default)|true> \ # For cross-compilation
        --tag vllm-cpu-env \
        --target vllm-openai .
 ```

-!!! note "Auto-detection by default"
-    By default, CPU instruction sets (AVX512, AVX2, etc.) are automatically detected from the build system's CPU flags. Build arguments like `VLLM_CPU_AVX2`, `VLLM_CPU_AVX512`, `VLLM_CPU_AVX512BF16`, `VLLM_CPU_AVX512VNNI`, and `VLLM_CPU_AMXBF16` are used for cross-compilation:
-
-    - `VLLM_CPU_{ISA}=true` - Force-enable the instruction set (build with ISA regardless of build system capabilities)
-    - `VLLM_CPU_{ISA}=false` - Rely on auto-detection (default)
-
-##### Examples
-
-###### Auto-detection build (default)
-
-```bash
-docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
-```
-
-###### Cross-compile for AVX512
-
-```bash
-docker build -f docker/Dockerfile.cpu \
-        --build-arg VLLM_CPU_AVX512=true \
-        --build-arg VLLM_CPU_AVX512BF16=true \
-        --build-arg VLLM_CPU_AVX512VNNI=true \
-        --tag vllm-cpu-avx512 \
-        --target vllm-openai .
-```
-
-###### Cross-compile for AVX2
-
-```bash
-docker build -f docker/Dockerfile.cpu \
-        --build-arg VLLM_CPU_AVX2=true \
-        --tag vllm-cpu-avx2 \
-        --target vllm-openai .
-```
-
 #### Launching the OpenAI server

 ```bash
--- a/setup.py
+++ b/setup.py
@@ -920,6 +920,7 @@ if _is_cpu():

    if platform.machine() in ("x86_64", "AMD64"):
        ext_modules.append(CMakeExtension(name="vllm._C"))
+        ext_modules.append(CMakeExtension(name="vllm._C_AVX512"))
        ext_modules.append(CMakeExtension(name="vllm._C_AVX2"))
    else:
        ext_modules.append(CMakeExtension(name="vllm._C"))
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -252,6 +252,8 @@ class CpuPlatform(Platform):
        if vllm_config.lora_config is not None:
            compilation_config.mode = CompilationMode.NONE

+        vllm_config.profiler_config.torch_profiler_dump_cuda_time_total = False
+
        assert vllm_config.device_config.device_type == "cpu"

        #
@@ -470,21 +472,32 @@ class CpuPlatform(Platform):
    @classmethod
    def import_kernels(cls) -> None:
        if Platform.get_cpu_architecture() in (CpuArchEnum.X86,):
-            if torch._C._cpu._is_avx512_supported():
-                try:
-                    import vllm._C  # noqa: F401
-                except ImportError as e:
-                    logger.warning("Failed to import from vllm._C: %r", e)
+            # Note: The lib name is _C_AVX2/AVX512, but the module name is _C.
+            # This will cause a exception "dynamic module does define
+            # module export function". But the library is imported
+            # successfully. So ignore the exception for now, until we find
+            # a solution.
+            ignored_msg = "dynamic module does not define module export function"
+            if torch.cpu._is_avx512_supported():
+                if torch.cpu._is_avx512_bf16_supported():
+                    try:
+                        import vllm._C  # noqa: F401
+                    except ImportError as e:
+                        logger.warning("Failed to import from vllm._C: %r", e)
+                else:
+                    try:
+                        import vllm._C_AVX512  # noqa: F401
+                    except ImportError as e:
+                        if ignored_msg not in e.msg:
+                            logger.warning(
+                                "Failed to import from vllm._C_AVX512: %r", e
+                            )
            else:
-                # Note: The lib name is _C_AVX2, but the module name is _C.
-                # This will cause a exception "dynamic module does define
-                # module export function". But the library is imported
-                # successfully. So ignore the exception for now, until we find
-                # a solution.
                try:
                    import vllm._C_AVX2  # noqa: F401
                except ImportError as e:
-                    logger.warning("Failed to import from vllm._C_AVX2: %r", e)
+                    if ignored_msg not in e.msg:
+                        logger.warning("Failed to import from vllm._C_AVX2: %r", e)
        else:
            try:
                import vllm._C  # noqa: F401
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -52,6 +52,21 @@ class CPUWorker(Worker):
            )

    def init_device(self):
+        # Check whether critical libraries are loaded
+        def check_preloaded_libs(name: str):
+            ld_preload_list = os.environ.get("LD_PRELOAD", "")
+            if name not in ld_preload_list:
+                raise RuntimeError(
+                    f"{name} is not found in LD_PRELOAD. "
+                    "Please follow the section `set LD_PRELOAD` in "
+                    "https://docs.vllm.ai/en/latest/getting_started/installation/cpu/ "
+                    "to setup required pre-loaded libraries."
+                )
+
+        check_preloaded_libs("libtcmalloc")
+        if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
+            check_preloaded_libs("libiomp")
+
        # Setup OpenMP threads affinity.
        omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
        # Under numa binding some cores reserved for kv transfer in nixl_connector.py