From 092ace9e3a21f90c9f4aba8defe69ecff4bab628 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Sat, 14 Mar 2026 09:27:29 +0800 Subject: [PATCH] [UX] Improve UX of CPU backend (#36968) Signed-off-by: jiang1.li Signed-off-by: Li, Jiang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .buildkite/hardware_tests/cpu.yaml | 14 ++++ .buildkite/image_build/image_build_cpu.sh | 4 +- .buildkite/release-pipeline.yaml | 4 +- .../hardware_ci/run-cpu-compatibility-test.sh | 65 +++++++++++++++++++ cmake/cpu_extension.cmake | 52 ++++++++++----- docker/Dockerfile.cpu | 48 +++----------- .../installation/cpu.x86.inc.md | 54 ++------------- setup.py | 1 + vllm/platforms/cpu.py | 35 ++++++---- vllm/v1/worker/cpu_worker.py | 15 +++++ 10 files changed, 174 insertions(+), 118 deletions(-) create mode 100755 .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml index b387cf935..5c181943c 100644 --- a/.buildkite/hardware_tests/cpu.yaml +++ b/.buildkite/hardware_tests/cpu.yaml @@ -21,6 +21,20 @@ steps: pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py pytest -x -v -s tests/kernels/test_onednn.py" +- label: CPU-Compatibility Tests + depends_on: [] + soft_fail: true + device: intel_cpu + no_plugin: true + source_file_dependencies: + - cmake/cpu_extension.cmake + - setup.py + - vllm/platforms/cpu.py + commands: + - | + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m " + bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh" + - label: CPU-Language Generation and Pooling Model Tests depends_on: [] soft_fail: true diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh index 2d5e49ecd..ccfe155fa 100755 --- a/.buildkite/image_build/image_build_cpu.sh +++ b/.buildkite/image_build/image_build_cpu.sh @@ -25,9 +25,7 @@ fi docker build --file docker/Dockerfile.cpu \ --build-arg 
max_jobs=16 \ --build-arg buildkite_commit="$BUILDKITE_COMMIT" \ - --build-arg VLLM_CPU_AVX512BF16=true \ - --build-arg VLLM_CPU_AVX512VNNI=true \ - --build-arg VLLM_CPU_AMXBF16=true \ + --build-arg VLLM_CPU_X86=true \ --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \ --target vllm-test \ --progress plain . diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 3f820a74a..001ed2f68 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -83,7 +83,7 @@ steps: agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" @@ -152,7 +152,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." 
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: diff --git a/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh new file mode 100755 index 000000000..232673f01 --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -euox pipefail + +export VLLM_CPU_KVCACHE_SPACE=1 +export VLLM_CPU_CI_ENV=1 +# Reduce sub-processes for acceleration +export TORCH_COMPILE_DISABLE=1 +export VLLM_ENABLE_V1_MULTIPROCESSING=0 + +SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz" +SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217" +wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}" +echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check +mkdir -p sde +tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/ + +wait_for_pid_and_check_log() { + local pid="$1" + local log_file="$2" + local exit_status + + if [ -z "$pid" ] || [ -z "$log_file" ]; then + echo "Usage: wait_for_pid_and_check_log " + return 1 + fi + + echo "Waiting for process $pid to finish..." + + # Use the 'wait' command to pause the script until the specific PID exits. + # The 'wait' command's own exit status will be that of the waited-for process. + if wait "$pid"; then + exit_status=$? + echo "Process $pid finished with exit status $exit_status (Success)." + else + exit_status=$? + echo "Process $pid finished with exit status $exit_status (Failure)." 
+ fi + + if [ "$exit_status" -ne 0 ]; then + echo "Process exited with a non-zero status." + echo "--- Last few lines of log file: $log_file ---" + tail -n 50 "$log_file" + echo "---------------------------------------------" + return 1 # Indicate failure based on exit status + fi + + echo "No errors detected in log file and process exited successfully." + return 0 +} + +# Test Skylake client (AVX2 only; SDE's -skx is the AVX512F Skylake server model) +./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 & +PID_TEST_0=$! + +# Test Cascade Lake (AVX512F + VNNI) +./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 & +PID_TEST_1=$! + +# Test Cooper Lake (AVX512F + VNNI + BF16) +./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 & +PID_TEST_2=$! + +wait_for_pid_and_check_log $PID_TEST_0 test_0.log +wait_for_pid_and_check_log $PID_TEST_1 test_1.log +wait_for_pid_and_check_log $PID_TEST_2 test_2.log diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 1d5e223fa..8d74d6d5d 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -102,11 +102,13 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA) "-mavx512f" "-mavx512vl" "-mavx512bw" - "-mavx512dq" - "-mavx512bf16" - "-mavx512vnni" + "-mavx512dq") + list(APPEND CXX_COMPILE_FLAGS_AVX512_AMX + ${CXX_COMPILE_FLAGS_AVX512} "-mamx-bf16" - "-mamx-tile") + "-mamx-tile" + "-mavx512bf16" + "-mavx512vnni") list(APPEND CXX_COMPILE_FLAGS_AVX2 "-mavx2") elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) @@ -314,7 +316,8 @@ endif() # TODO: Refactor this if (ENABLE_X86_ISA) - message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}") + message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) compile flags: ${CXX_COMPILE_FLAGS_AVX512_AMX}") + message(STATUS "CPU 
extension (AVX512F) compile flags: ${CXX_COMPILE_FLAGS_AVX512}") message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}") else() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") @@ -366,13 +369,15 @@ if(USE_ONEDNN) endif() if (ENABLE_X86_ISA) - set(VLLM_EXT_SRC_AVX512 + set(VLLM_EXT_SRC_SGL "csrc/cpu/sgl-kernels/gemm.cpp" "csrc/cpu/sgl-kernels/gemm_int8.cpp" "csrc/cpu/sgl-kernels/gemm_fp8.cpp" "csrc/cpu/sgl-kernels/moe.cpp" "csrc/cpu/sgl-kernels/moe_int8.cpp" - "csrc/cpu/sgl-kernels/moe_fp8.cpp" + "csrc/cpu/sgl-kernels/moe_fp8.cpp") + + set(VLLM_EXT_SRC_AVX512 "csrc/cpu/shm.cpp" "csrc/cpu/cpu_wna16.cpp" "csrc/cpu/cpu_fused_moe.cpp" @@ -398,31 +403,48 @@ if (ENABLE_X86_ISA) "csrc/cpu/pos_encoding.cpp" "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") - message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}") + message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) source files: ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}") + message(STATUS "CPU extension (AVX512F) source files: ${VLLM_EXT_SRC_AVX512}") message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}") + set(_C_LIBS numa dnnl_ext) + set(_C_AVX512_LIBS numa dnnl_ext) + set(_C_AVX2_LIBS numa) + + # AMX + AVX512F + AVX512BF16 + AVX512VNNI define_extension_target( _C DESTINATION vllm LANGUAGE CXX + SOURCES ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL} + LIBRARIES ${_C_LIBS} + COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512_AMX} + USE_SABI 3 + WITH_SOABI + ) + + # For AMX kernels + target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16") + + # AVX512F + define_extension_target( + _C_AVX512 + DESTINATION vllm + LANGUAGE CXX SOURCES ${VLLM_EXT_SRC_AVX512} - LIBRARIES ${LIBS} + LIBRARIES ${_C_AVX512_LIBS} COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512} USE_SABI 3 WITH_SOABI ) - # For SGL kernels - target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512") - # For AMX kernels - target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16") - + 
# AVX2 define_extension_target( _C_AVX2 DESTINATION vllm LANGUAGE CXX SOURCES ${VLLM_EXT_SRC_AVX2} - LIBRARIES ${LIBS} + LIBRARIES ${_C_AVX2_LIBS} COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2} USE_SABI 3 WITH_SOABI diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index d81957e02..8a1da6897 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -14,12 +14,7 @@ # # Build arguments: # PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10 -# VLLM_CPU_DISABLE_AVX512=false (default)|true -# VLLM_CPU_AVX2=false (default)|true (for cross-compilation) -# VLLM_CPU_AVX512=false (default)|true (for cross-compilation) -# VLLM_CPU_AVX512BF16=false (default)|true (for cross-compilation) -# VLLM_CPU_AVX512VNNI=false (default)|true (for cross-compilation) -# VLLM_CPU_AMXBF16=false (default)|true (for cross-compilation) +# VLLM_CPU_X86=false (default)|true (for cross-compilation) # VLLM_CPU_ARM_BF16=false (default)|true (for cross-compilation) # @@ -36,7 +31,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ apt-get update -y \ && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \ - gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \ + gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof xz-utils \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ && curl -LsSf https://astral.sh/uv/install.sh | sh @@ -91,24 +86,9 @@ ARG max_jobs=32 ENV MAX_JOBS=${max_jobs} ARG GIT_REPO_CHECK=0 -# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... -ARG VLLM_CPU_DISABLE_AVX512=0 -ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} -# Support for cross-compilation with AVX2 ISA: docker build --build-arg VLLM_CPU_AVX2="1" ... 
-ARG VLLM_CPU_AVX2=0 -ENV VLLM_CPU_AVX2=${VLLM_CPU_AVX2} -# Support for cross-compilation with AVX512 ISA: docker build --build-arg VLLM_CPU_AVX512="1" ... -ARG VLLM_CPU_AVX512=0 -ENV VLLM_CPU_AVX512=${VLLM_CPU_AVX512} -# Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ... -ARG VLLM_CPU_AVX512BF16=0 -ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16} -# Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ... -ARG VLLM_CPU_AVX512VNNI=0 -ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI} -# Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ... -ARG VLLM_CPU_AMXBF16=1 -ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16} +# Support for cross-compilation with x86 ISA including AVX2 and AVX512: docker build --build-arg VLLM_CPU_X86="true" ... +ARG VLLM_CPU_X86=0 +ENV VLLM_CPU_X86=${VLLM_CPU_X86} # Support for cross-compilation with ARM BF16 ISA: docker build --build-arg VLLM_CPU_ARM_BF16="true" ... ARG VLLM_CPU_ARM_BF16=0 ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16} @@ -116,7 +96,7 @@ ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16} WORKDIR /vllm-workspace # Validate build arguments - prevent mixing incompatible ISA flags -RUN if [ "$TARGETARCH" = "arm64" ] && { [ "$VLLM_CPU_AVX2" != "0" ] || [ "$VLLM_CPU_AVX512" != "0" ] || [ "$VLLM_CPU_AVX512BF16" != "0" ] || [ "$VLLM_CPU_AVX512VNNI" != "0" ]; }; then \ +RUN if [ "$TARGETARCH" = "arm64" ] && [ "$VLLM_CPU_X86" != "0" ]; then \ echo "ERROR: Cannot use x86-specific ISA flags (AVX2, AVX512, etc.) 
when building for ARM64 (--platform=linux/arm64)"; \ exit 1; \ fi && \ @@ -174,7 +154,7 @@ WORKDIR /vllm-workspace RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ - apt-get install -y --no-install-recommends vim numactl xz-utils make clangd-14 + apt-get install -y --no-install-recommends vim numactl make clangd-14 RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd @@ -232,22 +212,12 @@ LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm" # Build configuration labels ARG TARGETARCH -ARG VLLM_CPU_DISABLE_AVX512 -ARG VLLM_CPU_AVX2 -ARG VLLM_CPU_AVX512 -ARG VLLM_CPU_AVX512BF16 -ARG VLLM_CPU_AVX512VNNI -ARG VLLM_CPU_AMXBF16 +ARG VLLM_CPU_X86 ARG VLLM_CPU_ARM_BF16 ARG PYTHON_VERSION LABEL ai.vllm.build.target-arch="${TARGETARCH}" -LABEL ai.vllm.build.cpu-disable-avx512="${VLLM_CPU_DISABLE_AVX512:-false}" -LABEL ai.vllm.build.cpu-avx2="${VLLM_CPU_AVX2:-false}" -LABEL ai.vllm.build.cpu-avx512="${VLLM_CPU_AVX512:-false}" -LABEL ai.vllm.build.cpu-avx512bf16="${VLLM_CPU_AVX512BF16:-false}" -LABEL ai.vllm.build.cpu-avx512vnni="${VLLM_CPU_AVX512VNNI:-false}" -LABEL ai.vllm.build.cpu-amxbf16="${VLLM_CPU_AMXBF16:-false}" +LABEL ai.vllm.build.cpu-x86="${VLLM_CPU_X86:-false}" LABEL ai.vllm.build.cpu-arm-bf16="${VLLM_CPU_ARM_BF16:-false}" LABEL ai.vllm.build.python-version="${PYTHON_VERSION:-3.12}" diff --git a/docs/getting_started/installation/cpu.x86.inc.md b/docs/getting_started/installation/cpu.x86.inc.md index 45278756b..8b855e919 100644 --- a/docs/getting_started/installation/cpu.x86.inc.md +++ b/docs/getting_started/installation/cpu.x86.inc.md @@ -7,7 +7,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data --8<-- [start:requirements] - OS: Linux -- CPU flags: `avx512f` (Recommended), `avx512_bf16` (Optional), `avx512_vnni` (Optional) +- CPU flags: `avx512f` (Recommended), `avx2` (Limited features) !!! tip Use `lscpu` to check the CPU flags. 
@@ -18,7 +18,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data --8<-- [end:set-up-using-python] --8<-- [start:pre-built-wheels] -Pre-built vLLM wheels for x86 with AVX512 are available since version 0.13.0. To install release wheels: +Pre-built vLLM wheels for x86 with AVX512/AVX2 are available since version 0.17.0. To install release wheels: ```bash export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') @@ -108,13 +108,13 @@ VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation If you want to develop vLLM, install it in editable mode instead. ```bash -VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation +VLLM_TARGET_DEVICE=cpu python3 setup.py develop ``` Optionally, build a portable wheel which you can then install elsewhere: ```bash -VLLM_TARGET_DEVICE=cpu uv build --wheel +VLLM_TARGET_DEVICE=cpu uv build --wheel --no-build-isolation ``` ```bash @@ -185,12 +185,9 @@ docker run \ -v ~/.cache/huggingface:/root/.cache/huggingface \ -p 8000:8000 \ --env "HF_TOKEN=" \ -vllm/vllm-openai-cpu:latest-x86_64 + vllm/vllm-openai-cpu:latest-x86_64 ``` -!!! warning - If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. See the build-image-from-source section below for build arguments to match your target CPU capabilities. - --8<-- [end:pre-built-images] --8<-- [start:build-image-from-source] @@ -198,50 +195,11 @@ vllm/vllm-openai-cpu:latest-x86_64 ```bash docker build -f docker/Dockerfile.cpu \ - --build-arg VLLM_CPU_DISABLE_AVX512= \ - --build-arg VLLM_CPU_AVX2= \ - --build-arg VLLM_CPU_AVX512= \ - --build-arg VLLM_CPU_AVX512BF16= \ - --build-arg VLLM_CPU_AVX512VNNI= \ - --build-arg VLLM_CPU_AMXBF16= \ + --build-arg VLLM_CPU_X86=<false (default)|true (for cross-compilation)> \ --tag vllm-cpu-env \ --target vllm-openai . ``` -!!! 
note "Auto-detection by default" - By default, CPU instruction sets (AVX512, AVX2, etc.) are automatically detected from the build system's CPU flags. Build arguments like `VLLM_CPU_AVX2`, `VLLM_CPU_AVX512`, `VLLM_CPU_AVX512BF16`, `VLLM_CPU_AVX512VNNI`, and `VLLM_CPU_AMXBF16` are used for cross-compilation: - - - `VLLM_CPU_{ISA}=true` - Force-enable the instruction set (build with ISA regardless of build system capabilities) - - `VLLM_CPU_{ISA}=false` - Rely on auto-detection (default) - -##### Examples - -###### Auto-detection build (default) - -```bash -docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai . -``` - -###### Cross-compile for AVX512 - -```bash -docker build -f docker/Dockerfile.cpu \ - --build-arg VLLM_CPU_AVX512=true \ - --build-arg VLLM_CPU_AVX512BF16=true \ - --build-arg VLLM_CPU_AVX512VNNI=true \ - --tag vllm-cpu-avx512 \ - --target vllm-openai . -``` - -###### Cross-compile for AVX2 - -```bash -docker build -f docker/Dockerfile.cpu \ - --build-arg VLLM_CPU_AVX2=true \ - --tag vllm-cpu-avx2 \ - --target vllm-openai . 
-``` - #### Launching the OpenAI server ```bash diff --git a/setup.py b/setup.py index fa13fff4e..32d04d578 100644 --- a/setup.py +++ b/setup.py @@ -920,6 +920,7 @@ if _is_cpu(): if platform.machine() in ("x86_64", "AMD64"): ext_modules.append(CMakeExtension(name="vllm._C")) + ext_modules.append(CMakeExtension(name="vllm._C_AVX512")) ext_modules.append(CMakeExtension(name="vllm._C_AVX2")) else: ext_modules.append(CMakeExtension(name="vllm._C")) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index fbb3ebeac..b3a616eeb 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -252,6 +252,8 @@ class CpuPlatform(Platform): if vllm_config.lora_config is not None: compilation_config.mode = CompilationMode.NONE + vllm_config.profiler_config.torch_profiler_dump_cuda_time_total = False + assert vllm_config.device_config.device_type == "cpu" # @@ -470,21 +472,32 @@ class CpuPlatform(Platform): @classmethod def import_kernels(cls) -> None: if Platform.get_cpu_architecture() in (CpuArchEnum.X86,): - if torch._C._cpu._is_avx512_supported(): - try: - import vllm._C # noqa: F401 - except ImportError as e: - logger.warning("Failed to import from vllm._C: %r", e) + # Note: The lib name is _C_AVX2/_C_AVX512, but the module name is _C. + # This will cause an exception "dynamic module does not define + # module export function". But the library is imported + # successfully. So ignore the exception for now, until we find + # a solution. + ignored_msg = "dynamic module does not define module export function" + if torch.cpu._is_avx512_supported(): + if torch.cpu._is_avx512_bf16_supported(): + try: + import vllm._C # noqa: F401 + except ImportError as e: + logger.warning("Failed to import from vllm._C: %r", e) + else: + try: + import vllm._C_AVX512 # noqa: F401 + except ImportError as e: + if ignored_msg not in e.msg: + logger.warning( + "Failed to import from vllm._C_AVX512: %r", e + ) else: - # Note: The lib name is _C_AVX2, but the module name is _C. 
- # This will cause a exception "dynamic module does define - # module export function". But the library is imported - # successfully. So ignore the exception for now, until we find - # a solution. try: import vllm._C_AVX2 # noqa: F401 except ImportError as e: - logger.warning("Failed to import from vllm._C_AVX2: %r", e) + if ignored_msg not in e.msg: + logger.warning("Failed to import from vllm._C_AVX2: %r", e) else: try: import vllm._C # noqa: F401 diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index c4e4783a6..a24553c5c 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -52,6 +52,21 @@ class CPUWorker(Worker): ) def init_device(self): + # Check whether critical libraries are loaded + def check_preloaded_libs(name: str): + ld_preload_list = os.environ.get("LD_PRELOAD", "") + if name not in ld_preload_list: + raise RuntimeError( + f"{name} is not found in LD_PRELOAD. " + "Please follow the section `set LD_PRELOAD` in " + "https://docs.vllm.ai/en/latest/getting_started/installation/cpu/ " + "to setup required pre-loaded libraries." + ) + + check_preloaded_libs("libtcmalloc") + if current_platform.get_cpu_architecture() == CpuArchEnum.X86: + check_preloaded_libs("libiomp") + # Setup OpenMP threads affinity. omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND # Under numa binding some cores reserved for kv transfer in nixl_connector.py