[UX] Improve UX of CPU backend (#36968)
Signed-off-by: jiang1.li <jiang1.li@intel.com> Signed-off-by: Li, Jiang <bigpyj64@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -21,6 +21,20 @@ steps:
|
||||
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
||||
|
||||
- label: CPU-Compatibility Tests
|
||||
depends_on: []
|
||||
soft_fail: true
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- cmake/cpu_extension.cmake
|
||||
- setup.py
|
||||
- vllm/platforms/cpu.py
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
|
||||
|
||||
- label: CPU-Language Generation and Pooling Model Tests
|
||||
depends_on: []
|
||||
soft_fail: true
|
||||
|
||||
@@ -25,9 +25,7 @@ fi
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||
--build-arg VLLM_CPU_AVX512BF16=true \
|
||||
--build-arg VLLM_CPU_AVX512VNNI=true \
|
||||
--build-arg VLLM_CPU_AMXBF16=true \
|
||||
--build-arg VLLM_CPU_X86=true \
|
||||
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
|
||||
--target vllm-test \
|
||||
--progress plain .
|
||||
|
||||
@@ -83,7 +83,7 @@ steps:
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
@@ -152,7 +152,7 @@ steps:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
|
||||
65
.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
Executable file
65
.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
|
||||
set -euox pipefail
|
||||
|
||||
export VLLM_CPU_KVCACHE_SPACE=1
|
||||
export VLLM_CPU_CI_ENV=1
|
||||
# Reduce sub-processes for acceleration
|
||||
export TORCH_COMPILE_DISABLE=1
|
||||
export VLLM_ENABLE_V1_MULTIPROCESSING=0
|
||||
|
||||
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
|
||||
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
|
||||
wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
|
||||
echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check
|
||||
mkdir -p sde
|
||||
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
|
||||
|
||||
# Wait for a background job of this shell and report how it ended.
#
# Arguments:
#   $1 - PID of the background process to wait for
#   $2 - path of the log file that captured the process output
# Returns:
#   0 if the process exited with status 0.
#   1 if an argument is missing, or if the process exited non-zero
#     (in which case the last 50 lines of the log are printed first).
wait_for_pid_and_check_log() {
    # Default to empty so the usage check below stays reachable under `set -u`.
    local pid="${1:-}"
    local log_file="${2:-}"
    local exit_status=0

    if [ -z "$pid" ] || [ -z "$log_file" ]; then
        echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
        return 1
    fi

    echo "Waiting for process $pid to finish..."

    # `wait` returns the exit status of the waited-for process. Capturing it
    # through `||` keeps `set -e` from aborting the script on a non-zero status.
    wait "$pid" || exit_status=$?

    if [ "$exit_status" -ne 0 ]; then
        echo "Process $pid finished with exit status $exit_status (Failure)."
        echo "Process exited with a non-zero status."
        echo "--- Last few lines of log file: $log_file ---"
        # A missing/unreadable log must not abort the script before we
        # finish reporting and return the failure to the caller.
        tail -n 50 "$log_file" || echo "(unable to read $log_file)"
        echo "---------------------------------------------"
        return 1 # Indicate failure based on exit status
    fi

    echo "Process $pid finished with exit status $exit_status (Success)."
    # Only the exit status is checked; the log content is not scanned.
    echo "Process exited successfully."
    return 0
}
|
||||
|
||||
# Test Skylake (AVX512F)
|
||||
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
|
||||
PID_TEST_0=$!
|
||||
|
||||
# Test Cascade Lake (AVX512F + VNNI)
|
||||
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
|
||||
PID_TEST_1=$!
|
||||
|
||||
# Test Cooper Lake (AVX512F + VNNI + BF16)
|
||||
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
|
||||
PID_TEST_2=$!
|
||||
|
||||
wait_for_pid_and_check_log $PID_TEST_0 test_0.log
|
||||
wait_for_pid_and_check_log $PID_TEST_1 test_1.log
|
||||
wait_for_pid_and_check_log $PID_TEST_2 test_2.log
|
||||
@@ -102,11 +102,13 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
|
||||
"-mavx512f"
|
||||
"-mavx512vl"
|
||||
"-mavx512bw"
|
||||
"-mavx512dq"
|
||||
"-mavx512bf16"
|
||||
"-mavx512vnni"
|
||||
"-mavx512dq")
|
||||
list(APPEND CXX_COMPILE_FLAGS_AVX512_AMX
|
||||
${CXX_COMPILE_FLAGS_AVX512}
|
||||
"-mamx-bf16"
|
||||
"-mamx-tile")
|
||||
"-mamx-tile"
|
||||
"-mavx512bf16"
|
||||
"-mavx512vnni")
|
||||
list(APPEND CXX_COMPILE_FLAGS_AVX2
|
||||
"-mavx2")
|
||||
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
|
||||
@@ -314,7 +316,8 @@ endif()
|
||||
|
||||
# TODO: Refactor this
|
||||
if (ENABLE_X86_ISA)
|
||||
message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
|
||||
message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) compile flags: ${CXX_COMPILE_FLAGS_AVX512_AMX}")
|
||||
message(STATUS "CPU extension (AVX512F) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
|
||||
message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
|
||||
else()
|
||||
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
||||
@@ -366,13 +369,15 @@ if(USE_ONEDNN)
|
||||
endif()
|
||||
|
||||
if (ENABLE_X86_ISA)
|
||||
set(VLLM_EXT_SRC_AVX512
|
||||
set(VLLM_EXT_SRC_SGL
|
||||
"csrc/cpu/sgl-kernels/gemm.cpp"
|
||||
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
|
||||
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe_int8.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe_fp8.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe_fp8.cpp")
|
||||
|
||||
set(VLLM_EXT_SRC_AVX512
|
||||
"csrc/cpu/shm.cpp"
|
||||
"csrc/cpu/cpu_wna16.cpp"
|
||||
"csrc/cpu/cpu_fused_moe.cpp"
|
||||
@@ -398,31 +403,48 @@ if (ENABLE_X86_ISA)
|
||||
"csrc/cpu/pos_encoding.cpp"
|
||||
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
|
||||
|
||||
message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}")
|
||||
message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) source files: ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}")
|
||||
message(STATUS "CPU extension (AVX512F) source files: ${VLLM_EXT_SRC_AVX512}")
|
||||
message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
|
||||
|
||||
set(_C_LIBS numa dnnl_ext)
|
||||
set(_C_AVX512_LIBS numa dnnl_ext)
|
||||
set(_C_AVX2_LIBS numa)
|
||||
|
||||
# AMX + AVX512F + AVX512BF16 + AVX512VNNI
|
||||
define_extension_target(
|
||||
_C
|
||||
DESTINATION vllm
|
||||
LANGUAGE CXX
|
||||
SOURCES ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}
|
||||
LIBRARIES ${_C_LIBS}
|
||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512_AMX}
|
||||
USE_SABI 3
|
||||
WITH_SOABI
|
||||
)
|
||||
|
||||
# For AMX kernels
|
||||
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
|
||||
|
||||
# AVX512F
|
||||
define_extension_target(
|
||||
_C_AVX512
|
||||
DESTINATION vllm
|
||||
LANGUAGE CXX
|
||||
SOURCES ${VLLM_EXT_SRC_AVX512}
|
||||
LIBRARIES ${LIBS}
|
||||
LIBRARIES ${_C_AVX512_LIBS}
|
||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
|
||||
USE_SABI 3
|
||||
WITH_SOABI
|
||||
)
|
||||
|
||||
# For SGL kernels
|
||||
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
|
||||
# For AMX kernels
|
||||
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
|
||||
|
||||
# AVX2
|
||||
define_extension_target(
|
||||
_C_AVX2
|
||||
DESTINATION vllm
|
||||
LANGUAGE CXX
|
||||
SOURCES ${VLLM_EXT_SRC_AVX2}
|
||||
LIBRARIES ${LIBS}
|
||||
LIBRARIES ${_C_AVX2_LIBS}
|
||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
|
||||
USE_SABI 3
|
||||
WITH_SOABI
|
||||
|
||||
@@ -14,12 +14,7 @@
|
||||
#
|
||||
# Build arguments:
|
||||
# PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
|
||||
# VLLM_CPU_DISABLE_AVX512=false (default)|true
|
||||
# VLLM_CPU_AVX2=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_AVX512=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_AVX512BF16=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_AVX512VNNI=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_AMXBF16=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_X86=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_ARM_BF16=false (default)|true (for cross-compilation)
|
||||
#
|
||||
|
||||
@@ -36,7 +31,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||
apt-get update -y \
|
||||
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
|
||||
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
|
||||
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof xz-utils \
|
||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
@@ -91,24 +86,9 @@ ARG max_jobs=32
|
||||
ENV MAX_JOBS=${max_jobs}
|
||||
|
||||
ARG GIT_REPO_CHECK=0
|
||||
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
|
||||
ARG VLLM_CPU_DISABLE_AVX512=0
|
||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
||||
# Support for cross-compilation with AVX2 ISA: docker build --build-arg VLLM_CPU_AVX2="1" ...
|
||||
ARG VLLM_CPU_AVX2=0
|
||||
ENV VLLM_CPU_AVX2=${VLLM_CPU_AVX2}
|
||||
# Support for cross-compilation with AVX512 ISA: docker build --build-arg VLLM_CPU_AVX512="1" ...
|
||||
ARG VLLM_CPU_AVX512=0
|
||||
ENV VLLM_CPU_AVX512=${VLLM_CPU_AVX512}
|
||||
# Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ...
|
||||
ARG VLLM_CPU_AVX512BF16=0
|
||||
ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
|
||||
# Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ...
|
||||
ARG VLLM_CPU_AVX512VNNI=0
|
||||
ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
|
||||
# Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
|
||||
ARG VLLM_CPU_AMXBF16=1
|
||||
ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
|
||||
# Support for cross-compilation with x86 ISA including AVX2 and AVX512: docker build --build-arg VLLM_CPU_X86="true" ...
|
||||
ARG VLLM_CPU_X86=0
|
||||
ENV VLLM_CPU_X86=${VLLM_CPU_X86}
|
||||
# Support for cross-compilation with ARM BF16 ISA: docker build --build-arg VLLM_CPU_ARM_BF16="true" ...
|
||||
ARG VLLM_CPU_ARM_BF16=0
|
||||
ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16}
|
||||
@@ -116,7 +96,7 @@ ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16}
|
||||
WORKDIR /vllm-workspace
|
||||
|
||||
# Validate build arguments - prevent mixing incompatible ISA flags
|
||||
RUN if [ "$TARGETARCH" = "arm64" ] && { [ "$VLLM_CPU_AVX2" != "0" ] || [ "$VLLM_CPU_AVX512" != "0" ] || [ "$VLLM_CPU_AVX512BF16" != "0" ] || [ "$VLLM_CPU_AVX512VNNI" != "0" ]; }; then \
|
||||
RUN if [ "$TARGETARCH" = "arm64" ] && [ "$VLLM_CPU_X86" != "0" ]; then \
|
||||
echo "ERROR: Cannot use x86-specific ISA flags (AVX2, AVX512, etc.) when building for ARM64 (--platform=linux/arm64)"; \
|
||||
exit 1; \
|
||||
fi && \
|
||||
@@ -174,7 +154,7 @@ WORKDIR /vllm-workspace
|
||||
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||
apt-get install -y --no-install-recommends vim numactl xz-utils make clangd-14
|
||||
apt-get install -y --no-install-recommends vim numactl make clangd-14
|
||||
|
||||
RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd
|
||||
|
||||
@@ -232,22 +212,12 @@ LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm"
|
||||
|
||||
# Build configuration labels
|
||||
ARG TARGETARCH
|
||||
ARG VLLM_CPU_DISABLE_AVX512
|
||||
ARG VLLM_CPU_AVX2
|
||||
ARG VLLM_CPU_AVX512
|
||||
ARG VLLM_CPU_AVX512BF16
|
||||
ARG VLLM_CPU_AVX512VNNI
|
||||
ARG VLLM_CPU_AMXBF16
|
||||
ARG VLLM_CPU_X86
|
||||
ARG VLLM_CPU_ARM_BF16
|
||||
ARG PYTHON_VERSION
|
||||
|
||||
LABEL ai.vllm.build.target-arch="${TARGETARCH}"
|
||||
LABEL ai.vllm.build.cpu-disable-avx512="${VLLM_CPU_DISABLE_AVX512:-false}"
|
||||
LABEL ai.vllm.build.cpu-avx2="${VLLM_CPU_AVX2:-false}"
|
||||
LABEL ai.vllm.build.cpu-avx512="${VLLM_CPU_AVX512:-false}"
|
||||
LABEL ai.vllm.build.cpu-avx512bf16="${VLLM_CPU_AVX512BF16:-false}"
|
||||
LABEL ai.vllm.build.cpu-avx512vnni="${VLLM_CPU_AVX512VNNI:-false}"
|
||||
LABEL ai.vllm.build.cpu-amxbf16="${VLLM_CPU_AMXBF16:-false}"
|
||||
LABEL ai.vllm.build.cpu-x86="${VLLM_CPU_X86:-false}"
|
||||
LABEL ai.vllm.build.cpu-arm-bf16="${VLLM_CPU_ARM_BF16:-false}"
|
||||
LABEL ai.vllm.build.python-version="${PYTHON_VERSION:-3.12}"
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
|
||||
--8<-- [start:requirements]
|
||||
|
||||
- OS: Linux
|
||||
- CPU flags: `avx512f` (Recommended), `avx512_bf16` (Optional), `avx512_vnni` (Optional)
|
||||
- CPU flags: `avx512f` (Recommended), `avx2` (Limited features)
|
||||
|
||||
!!! tip
|
||||
Use `lscpu` to check the CPU flags.
|
||||
@@ -18,7 +18,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
|
||||
--8<-- [end:set-up-using-python]
|
||||
--8<-- [start:pre-built-wheels]
|
||||
|
||||
Pre-built vLLM wheels for x86 with AVX512 are available since version 0.13.0. To install release wheels:
|
||||
Pre-built vLLM wheels for x86 with AVX512/AVX2 are available since version 0.17.0. To install release wheels:
|
||||
|
||||
```bash
|
||||
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
|
||||
@@ -108,13 +108,13 @@ VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
|
||||
If you want to develop vLLM, install it in editable mode instead.
|
||||
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
|
||||
```
|
||||
|
||||
Optionally, build a portable wheel which you can then install elsewhere:
|
||||
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=cpu uv build --wheel
|
||||
VLLM_TARGET_DEVICE=cpu uv build --wheel --no-build-isolation
|
||||
```
|
||||
|
||||
```bash
|
||||
@@ -185,12 +185,9 @@ docker run \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
-p 8000:8000 \
|
||||
--env "HF_TOKEN=<secret>" \
|
||||
vllm/vllm-openai-cpu:latest-x86_64 <args...>
|
||||
vllm/vllm-openai-cpu:latest-x86_64 <args...>
|
||||
```
|
||||
|
||||
!!! warning
|
||||
If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. See the build-image-from-source section below for build arguments to match your target CPU capabilities.
|
||||
|
||||
--8<-- [end:pre-built-images]
|
||||
--8<-- [start:build-image-from-source]
|
||||
|
||||
@@ -198,50 +195,11 @@ vllm/vllm-openai-cpu:latest-x86_64 <args...>
|
||||
|
||||
```bash
|
||||
docker build -f docker/Dockerfile.cpu \
|
||||
--build-arg VLLM_CPU_DISABLE_AVX512=<false (default)|true> \
|
||||
--build-arg VLLM_CPU_AVX2=<false (default)|true> \
|
||||
--build-arg VLLM_CPU_AVX512=<false (default)|true> \
|
||||
--build-arg VLLM_CPU_AVX512BF16=<false (default)|true> \
|
||||
--build-arg VLLM_CPU_AVX512VNNI=<false (default)|true> \
|
||||
--build-arg VLLM_CPU_AMXBF16=<false|true (default)> \
|
||||
--build-arg VLLM_CPU_X86=<false (default)|true> \ # For cross-compilation
|
||||
--tag vllm-cpu-env \
|
||||
--target vllm-openai .
|
||||
```
|
||||
|
||||
!!! note "Auto-detection by default"
|
||||
By default, CPU instruction sets (AVX512, AVX2, etc.) are automatically detected from the build system's CPU flags. Build arguments like `VLLM_CPU_AVX2`, `VLLM_CPU_AVX512`, `VLLM_CPU_AVX512BF16`, `VLLM_CPU_AVX512VNNI`, and `VLLM_CPU_AMXBF16` are used for cross-compilation:
|
||||
|
||||
- `VLLM_CPU_{ISA}=true` - Force-enable the instruction set (build with ISA regardless of build system capabilities)
|
||||
- `VLLM_CPU_{ISA}=false` - Rely on auto-detection (default)
|
||||
|
||||
##### Examples
|
||||
|
||||
###### Auto-detection build (default)
|
||||
|
||||
```bash
|
||||
docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
|
||||
```
|
||||
|
||||
###### Cross-compile for AVX512
|
||||
|
||||
```bash
|
||||
docker build -f docker/Dockerfile.cpu \
|
||||
--build-arg VLLM_CPU_AVX512=true \
|
||||
--build-arg VLLM_CPU_AVX512BF16=true \
|
||||
--build-arg VLLM_CPU_AVX512VNNI=true \
|
||||
--tag vllm-cpu-avx512 \
|
||||
--target vllm-openai .
|
||||
```
|
||||
|
||||
###### Cross-compile for AVX2
|
||||
|
||||
```bash
|
||||
docker build -f docker/Dockerfile.cpu \
|
||||
--build-arg VLLM_CPU_AVX2=true \
|
||||
--tag vllm-cpu-avx2 \
|
||||
--target vllm-openai .
|
||||
```
|
||||
|
||||
#### Launching the OpenAI server
|
||||
|
||||
```bash
|
||||
|
||||
1
setup.py
1
setup.py
@@ -920,6 +920,7 @@ if _is_cpu():
|
||||
|
||||
if platform.machine() in ("x86_64", "AMD64"):
|
||||
ext_modules.append(CMakeExtension(name="vllm._C"))
|
||||
ext_modules.append(CMakeExtension(name="vllm._C_AVX512"))
|
||||
ext_modules.append(CMakeExtension(name="vllm._C_AVX2"))
|
||||
else:
|
||||
ext_modules.append(CMakeExtension(name="vllm._C"))
|
||||
|
||||
@@ -252,6 +252,8 @@ class CpuPlatform(Platform):
|
||||
if vllm_config.lora_config is not None:
|
||||
compilation_config.mode = CompilationMode.NONE
|
||||
|
||||
vllm_config.profiler_config.torch_profiler_dump_cuda_time_total = False
|
||||
|
||||
assert vllm_config.device_config.device_type == "cpu"
|
||||
|
||||
#
|
||||
@@ -470,21 +472,32 @@ class CpuPlatform(Platform):
|
||||
@classmethod
|
||||
def import_kernels(cls) -> None:
|
||||
if Platform.get_cpu_architecture() in (CpuArchEnum.X86,):
|
||||
if torch._C._cpu._is_avx512_supported():
|
||||
try:
|
||||
import vllm._C # noqa: F401
|
||||
except ImportError as e:
|
||||
logger.warning("Failed to import from vllm._C: %r", e)
|
||||
# Note: The lib name is _C_AVX2/AVX512, but the module name is _C.
|
||||
# This will cause an exception "dynamic module does not define
|
||||
# module export function". But the library is imported
|
||||
# successfully. So ignore the exception for now, until we find
|
||||
# a solution.
|
||||
ignored_msg = "dynamic module does not define module export function"
|
||||
if torch.cpu._is_avx512_supported():
|
||||
if torch.cpu._is_avx512_bf16_supported():
|
||||
try:
|
||||
import vllm._C # noqa: F401
|
||||
except ImportError as e:
|
||||
logger.warning("Failed to import from vllm._C: %r", e)
|
||||
else:
|
||||
try:
|
||||
import vllm._C_AVX512 # noqa: F401
|
||||
except ImportError as e:
|
||||
if ignored_msg not in e.msg:
|
||||
logger.warning(
|
||||
"Failed to import from vllm._C_AVX512: %r", e
|
||||
)
|
||||
else:
|
||||
# Note: The lib name is _C_AVX2, but the module name is _C.
|
||||
# This will cause an exception "dynamic module does not define
|
||||
# module export function". But the library is imported
|
||||
# successfully. So ignore the exception for now, until we find
|
||||
# a solution.
|
||||
try:
|
||||
import vllm._C_AVX2 # noqa: F401
|
||||
except ImportError as e:
|
||||
logger.warning("Failed to import from vllm._C_AVX2: %r", e)
|
||||
if ignored_msg not in e.msg:
|
||||
logger.warning("Failed to import from vllm._C_AVX2: %r", e)
|
||||
else:
|
||||
try:
|
||||
import vllm._C # noqa: F401
|
||||
|
||||
@@ -52,6 +52,21 @@ class CPUWorker(Worker):
|
||||
)
|
||||
|
||||
def init_device(self):
|
||||
# Check whether critical libraries are loaded
|
||||
def check_preloaded_libs(name: str):
|
||||
ld_preload_list = os.environ.get("LD_PRELOAD", "")
|
||||
if name not in ld_preload_list:
|
||||
raise RuntimeError(
|
||||
f"{name} is not found in LD_PRELOAD. "
|
||||
"Please follow the section `set LD_PRELOAD` in "
|
||||
"https://docs.vllm.ai/en/latest/getting_started/installation/cpu/ "
|
||||
"to setup required pre-loaded libraries."
|
||||
)
|
||||
|
||||
check_preloaded_libs("libtcmalloc")
|
||||
if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
|
||||
check_preloaded_libs("libiomp")
|
||||
|
||||
# Setup OpenMP threads affinity.
|
||||
omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
|
||||
# Under NUMA binding, some cores are reserved for KV transfer in nixl_connector.py
|
||||
|
||||
Reference in New Issue
Block a user