diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index c9813a73d..5a0980dcc 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -18,6 +18,7 @@ set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512}) set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16}) set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI}) set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16}) +set(ENABLE_ARM_BF16 $ENV{VLLM_CPU_ARM_BF16}) include_directories("${CMAKE_SOURCE_DIR}/csrc") @@ -115,6 +116,10 @@ else() set(AVX512_FOUND ON) message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable") endif() + if (ENABLE_ARM_BF16) + set(ARM_BF16_FOUND ON) + message(STATUS "ARM BF16 support enabled via VLLM_CPU_ARM_BF16 environment variable") + endif() endif() if (AVX512_FOUND AND NOT AVX512_DISABLED) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 063d3e6e4..d81957e02 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -20,6 +20,7 @@ # VLLM_CPU_AVX512BF16=false (default)|true (for cross-compilation) # VLLM_CPU_AVX512VNNI=false (default)|true (for cross-compilation) # VLLM_CPU_AMXBF16=false (default)|true (for cross-compilation) +# VLLM_CPU_ARM_BF16=false (default)|true (for cross-compilation) # ######################### COMMON BASE IMAGE ######################### @@ -108,9 +109,22 @@ ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI} # Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ... ARG VLLM_CPU_AMXBF16=1 ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16} +# Support for cross-compilation with ARM BF16 ISA: docker build --build-arg VLLM_CPU_ARM_BF16="true" ... 
+ARG VLLM_CPU_ARM_BF16=0 +ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16} WORKDIR /vllm-workspace +# Validate build arguments - prevent mixing incompatible ISA flags; a flag counts as disabled when it is "0" or "false" (both spellings are documented), anything else counts as enabled +RUN if [ "$TARGETARCH" = "arm64" ] && { { [ "$VLLM_CPU_AVX2" != "0" ] && [ "$VLLM_CPU_AVX2" != "false" ]; } || { [ "$VLLM_CPU_AVX512" != "0" ] && [ "$VLLM_CPU_AVX512" != "false" ]; } || { [ "$VLLM_CPU_AVX512BF16" != "0" ] && [ "$VLLM_CPU_AVX512BF16" != "false" ]; } || { [ "$VLLM_CPU_AVX512VNNI" != "0" ] && [ "$VLLM_CPU_AVX512VNNI" != "false" ]; }; }; then \ + echo "ERROR: Cannot use x86-specific ISA flags (AVX2, AVX512, etc.) when building for ARM64 (--platform=linux/arm64)"; \ + exit 1; \ + fi && \ + if [ "$TARGETARCH" = "amd64" ] && [ "$VLLM_CPU_ARM_BF16" != "0" ] && [ "$VLLM_CPU_ARM_BF16" != "false" ]; then \ + echo "ERROR: Cannot use ARM-specific ISA flags (ARM_BF16) when building for x86_64 (--platform=linux/amd64)"; \ + exit 1; \ + fi + # Copy build requirements COPY requirements/cpu-build.txt requirements/build.txt @@ -224,6 +238,7 @@ ARG VLLM_CPU_AVX512 ARG VLLM_CPU_AVX512BF16 ARG VLLM_CPU_AVX512VNNI ARG VLLM_CPU_AMXBF16 +ARG VLLM_CPU_ARM_BF16 ARG PYTHON_VERSION LABEL ai.vllm.build.target-arch="${TARGETARCH}" @@ -233,6 +248,7 @@ LABEL ai.vllm.build.cpu-avx512="${VLLM_CPU_AVX512:-false}" LABEL ai.vllm.build.cpu-avx512bf16="${VLLM_CPU_AVX512BF16:-false}" LABEL ai.vllm.build.cpu-avx512vnni="${VLLM_CPU_AVX512VNNI:-false}" LABEL ai.vllm.build.cpu-amxbf16="${VLLM_CPU_AMXBF16:-false}" +LABEL ai.vllm.build.cpu-arm-bf16="${VLLM_CPU_ARM_BF16:-false}" LABEL ai.vllm.build.python-version="${PYTHON_VERSION:-3.12}" ENTRYPOINT ["vllm", "serve"] diff --git a/docs/getting_started/installation/cpu.arm.inc.md b/docs/getting_started/installation/cpu.arm.inc.md index e331d87a7..ae7d648b0 100644 --- a/docs/getting_started/installation/cpu.arm.inc.md +++ b/docs/getting_started/installation/cpu.arm.inc.md @@ -172,25 +172,78 @@ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}-arm64- # --8<-- [end:pre-built-images] # --8<-- [start:build-image-from-source] + +## Building for your target ARM CPU + ```bash docker build -f docker/Dockerfile.cpu \ - --tag vllm-cpu-env . 
+ --platform=linux/arm64 \ + --build-arg VLLM_CPU_ARM_BF16=<false (default)|true> \ + --tag vllm-cpu-env \ + --target vllm-openai . +``` -# Launching OpenAI server +!!! note "Auto-detection by default" + By default, ARM CPU instruction sets (BF16, NEON, etc.) are automatically detected from the build system's CPU flags. The `VLLM_CPU_ARM_BF16` build argument is used for cross-compilation: + + - `VLLM_CPU_ARM_BF16=true` - Force-enable ARM BF16 support (build with BF16 regardless of build system capabilities) + - `VLLM_CPU_ARM_BF16=false` - Rely on auto-detection (default) + +### Examples + +**Auto-detection build (native ARM)** + +```bash +# Building on ARM64 system - platform auto-detected +docker build -f docker/Dockerfile.cpu \ + --tag vllm-cpu-arm64 \ + --target vllm-openai . +``` + +**Cross-compile for ARM with BF16 support** + +```bash +# Building on ARM64 for newer ARM CPUs with BF16 +docker build -f docker/Dockerfile.cpu \ + --build-arg VLLM_CPU_ARM_BF16=true \ + --tag vllm-cpu-arm64-bf16 \ + --target vllm-openai . +``` + +**Cross-compile from x86_64 to ARM64 with BF16** + +```bash +# Requires Docker buildx with ARM emulation (QEMU) +docker buildx build -f docker/Dockerfile.cpu \ + --platform=linux/arm64 \ + --build-arg VLLM_CPU_ARM_BF16=true \ + --build-arg max_jobs=4 \ + --tag vllm-cpu-arm64-bf16 \ + --target vllm-openai \ + --load . +``` + +!!! note "ARM BF16 requirements" + ARM BF16 support requires a CPU that implements FEAT_BF16 (optional from ARMv8.2-A, mandatory from ARMv8.6-A). Supported on AWS Graviton3/4, AmpereOne, and other recent ARM processors. + +## Launching the OpenAI server + +```bash docker run --rm \ - --privileged=true \ + --security-opt seccomp=unconfined \ + --cap-add SYS_NICE \ --shm-size=4g \ -p 8000:8000 \ -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \ -e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \ - vllm-cpu-env \ - --model=meta-llama/Llama-3.2-1B-Instruct \ + vllm-cpu-arm64 \ + meta-llama/Llama-3.2-1B-Instruct \ --dtype=bfloat16 \ other vLLM OpenAI server arguments ``` -!!! 
tip - An alternative of `--privileged=true` is `--cap-add SYS_NICE --security-opt seccomp=unconfined`. +!!! tip "Alternative to --privileged" + The command above uses `--cap-add SYS_NICE --security-opt seccomp=unconfined` instead of `--privileged=true` for better security. # --8<-- [end:build-image-from-source] # --8<-- [start:extra-information]