diff --git a/.buildkite/hardware_tests/arm.yaml b/.buildkite/hardware_tests/arm.yaml
deleted file mode 100644
index d39ab4a7e..000000000
--- a/.buildkite/hardware_tests/arm.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-group: Hardware
-steps:
-  - label: "Arm CPU Test"
-    soft_fail: true
-    device: arm_cpu
-    no_plugin: true
-    commands:
-      - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml
new file mode 100644
index 000000000..39a551696
--- /dev/null
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -0,0 +1,100 @@
+group: CPU
+depends_on: []
+steps:
+- label: CPU-Kernel Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/
+  - cmake/cpu_extension.cmake
+  - CMakeLists.txt
+  - vllm/_custom_ops.py
+  - tests/kernels/attention/test_cpu_attn.py
+  - tests/kernels/moe/test_cpu_fused_moe.py
+  - tests/kernels/test_onednn.py
+  commands:
+  - |
+    bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+      pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
+      pytest -x -v -s tests/kernels/test_onednn.py"
+
+- label: CPU-Language Generation and Pooling Model Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/
+  - vllm/
+  - tests/models/language/generation/
+  - tests/models/language/pooling/
+  commands:
+  - |
+    bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
+      pytest -x -v -s tests/models/language/generation -m cpu_model
+      pytest -x -v -s tests/models/language/pooling -m cpu_model"
+
+- label: CPU-Quantization Model Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/
+  - vllm/model_executor/layers/quantization/cpu_wna16.py
+  - vllm/model_executor/layers/quantization/gptq_marlin.py
+  - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+  - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
+  - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
+  - tests/quantization/test_compressed_tensors.py
+  - tests/quantization/test_cpu_wna16.py
+  commands:
+  - |
+    bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
+      pytest -x -v -s tests/quantization/test_cpu_wna16.py"
+
+- label: CPU-TP/DP/PP Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/shm.cpp
+  - vllm/v1/worker/cpu_worker.py
+  - vllm/v1/worker/gpu_worker.py
+  - vllm/v1/worker/cpu_model_runner.py
+  - vllm/v1/worker/gpu_model_runner.py
+  - vllm/platforms/cpu.py
+  - vllm/distributed/parallel_state.py
+  - vllm/distributed/device_communicators/cpu_communicator.py
+  commands:
+  - |
+    bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
+
+- label: CPU-Multi-Modal Model Tests %N
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  # - vllm/
+  - vllm/model_executor/layers/rotary_embedding
+  - tests/models/multimodal/generation/
+  commands:
+  - |
+    bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
+      pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
+  parallelism: 2
+
+- label: "Arm CPU Test"
+  depends_on: []
+  soft_fail: true
+  device: arm_cpu
+  no_plugin: true
+  commands:
+  - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
diff --git a/.buildkite/hardware_tests/intel.yaml b/.buildkite/hardware_tests/intel.yaml
index 76bf2e0be..ba0088b3a 100644
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
@@ -1,13 +1,6 @@
 group: Hardware
 depends_on: ~
 steps:
-  - label: "Intel CPU Test"
-    soft_fail: true
-    device: intel_cpu
-    no_plugin: true
-    commands:
-      - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
-
   - label: "Intel HPU Test"
     soft_fail: true
     device: intel_hpu
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
new file mode 100644
index 000000000..3caa49832
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+set -euox pipefail
+
+echo "--- PP+TP"
+vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+server_pid=$!
+timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+vllm bench serve \
+  --backend vllm \
+  --dataset-name random \
+  --model meta-llama/Llama-3.2-3B-Instruct \
+  --num-prompts 20 \
+  --endpoint /v1/completions
+# Stop the server and wait for it to exit so port 8000 is free for the next stage.
+kill -s SIGTERM "$server_pid"
+wait "$server_pid" || true
+
+echo "--- DP+TP"
+vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+server_pid=$!
+timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+vllm bench serve \
+  --backend vllm \
+  --dataset-name random \
+  --model meta-llama/Llama-3.2-3B-Instruct \
+  --num-prompts 20 \
+  --endpoint /v1/completions
+# Reap the server before the script returns.
+kill -s SIGTERM "$server_pid"
+wait "$server_pid" || true
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index ee6510bf8..c32b051ca 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -2,119 +2,20 @@
 
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
-set -ex
+set -euox pipefail
 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
-# used for TP/PP E2E test
-OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
+IMAGE_NAME="cpu-test-$NUMA_NODE"
+# $1: duration handed to `timeout` inside the container; $2: shell commands to run there
+TIMEOUT_VAL=${1:?"usage: run-cpu-test.sh TIMEOUT TEST_COMMAND"}
+TEST_COMMAND=${2:?"usage: run-cpu-test.sh TIMEOUT TEST_COMMAND"}
 
-export CMAKE_BUILD_PARALLEL_LEVEL=32
-
-# Setup cleanup
-remove_docker_container() {
-    set -e;
-    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
-}
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+# building the docker image
+echo "--- :docker: Building Docker image"
+docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
-
-function cpu_tests() {
-  set -e
-  export NUMA_NODE=$2
-
-  # list packages
-  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
-    set -e
-    pip list"
-
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pip list"
-
-  # offline inference
-  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
-
-  # Run kernel tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
-    pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
-    pytest -x -v -s tests/kernels/test_onednn.py"
-
-  # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    # Note: disable until supports V1
-    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-
-    pytest -x -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
-
-    pytest -x -v -s tests/models/language/pooling -m cpu_model
-    pytest -x -v -s tests/models/multimodal/generation \
-      --ignore=tests/models/multimodal/generation/test_pixtral.py \
-      -m cpu_model"
-
-  # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
-
-  # Run AWQ/GPTQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -s -v \
-    tests/quantization/test_cpu_wna16.py"
-
-  # Run multi-lora tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -s -v \
-    tests/lora/test_qwenvl.py"
-
-  # online serving: tp+pp
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
-    set -e
-    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
-    server_pid=$!
-    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    vllm bench serve \
-      --backend vllm \
-      --dataset-name random \
-      --model meta-llama/Llama-3.2-3B-Instruct \
-      --num-prompts 20 \
-      --endpoint /v1/completions
-    kill -s SIGTERM $server_pid &'
-
-  # online serving: tp+dp
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
-    set -e
-    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
-    server_pid=$!
-    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    vllm bench serve \
-      --backend vllm \
-      --dataset-name random \
-      --model meta-llama/Llama-3.2-3B-Instruct \
-      --num-prompts 20 \
-      --endpoint /v1/completions
-    kill -s SIGTERM $server_pid &'
-}
-
-# All of CPU tests are expected to be finished less than 40 mins.
-export -f cpu_tests
-timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
+    timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index ca82fc8af..169696ca1 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -136,22 +136,41 @@ class CPUWorker(Worker):
             the LogicalCPUInfo.id. A selected LogicalCPUInfo list should be
             returned.
         """
+        # Testing aid: any value of VLLM_CPU_SIM_MULTI_NUMA other than "0" makes
+        # each rank take an even slice of the allowed CPUs instead of a NUMA node.
+        sim_multi_numa_nodes = os.environ.get("VLLM_CPU_SIM_MULTI_NUMA", "0") != "0"
 
         allowed_numa_nodes, logical_cpu_list = (
             CpuPlatform.get_allowed_cpu_core_node_list()
         )
-        assert len(allowed_numa_nodes) >= self.parallel_config.world_size, (
+        assert (
+            len(allowed_numa_nodes) >= self.parallel_config.world_size
+            or sim_multi_numa_nodes
+        ), (
             f"Not enough allowed NUMA nodes to bind threads of "
             f"{self.parallel_config.world_size} CPUWorkers. "
             f"Allowed NUMA nodes are {allowed_numa_nodes}. "
             "Please try to bind threads manually."
         )
 
-        # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`
-        selected_numa_node = allowed_numa_nodes[self.local_rank]  # type: ignore
-        logical_cpu_list = [
-            x for x in logical_cpu_list if x.numa_node == selected_numa_node
-        ]
+        if not sim_multi_numa_nodes:
+            # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`
+            selected_numa_node = allowed_numa_nodes[self.local_rank]  # type: ignore
+            logical_cpu_list = [
+                x for x in logical_cpu_list if x.numa_node == selected_numa_node
+            ]
+        else:
+            # Sort by NUMA node, then give local rank i the i-th contiguous slice
+            # of len // world_size CPUs; the floor division drops any remainder.
+            assert len(logical_cpu_list) >= self.parallel_config.world_size
+            logical_cpu_list = sorted(logical_cpu_list, key=lambda x: x.numa_node)
+            sim_cpu_num_per_node = (
+                len(logical_cpu_list) // self.parallel_config.world_size
+            )
+            start_idx = self.local_rank * sim_cpu_num_per_node
+            logical_cpu_list = logical_cpu_list[
+                start_idx : (start_idx + sim_cpu_num_per_node)
+            ]
 
         # Select CPUs from each physical core via cpu_selector
         core_to_cpus: dict[int, list[LogicalCPUInfo]] = {}