diff --git a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
index f289a43c6..d90540316 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
@@ -1,9 +1,10 @@
 #!/bin/bash
 set -euox pipefail
 export VLLM_CPU_CI_ENV=0
+export VLLM_CPU_KVCACHE_SPACE=1 # avoid OOM
 
 echo "--- PP+TP"
-vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 --max-model-len=4096 &
 server_pid=$!
 timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
 vllm bench serve \
@@ -23,7 +24,7 @@ if [ "$failed_req" -ne 0 ]; then
 fi
 
 echo "--- DP+TP"
-vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 &
 server_pid=$!
 timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
 vllm bench serve \
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index 122cacd14..2547751c0 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -108,6 +108,15 @@ class CPUWorker(Worker):
         if ret:
             logger.info(ret)
 
+        # After the thread binding, changing thread num is not allowed
+        def skip_set_num_threads(x: int):
+            logger.warning(
+                "CPU backend doesn't allow to use "
+                "`torch.set_num_threads` after the thread binding, skip it."
+            )
+
+        torch.set_num_threads = skip_set_num_threads
+
         # Note: unique identifier for creating allreduce shared memory
         os.environ["VLLM_DIST_IDENT"] = self.distributed_init_method.split(":")[-1]
         # Initialize the distributed environment.