diff --git a/.buildkite/hardware_tests/arm.yaml b/.buildkite/hardware_tests/arm.yaml
deleted file mode 100644
index d39ab4a7e..000000000
--- a/.buildkite/hardware_tests/arm.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-group: Hardware
-steps:
-  - label: "Arm CPU Test"
-    soft_fail: true
-    device: arm_cpu
-    no_plugin: true
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml
new file mode 100644
index 000000000..39a551696
--- /dev/null
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -0,0 +1,111 @@
+# CPU hardware CI steps.
+# Each intel_cpu step invokes run-cpu-test.sh with two arguments: a timeout
+# and the newline-separated test commands to run inside the CPU test image.
+group: CPU
+depends_on: []
+steps:
+# Kernel tests: CPU attention, fused MoE and oneDNN.
+- label: CPU-Kernel Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/
+  - cmake/cpu_extension.cmake
+  - CMakeLists.txt
+  - vllm/_custom_ops.py
+  - tests/kernels/attention/test_cpu_attn.py
+  - tests/kernels/moe/test_cpu_fused_moe.py
+  - tests/kernels/test_onednn.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+      pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
+      pytest -x -v -s tests/kernels/test_onednn.py"
+
+# Language generation and pooling model tests selected by the cpu_model marker.
+- label: CPU-Language Generation and Pooling Model Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/
+  - vllm/
+  - tests/models/language/generation/
+  - tests/models/language/pooling/
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
+      pytest -x -v -s tests/models/language/generation -m cpu_model
+      pytest -x -v -s tests/models/language/pooling -m cpu_model"
+
+# Quantization tests: one compressed-tensors W8A8 case plus the CPU WNA16 suite.
+- label: CPU-Quantization Model Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/
+  - vllm/model_executor/layers/quantization/cpu_wna16.py
+  - vllm/model_executor/layers/quantization/gptq_marlin.py
+  - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+  - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
+  - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
+  - tests/quantization/test_compressed_tensors.py
+  - tests/quantization/test_cpu_wna16.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
+      pytest -x -v -s tests/quantization/test_cpu_wna16.py"
+
+# Distributed serving smoke test (PP+TP and DP+TP) via the dedicated script.
+- label: CPU-TP/DP/PP Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/shm.cpp
+  - vllm/v1/worker/cpu_worker.py
+  - vllm/v1/worker/gpu_worker.py
+  - vllm/v1/worker/cpu_model_runner.py
+  - vllm/v1/worker/gpu_model_runner.py
+  - vllm/platforms/cpu.py
+  - vllm/distributed/parallel_state.py
+  - vllm/distributed/device_communicators/cpu_communicator.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
+
+# Multi-modal generation tests, sharded across the parallel jobs below.
+# `$$` defers expansion of the BUILDKITE_PARALLEL_* variables from pipeline
+# upload to job runtime.
+- label: CPU-Multi-Modal Model Tests %N
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  # - vllm/
+  - vllm/model_executor/layers/rotary_embedding
+  - tests/models/multimodal/generation/
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
+      pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
+  parallelism: 2
+
+# Arm CPU test; runs its own script instead of run-cpu-test.sh.
+- label: "Arm CPU Test"
+  depends_on: []
+  soft_fail: true
+  device: arm_cpu
+  no_plugin: true
+  commands:
+  - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
diff --git a/.buildkite/hardware_tests/intel.yaml b/.buildkite/hardware_tests/intel.yaml
index 76bf2e0be..ba0088b3a 100644
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
@@ -1,13 +1,6 @@
 group: Hardware
 depends_on: ~
 steps:
-  - label: "Intel CPU Test"
-    soft_fail: true
-    device: intel_cpu
-    no_plugin: true
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
-  
   - label: "Intel HPU Test"
     soft_fail: true
     device: intel_hpu
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
new file mode 100644
index 000000000..3caa49832
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Smoke test for multi-process CPU serving. For each parallel layout it starts
+# `vllm serve`, waits for the API to answer, runs a short random-dataset
+# benchmark, then stops the server before moving on.
+set -euox pipefail
+
+MODEL="meta-llama/Llama-3.2-3B-Instruct"
+server_pid=""
+
+# Terminate the current server, if any, and wait for it to exit so the next
+# server does not race with it for port 8000.
+stop_server() {
+    if [[ -n "$server_pid" ]]; then
+        kill -s SIGTERM "$server_pid" 2>/dev/null || true
+        wait "$server_pid" 2>/dev/null || true
+        server_pid=""
+    fi
+}
+# Also stop the server when a failing step makes `set -e` abort the script.
+trap stop_server EXIT
+
+# Usage: run_smoke_test <section title> <extra `vllm serve` flags...>
+run_smoke_test() {
+    local title=$1
+    shift
+    echo "--- $title"
+    vllm serve "$MODEL" "$@" &
+    server_pid=$!
+    # Poll the models endpoint for up to 600s until the server is ready.
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+        --backend vllm \
+        --dataset-name random \
+        --model "$MODEL" \
+        --num-prompts 20 \
+        --endpoint /v1/completions
+    stop_server
+}
+
+run_smoke_test "PP+TP" -tp=2 -pp=2
+run_smoke_test "DP+TP" -tp=2 -dp=2
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index ee6510bf8..c32b051ca 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -2,119 +2,19 @@
 
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
-set -ex
+set -euox pipefail
 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
-# used for TP/PP E2E test
-OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
+IMAGE_NAME="cpu-test-$NUMA_NODE"
+TIMEOUT_VAL=$1
+TEST_COMMAND=$2
 
-export CMAKE_BUILD_PARALLEL_LEVEL=32
-
-# Setup cleanup
-remove_docker_container() {
-    set -e;
-    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
-}
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+# building the docker image
+echo "--- :docker: Building Docker image"
+docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
-
-function cpu_tests() {
-  set -e
-  export NUMA_NODE=$2
-
-  # list packages
-  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
-    set -e
-    pip list"
-
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pip list"
-
-  # offline inference
-  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
-
-  # Run kernel tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
-    pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
-    pytest -x -v -s tests/kernels/test_onednn.py"
-
-  # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    # Note: disable until supports V1
-    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-
-    pytest -x -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
-
-    pytest -x -v -s tests/models/language/pooling -m cpu_model
-    pytest -x -v -s tests/models/multimodal/generation \
-                --ignore=tests/models/multimodal/generation/test_pixtral.py \
-                -m cpu_model"
-
-  # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
-
-  # Run AWQ/GPTQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -s -v \
-    tests/quantization/test_cpu_wna16.py"
-
-  # Run multi-lora tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -s -v \
-    tests/lora/test_qwenvl.py"
-
-  # online serving: tp+pp
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
-    set -e
-    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
-    server_pid=$!
-    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    vllm bench serve \
-      --backend vllm \
-      --dataset-name random \
-      --model meta-llama/Llama-3.2-3B-Instruct \
-      --num-prompts 20 \
-      --endpoint /v1/completions
-    kill -s SIGTERM $server_pid &'
-
-  # online serving: tp+dp
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
-    set -e
-    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
-    server_pid=$!
-    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    vllm bench serve \
-      --backend vllm \
-      --dataset-name random \
-      --model meta-llama/Llama-3.2-3B-Instruct \
-      --num-prompts 20 \
-      --endpoint /v1/completions
-    kill -s SIGTERM $server_pid &'
-}
-
-# All of CPU tests are expected to be finished less than 40 mins.
-export -f cpu_tests
-timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
+        timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index ca82fc8af..169696ca1 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -136,22 +136,38 @@ class CPUWorker(Worker):
             the LogicalCPUInfo.id. A selected LogicalCPUInfo list should be
             returned.
         """
+        # simulate multiple numa nodes, for testing
+        sim_multi_numa_nodes = os.environ.get("VLLM_CPU_SIM_MULTI_NUMA", "0") != "0"
 
         allowed_numa_nodes, logical_cpu_list = (
             CpuPlatform.get_allowed_cpu_core_node_list()
         )
-        assert len(allowed_numa_nodes) >= self.parallel_config.world_size, (
+        assert (
+            len(allowed_numa_nodes) >= self.parallel_config.world_size
+            or sim_multi_numa_nodes
+        ), (
             f"Not enough allowed NUMA nodes to bind threads of "
             f"{self.parallel_config.world_size} CPUWorkers. "
             f"Allowed NUMA nodes are {allowed_numa_nodes}. "
             "Please try to bind threads manually."
         )
 
-        # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`
-        selected_numa_node = allowed_numa_nodes[self.local_rank]  # type: ignore
-        logical_cpu_list = [
-            x for x in logical_cpu_list if x.numa_node == selected_numa_node
-        ]
+        if not sim_multi_numa_nodes:
+            # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`
+            selected_numa_node = allowed_numa_nodes[self.local_rank]  # type: ignore
+            logical_cpu_list = [
+                x for x in logical_cpu_list if x.numa_node == selected_numa_node
+            ]
+        else:
+            assert len(logical_cpu_list) >= self.parallel_config.world_size
+            logical_cpu_list = sorted(logical_cpu_list, key=lambda x: x.numa_node)
+            sim_cpu_num_per_node = (
+                len(logical_cpu_list) // self.parallel_config.world_size
+            )
+            start_idx = self.local_rank * sim_cpu_num_per_node
+            logical_cpu_list = logical_cpu_list[
+                start_idx : (start_idx + sim_cpu_num_per_node)
+            ]
 
         # Select CPUs from each physical core via cpu_selector
         core_to_cpus: dict[int, list[LogicalCPUInfo]] = {}