[CI/Build] enable Intel XPU test flow with prebuilt image (#37447)
Signed-off-by: wendyliu235 <wenjun.liu@intel.com>
This commit is contained in:
23
.buildkite/ci_config_intel.yaml
Normal file
23
.buildkite/ci_config_intel.yaml
Normal file
@@ -0,0 +1,23 @@
|
||||
name: vllm_intel_ci
|
||||
job_dirs:
|
||||
- ".buildkite/intel_jobs"
|
||||
run_all_patterns:
|
||||
- "docker/Dockerfile"
|
||||
- "CMakeLists.txt"
|
||||
- "requirements/common.txt"
|
||||
- "requirements/xpu.txt"
|
||||
- "requirements/build.txt"
|
||||
- "requirements/test.txt"
|
||||
- "setup.py"
|
||||
- "csrc/"
|
||||
- "cmake/"
|
||||
run_all_exclude_patterns:
|
||||
- "docker/Dockerfile."
|
||||
- "csrc/cpu/"
|
||||
- "csrc/rocm/"
|
||||
- "cmake/hipify.py"
|
||||
- "cmake/cpu_extension.cmake"
|
||||
registries: public.ecr.aws/q9t5s3a7
|
||||
repositories:
|
||||
main: "vllm-ci-test-repo"
|
||||
premerge: "vllm-ci-test-repo"
|
||||
34
.buildkite/image_build/image_build_xpu.sh
Executable file
34
.buildkite/image_build/image_build_xpu.sh
Executable file
@@ -0,0 +1,34 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 3 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
|
||||
# skip build if image already exists
|
||||
if ! docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu &> /dev/null; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# build
|
||||
docker build \
|
||||
--file docker/Dockerfile.xpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu \
|
||||
--progress plain .
|
||||
|
||||
# push
|
||||
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu
|
||||
64
.buildkite/intel_jobs/test-intel.yaml
Normal file
64
.buildkite/intel_jobs/test-intel.yaml
Normal file
@@ -0,0 +1,64 @@
|
||||
group: Intel
|
||||
steps:
|
||||
- label: ":docker: Build XPU image"
|
||||
soft_fail: true
|
||||
depends_on: []
|
||||
key: image-build-xpu
|
||||
commands:
|
||||
- bash -lc '.buildkite/image_build/image_build_xpu.sh "public.ecr.aws/q9t5s3a7" "vllm-ci-test-repo" "$BUILDKITE_COMMIT"'
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
- label: "XPU example Test"
|
||||
depends_on:
|
||||
- image-build-xpu
|
||||
timeout_in_minutes: 30
|
||||
device: intel_gpu
|
||||
no_plugin: true
|
||||
env:
|
||||
REGISTRY: "public.ecr.aws/q9t5s3a7"
|
||||
REPO: "vllm-ci-test-repo"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/intel_jobs/test-intel.yaml
|
||||
commands:
|
||||
- >-
|
||||
bash .buildkite/scripts/hardware_ci/run-intel-test.sh
|
||||
'pip install tblib==3.1.0 &&
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager &&
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE &&
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp &&
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN &&
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 &&
|
||||
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192 &&
|
||||
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 &&
|
||||
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel'
|
||||
- label: "XPU V1 test"
|
||||
depends_on:
|
||||
- image-build-xpu
|
||||
timeout_in_minutes: 30
|
||||
device: intel_gpu
|
||||
no_plugin: true
|
||||
env:
|
||||
REGISTRY: "public.ecr.aws/q9t5s3a7"
|
||||
REPO: "vllm-ci-test-repo"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/intel_jobs/test-intel.yaml
|
||||
commands:
|
||||
- >-
|
||||
bash .buildkite/scripts/hardware_ci/run-intel-test.sh
|
||||
'cd tests &&
|
||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py &&
|
||||
pytest -v -s v1/engine --ignore=v1/engine/test_output_processor.py &&
|
||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py &&
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py &&
|
||||
pytest -v -s v1/structured_output &&
|
||||
pytest -v -s v1/test_serial_utils.py &&
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py &&
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py'
|
||||
276
.buildkite/scripts/hardware_ci/run-intel-test.sh
Executable file
276
.buildkite/scripts/hardware_ci/run-intel-test.sh
Executable file
@@ -0,0 +1,276 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script runs tests inside the Intel XPU docker container.
|
||||
# It mirrors the structure of run-amd-test.sh while keeping Intel-specific
|
||||
# container setup and allowing commands to be sourced from YAML or env.
|
||||
#
|
||||
# Command sources (in priority order):
|
||||
# 1) VLLM_TEST_COMMANDS env var (preferred, preserves quoting)
|
||||
# 2) Positional args (legacy)
|
||||
# 3) One or more YAML files with a commands list (test-area style)
|
||||
###############################################################################
|
||||
set -o pipefail
|
||||
|
||||
DRY_RUN=${DRY_RUN:-0}
|
||||
if [[ "${1:-}" == "--dry-run" ]]; then
|
||||
DRY_RUN=1
|
||||
shift
|
||||
fi
|
||||
|
||||
# Export Python path
|
||||
export PYTHONPATH=".."
|
||||
|
||||
###############################################################################
|
||||
# Helper Functions
|
||||
###############################################################################
|
||||
|
||||
cleanup_docker() {
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory." >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
docker image prune -f
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
}
|
||||
|
||||
re_quote_pytest_markers() {
|
||||
local input="$1"
|
||||
local output=""
|
||||
local collecting=false
|
||||
local marker_buf=""
|
||||
|
||||
local flat="${input//$'\n'/ }"
|
||||
local restore_glob
|
||||
restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
|
||||
set -o noglob
|
||||
local -a words
|
||||
read -ra words <<< "$flat"
|
||||
eval "$restore_glob"
|
||||
|
||||
for word in "${words[@]}"; do
|
||||
if $collecting; then
|
||||
if [[ "$word" == *"'"* ]]; then
|
||||
if [[ -n "$marker_buf" ]]; then
|
||||
output+="${marker_buf} "
|
||||
marker_buf=""
|
||||
fi
|
||||
output+="${word} "
|
||||
collecting=false
|
||||
continue
|
||||
fi
|
||||
|
||||
local is_boundary=false
|
||||
case "$word" in
|
||||
"&&"|"||"|";"|"|")
|
||||
is_boundary=true ;;
|
||||
--*)
|
||||
is_boundary=true ;;
|
||||
-[a-zA-Z])
|
||||
is_boundary=true ;;
|
||||
*/*)
|
||||
is_boundary=true ;;
|
||||
*.py|*.py::*)
|
||||
is_boundary=true ;;
|
||||
*=*)
|
||||
if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
|
||||
is_boundary=true
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
if $is_boundary; then
|
||||
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||
output+="'${marker_buf}' "
|
||||
else
|
||||
output+="${marker_buf} "
|
||||
fi
|
||||
collecting=false
|
||||
marker_buf=""
|
||||
if [[ "$word" == "-m" || "$word" == "-k" ]]; then
|
||||
output+="${word} "
|
||||
collecting=true
|
||||
else
|
||||
output+="${word} "
|
||||
fi
|
||||
else
|
||||
if [[ -n "$marker_buf" ]]; then
|
||||
marker_buf+=" ${word}"
|
||||
else
|
||||
marker_buf="${word}"
|
||||
fi
|
||||
fi
|
||||
elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
|
||||
output+="${word} "
|
||||
collecting=true
|
||||
marker_buf=""
|
||||
else
|
||||
output+="${word} "
|
||||
fi
|
||||
done
|
||||
|
||||
if $collecting && [[ -n "$marker_buf" ]]; then
|
||||
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||
output+="'${marker_buf}'"
|
||||
else
|
||||
output+="${marker_buf}"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "${output% }"
|
||||
}
|
||||
|
||||
apply_intel_test_overrides() {
|
||||
local cmds="$1"
|
||||
# Placeholder for Intel-specific exclusions/overrides.
|
||||
echo "$cmds"
|
||||
}
|
||||
|
||||
is_yaml_file() {
|
||||
local p="$1"
|
||||
[[ -f "$p" && "$p" == *.yaml ]]
|
||||
}
|
||||
|
||||
extract_yaml_commands() {
|
||||
local yaml_path="$1"
|
||||
awk '
|
||||
$1 == "commands:" { in_cmds=1; next }
|
||||
in_cmds && $0 ~ /^[[:space:]]*-[[:space:]]/ {
|
||||
sub(/^[[:space:]]*-[[:space:]]/, "");
|
||||
print;
|
||||
next
|
||||
}
|
||||
in_cmds && $0 ~ /^[^[:space:]]/ { exit }
|
||||
' "$yaml_path"
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
# Main
|
||||
###############################################################################
|
||||
|
||||
default_image_name="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-xpu"
|
||||
#default_image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-xpu"
|
||||
image_name="${IMAGE_TAG_XPU:-${default_image_name}}"
|
||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
|
||||
# ---- Command source selection ----
|
||||
commands=""
|
||||
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
|
||||
commands="${VLLM_TEST_COMMANDS}"
|
||||
echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
|
||||
elif [[ $# -gt 0 ]]; then
|
||||
all_yaml=true
|
||||
for arg in "$@"; do
|
||||
if ! is_yaml_file "$arg"; then
|
||||
all_yaml=false
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if $all_yaml; then
|
||||
for yaml in "$@"; do
|
||||
mapfile -t COMMANDS < <(extract_yaml_commands "$yaml")
|
||||
if [[ ${#COMMANDS[@]} -eq 0 ]]; then
|
||||
echo "Error: No commands found in ${yaml}" >&2
|
||||
exit 1
|
||||
fi
|
||||
for cmd in "${COMMANDS[@]}"; do
|
||||
if [[ -z "$commands" ]]; then
|
||||
commands="${cmd}"
|
||||
else
|
||||
commands+=" && ${cmd}"
|
||||
fi
|
||||
done
|
||||
done
|
||||
echo "Commands sourced from YAML files: $*"
|
||||
else
|
||||
commands="$*"
|
||||
echo "Commands sourced from positional args (legacy mode)"
|
||||
fi
|
||||
else
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
DEFAULT_YAML="${SCRIPT_DIR}/intel-test.yaml"
|
||||
if [[ ! -f "${DEFAULT_YAML}" ]]; then
|
||||
echo "Error: YAML file not found: ${DEFAULT_YAML}" >&2
|
||||
exit 1
|
||||
fi
|
||||
mapfile -t COMMANDS < <(extract_yaml_commands "${DEFAULT_YAML}")
|
||||
if [[ ${#COMMANDS[@]} -eq 0 ]]; then
|
||||
echo "Error: No commands found in ${DEFAULT_YAML}" >&2
|
||||
exit 1
|
||||
fi
|
||||
for cmd in "${COMMANDS[@]}"; do
|
||||
if [[ -z "$commands" ]]; then
|
||||
commands="${cmd}"
|
||||
else
|
||||
commands+=" && ${cmd}"
|
||||
fi
|
||||
done
|
||||
echo "Commands sourced from default YAML: ${DEFAULT_YAML}"
|
||||
fi
|
||||
|
||||
if [[ -z "$commands" ]]; then
|
||||
echo "Error: No test commands provided." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Raw commands: $commands"
|
||||
commands=$(re_quote_pytest_markers "$commands")
|
||||
echo "After re-quoting: $commands"
|
||||
commands=$(apply_intel_test_overrides "$commands")
|
||||
echo "Final commands: $commands"
|
||||
|
||||
# Dry-run mode prints final commands and exits before Docker.
|
||||
if [[ "$DRY_RUN" == "1" ]]; then
|
||||
echo "DRY_RUN=1 set, skipping Docker execution."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# --- Docker housekeeping ---
|
||||
cleanup_docker
|
||||
|
||||
# --- Build or pull test image ---
|
||||
if [[ -n "${IMAGE_TAG_XPU:-}" ]]; then
|
||||
echo "Using prebuilt XPU image: ${IMAGE_TAG_XPU}"
|
||||
docker pull "${IMAGE_TAG_XPU}"
|
||||
else
|
||||
echo "Using prebuilt XPU image: ${image_name}"
|
||||
docker pull "${image_name}"
|
||||
fi
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f "${container_name}" || true
|
||||
docker image rm -f "${image_name}" || true
|
||||
docker system prune -f || true
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# --- Single-node job ---
|
||||
|
||||
if [[ -z "${ZE_AFFINITY_MASK:-}" ]]; then
|
||||
echo "Warning: ZE_AFFINITY_MASK is not set. Proceeding without device affinity." >&2
|
||||
fi
|
||||
|
||||
docker run \
|
||||
--device /dev/dri:/dev/dri \
|
||||
--net=host \
|
||||
--ipc=host \
|
||||
--privileged \
|
||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||
--entrypoint="" \
|
||||
-e "HF_TOKEN=${HF_TOKEN:-}" \
|
||||
-e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-}" \
|
||||
-e "CMDS=${commands}" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
bash -c 'set -e; echo "ZE_AFFINITY_MASK is ${ZE_AFFINITY_MASK:-}"; eval "$CMDS"'
|
||||
Reference in New Issue
Block a user