[XPU] Add test script of PD disaggregation (#36434)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
This commit is contained in:
@@ -0,0 +1,174 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Hosts / ports
|
||||
PREFILL_HOST=${PREFILL_HOST:-"localhost"}
|
||||
PREFILL_PORT=${PREFILL_PORT:-8100}
|
||||
PREFILL_NIXL_SIDE_PORT=${PREFILL_NIXL_SIDE_PORT:-5577}
|
||||
DECODE_HOST=${DECODE_HOST:-"localhost"}
|
||||
DECODE_PORT=${DECODE_PORT:-8200}
|
||||
PROXY_HOST=${PROXY_HOST:-"localhost"}
|
||||
PROXY_PORT=${PROXY_PORT:-8192}
|
||||
BASELINE_HOST=${BASELINE_HOST:-"localhost"}
|
||||
BASELINE_PORT=${BASELINE_PORT:-9290}
|
||||
|
||||
# Model to run.
|
||||
MODEL_NAME=${MODEL_NAME:-"Qwen/Qwen3-0.6B"}
|
||||
MAX_MODEL_LEN=${MAX_MODEL_LEN:-1024}
|
||||
BLOCK_SIZE=${BLOCK_SIZE:-64}
|
||||
PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
|
||||
DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
|
||||
KV_BUFFER_DEVICE=${KV_BUFFER_DEVICE:-"xpu"}
|
||||
GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.8}
|
||||
|
||||
generate_affinity_mask() {
|
||||
local count=$1
|
||||
local start=${2:-0}
|
||||
local mask=""
|
||||
local i
|
||||
|
||||
for ((i=0; i<count; i++)); do
|
||||
local device=$((start + i))
|
||||
if [[ -z "${mask}" ]]; then
|
||||
mask="${device}"
|
||||
else
|
||||
mask="${mask},${device}"
|
||||
fi
|
||||
done
|
||||
|
||||
echo "${mask}"
|
||||
}
|
||||
|
||||
PREFILLER_ZE_AFFINITY_MASK=${PREFILLER_ZE_AFFINITY_MASK:-$(generate_affinity_mask "${PREFILLER_TP_SIZE}" 0)}
|
||||
DECODER_ZE_AFFINITY_MASK=${DECODER_ZE_AFFINITY_MASK:-$(generate_affinity_mask "${DECODER_TP_SIZE}" "${PREFILLER_TP_SIZE}")}
|
||||
|
||||
|
||||
# execution env
|
||||
GIT_ROOT=$(git rev-parse --show-toplevel)
|
||||
EXP_ROOT="${GIT_ROOT}/tests/v1/kv_connector/nixl_integration"
|
||||
|
||||
OUTPUT_FILE=${OUTPUT_FILE:-"${EXP_ROOT}/.xpu_accuracy_test_outputs.txt"}
|
||||
|
||||
# Trap the SIGINT signal (triggered by Ctrl+C)
|
||||
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
|
||||
|
||||
cleanup() {
|
||||
echo "Cleaning up any running vLLM instances..."
|
||||
pkill -f "vllm serve" || true
|
||||
sleep 2
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
local host=$1
|
||||
local port=$2
|
||||
timeout 1200 bash -c "
|
||||
until curl -s ${host}:${port}/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done" && return 0 || return 1
|
||||
}
|
||||
|
||||
launch_baseline() {
|
||||
BASELINE_BASE_CMD="
|
||||
ZE_AFFINITY_MASK=0 \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=1 vllm serve $MODEL_NAME \
|
||||
--host ${BASELINE_HOST} \
|
||||
--port ${BASELINE_PORT} \
|
||||
--max-model-len ${MAX_MODEL_LEN}\
|
||||
--seed 42 \
|
||||
-tp 1 \
|
||||
--block-size ${BLOCK_SIZE} \
|
||||
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
|
||||
--dtype float16 \
|
||||
--enforce-eager"
|
||||
echo ${BASELINE_BASE_CMD}
|
||||
bash -c "${BASELINE_BASE_CMD}" &
|
||||
sleep 10
|
||||
wait_for_server ${BASELINE_HOST} ${BASELINE_PORT}
|
||||
}
|
||||
|
||||
launch_pd() {
|
||||
PREFILL_BASE_CMD="
|
||||
ZE_AFFINITY_MASK=${PREFILLER_ZE_AFFINITY_MASK} \
|
||||
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
||||
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
|
||||
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=1 vllm serve $MODEL_NAME \
|
||||
--host ${PREFILL_HOST} \
|
||||
--port ${PREFILL_PORT} \
|
||||
--max-model-len ${MAX_MODEL_LEN}\
|
||||
--seed 42 \
|
||||
--block-size ${BLOCK_SIZE} \
|
||||
--enforce-eager \
|
||||
--dtype float16 \
|
||||
-tp ${PREFILLER_TP_SIZE} \
|
||||
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
|
||||
--kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\"}'"
|
||||
|
||||
|
||||
DECODE_BASE_CMD="
|
||||
ZE_AFFINITY_MASK=${DECODER_ZE_AFFINITY_MASK} \
|
||||
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=1 vllm serve $MODEL_NAME \
|
||||
--host ${DECODE_HOST} \
|
||||
--port ${DECODE_PORT} \
|
||||
--max-model-len ${MAX_MODEL_LEN}\
|
||||
--seed 42 \
|
||||
--block-size ${BLOCK_SIZE} \
|
||||
--enforce-eager \
|
||||
-tp ${DECODER_TP_SIZE} \
|
||||
--dtype float16 \
|
||||
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
|
||||
--kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\"}'"
|
||||
|
||||
echo ${PREFILL_BASE_CMD}
|
||||
echo ${DECODE_BASE_CMD}
|
||||
sleep 2
|
||||
|
||||
# execute on hosts
|
||||
bash -c "${PREFILL_BASE_CMD}" &
|
||||
bash -c "${DECODE_BASE_CMD}" &
|
||||
sleep 1
|
||||
wait_for_server ${PREFILL_HOST} ${PREFILL_PORT}
|
||||
sleep 1
|
||||
wait_for_server ${DECODE_HOST} ${DECODE_PORT}
|
||||
sleep 1
|
||||
}
|
||||
|
||||
launch_pd_proxy(){
|
||||
PROXY_BASE_CMD="
|
||||
python3 ${EXP_ROOT}/toy_proxy_server.py \
|
||||
--prefiller-host ${PREFILL_HOST} --prefiller-port ${PREFILL_PORT} \
|
||||
--decoder-host ${DECODE_HOST} --decoder-port ${DECODE_PORT} \
|
||||
--host=${PROXY_HOST} --port ${PROXY_PORT}"
|
||||
echo ${PROXY_BASE_CMD}
|
||||
bash -c "${PROXY_BASE_CMD}" &
|
||||
sleep 2
|
||||
}
|
||||
|
||||
run_tests(){
|
||||
local service_url=$1
|
||||
local mode=$2
|
||||
python3 ${EXP_ROOT}/test_disagg_accuracy.py --service_url=${service_url} --model_name=${MODEL_NAME} --mode=${mode} --file_name=${OUTPUT_FILE}
|
||||
}
|
||||
|
||||
|
||||
# run non-disagg. baseline & save outputs
|
||||
launch_baseline
|
||||
run_tests "http://${BASELINE_HOST}:${BASELINE_PORT}" "baseline"
|
||||
cleanup
|
||||
sleep 10
|
||||
|
||||
|
||||
# run disagg. & do exact-match with the outputs from baseline
|
||||
launch_pd
|
||||
launch_pd_proxy
|
||||
run_tests "http://${PROXY_HOST}:${PROXY_PORT}" "disagg"
|
||||
echo "-----P/D success----"
|
||||
|
||||
rm ${OUTPUT_FILE}
|
||||
cleanup
|
||||
|
||||
exit 0
|
||||
Reference in New Issue
Block a user