#!/bin/bash
#
# Accuracy check for disaggregated prefill/decode serving (NixlConnector) on
# XPU.
#
# Flow:
#   1. Serve the model from a single baseline server and save its outputs.
#   2. Serve a prefill and a decode instance behind toy_proxy_server.py and
#      exact-match their outputs against the ones saved from the baseline.
#
# Every UPPER_CASE setting below may be overridden from the environment.
set -e

# Hosts / ports
PREFILL_HOST=${PREFILL_HOST:-"localhost"}
PREFILL_PORT=${PREFILL_PORT:-8100}
PREFILL_NIXL_SIDE_PORT=${PREFILL_NIXL_SIDE_PORT:-5577}
DECODE_HOST=${DECODE_HOST:-"localhost"}
DECODE_PORT=${DECODE_PORT:-8200}
PROXY_HOST=${PROXY_HOST:-"localhost"}
PROXY_PORT=${PROXY_PORT:-8192}
BASELINE_HOST=${BASELINE_HOST:-"localhost"}
BASELINE_PORT=${BASELINE_PORT:-9290}

# Model to run.
MODEL_NAME=${MODEL_NAME:-"Qwen/Qwen3-0.6B"}
MAX_MODEL_LEN=${MAX_MODEL_LEN:-1024}
BLOCK_SIZE=${BLOCK_SIZE:-64}
PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
KV_BUFFER_DEVICE=${KV_BUFFER_DEVICE:-"xpu"}
GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.8}

#######################################
# Prints a comma-separated list of consecutive device indices.
# Arguments: $1 - how many indices to emit
#            $2 - first index (default 0)
# Outputs:   the list on stdout, e.g. "2,3" for count=2 start=2
# NOTE(review): only the signature, the locals and the start of the counting
# loop were legible in the reviewed text; the loop body and the output
# format are reconstructed - confirm.
#######################################
generate_affinity_mask() {
  local count=$1
  local start=${2:-0}
  local mask=""
  local i

  for ((i = 0; i < count; i++)); do
    if [[ -n "${mask}" ]]; then
      mask+=","
    fi
    mask+="$((start + i))"
  done
  printf '%s\n' "${mask}"
}

# NOTE(review): the definitions of the next four settings were not legible in
# the reviewed text; they are reconstructed from how the rest of the script
# uses them - confirm the defaults.
# Assumed layout: prefill takes the first devices, decode the ones after it.
PREFILLER_ZE_AFFINITY_MASK=${PREFILLER_ZE_AFFINITY_MASK:-$(generate_affinity_mask "${PREFILLER_TP_SIZE}" 0)}
DECODER_ZE_AFFINITY_MASK=${DECODER_ZE_AFFINITY_MASK:-$(generate_affinity_mask "${DECODER_TP_SIZE}" "${PREFILLER_TP_SIZE}")}
# Directory holding toy_proxy_server.py and test_disagg_accuracy.py; assumed
# to be the directory this script lives in.
EXP_ROOT=${EXP_ROOT:-$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)}
OUTPUT_FILE=${OUTPUT_FILE:-"${EXP_ROOT}/.xpu_accuracy_test_outputs.txt"}

# Upper bound, in seconds, for one server to start answering requests.
SERVER_START_TIMEOUT_S=${SERVER_START_TIMEOUT_S:-1200}

# PIDs of the background servers started by this script.
SERVER_PIDS=()

#######################################
# Stops every background server recorded in SERVER_PIDS.
# Sends TERM first and escalates to KILL after 30 seconds. Safe to call
# repeatedly; a second call finds an empty list and does nothing.
# Globals:   SERVER_PIDS (read, then emptied)
# NOTE(review): the original cleanup() was not legible in the reviewed text;
# this is a reconstruction - confirm it stops everything the servers spawn.
#######################################
cleanup() {
  local pid
  local waited

  for pid in "${SERVER_PIDS[@]}"; do
    kill "${pid}" 2>/dev/null || true
  done
  for pid in "${SERVER_PIDS[@]}"; do
    waited=0
    while kill -0 "${pid}" 2>/dev/null && ((waited < 30)); do
      sleep 1
      waited=$((waited + 1))
    done
    kill -9 "${pid}" 2>/dev/null || true
    # Reap the job; its non-zero status must not trip 'set -e'.
    wait "${pid}" 2>/dev/null || true
  done
  SERVER_PIDS=()
}

# Do not leave servers running when a step fails or the run is interrupted.
trap cleanup EXIT
trap 'exit 1' INT TERM

#######################################
# Blocks until the server at host:port answers an HTTP request.
# Arguments: $1 - host
#            $2 - port
#            $3 - optional PID of the server process; when given, the wait
#                 ends early if that process is no longer running
# Globals:   SERVER_START_TIMEOUT_S (read)
# Returns:   0 once the server answers, 1 on timeout or early server exit
# NOTE(review): the probed path and the timeout were not legible in the
# reviewed text; /v1/completions and 1200 s are assumptions - confirm.
#######################################
wait_for_server() {
  local host=$1
  local port=$2
  local pid=${3:-}
  local deadline=$((SECONDS + SERVER_START_TIMEOUT_S))

  until curl -s --max-time 5 "${host}:${port}/v1/completions" >/dev/null; do
    if [[ -n "${pid}" ]] && ! kill -0 "${pid}" 2>/dev/null; then
      printf 'server %s:%s (pid %s) exited before becoming ready\n' \
        "${host}" "${port}" "${pid}" >&2
      return 1
    fi
    if ((SECONDS >= deadline)); then
      printf 'timed out after %ss waiting for %s:%s\n' \
        "${SERVER_START_TIMEOUT_S}" "${host}" "${port}" >&2
      return 1
    fi
    sleep 1
  done
}

# Prints a command line in re-usable shell-quoted form, one command per line.
log_cmd() {
  printf '%q ' "$@"
  printf '\n'
}

#######################################
# Starts the single-server baseline on device 0 and waits until it is ready.
# Globals:   BASELINE_*, MODEL_NAME, MAX_MODEL_LEN, BLOCK_SIZE,
#            GPU_MEMORY_UTILIZATION (read); SERVER_PIDS (appended)
#######################################
launch_baseline() {
  # Built as an argv array so no value is re-parsed by a shell.
  local -a cmd=(
    env
    ZE_AFFINITY_MASK=0
    VLLM_WORKER_MULTIPROC_METHOD=spawn
    VLLM_ENABLE_V1_MULTIPROCESSING=1
    vllm serve "${MODEL_NAME}"
    --host "${BASELINE_HOST}"
    --port "${BASELINE_PORT}"
    --max-model-len "${MAX_MODEL_LEN}"
    --seed 42
    -tp 1
    --block-size "${BLOCK_SIZE}"
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
    --dtype float16
    --enforce-eager
  )
  local pid

  log_cmd "${cmd[@]}"
  "${cmd[@]}" &
  pid=$!
  SERVER_PIDS+=("${pid}")
  sleep 10
  wait_for_server "${BASELINE_HOST}" "${BASELINE_PORT}" "${pid}"
}

#######################################
# Starts the prefill and decode servers and waits until both are ready.
# Both use the same NixlConnector kv-transfer config; only the prefill server
# gets the NIXL side-channel host/port set explicitly.
# Globals:   PREFILL_*, DECODE_*, *_ZE_AFFINITY_MASK, *_TP_SIZE, MODEL_NAME,
#            MAX_MODEL_LEN, BLOCK_SIZE, GPU_MEMORY_UTILIZATION,
#            KV_BUFFER_DEVICE (read); SERVER_PIDS (appended)
#######################################
launch_pd() {
  local kv_config
  kv_config=$(printf \
    '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device":"%s"}' \
    "${KV_BUFFER_DEVICE}")

  local -a prefill_cmd=(
    env
    ZE_AFFINITY_MASK="${PREFILLER_ZE_AFFINITY_MASK}"
    VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
    VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_HOST}"
    VLLM_NIXL_SIDE_CHANNEL_PORT="${PREFILL_NIXL_SIDE_PORT}"
    VLLM_WORKER_MULTIPROC_METHOD=spawn
    VLLM_ENABLE_V1_MULTIPROCESSING=1
    vllm serve "${MODEL_NAME}"
    --host "${PREFILL_HOST}"
    --port "${PREFILL_PORT}"
    --max-model-len "${MAX_MODEL_LEN}"
    --seed 42
    --block-size "${BLOCK_SIZE}"
    --enforce-eager
    --dtype float16
    -tp "${PREFILLER_TP_SIZE}"
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
    --kv-transfer-config "${kv_config}"
  )

  local -a decode_cmd=(
    env
    ZE_AFFINITY_MASK="${DECODER_ZE_AFFINITY_MASK}"
    VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
    VLLM_WORKER_MULTIPROC_METHOD=spawn
    VLLM_ENABLE_V1_MULTIPROCESSING=1
    vllm serve "${MODEL_NAME}"
    --host "${DECODE_HOST}"
    --port "${DECODE_PORT}"
    --max-model-len "${MAX_MODEL_LEN}"
    --seed 42
    --block-size "${BLOCK_SIZE}"
    --enforce-eager
    -tp "${DECODER_TP_SIZE}"
    --dtype float16
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
    --kv-transfer-config "${kv_config}"
  )
  local prefill_pid
  local decode_pid

  log_cmd "${prefill_cmd[@]}"
  log_cmd "${decode_cmd[@]}"
  sleep 2

  # Start both servers, then wait for each in turn.
  "${prefill_cmd[@]}" &
  prefill_pid=$!
  SERVER_PIDS+=("${prefill_pid}")
  "${decode_cmd[@]}" &
  decode_pid=$!
  SERVER_PIDS+=("${decode_pid}")
  sleep 1
  wait_for_server "${PREFILL_HOST}" "${PREFILL_PORT}" "${prefill_pid}"
  sleep 1
  wait_for_server "${DECODE_HOST}" "${DECODE_PORT}" "${decode_pid}"
  sleep 1
}

#######################################
# Starts toy_proxy_server.py in front of the prefill and decode servers.
# Only pauses two seconds afterwards; it does not probe the proxy.
# Globals:   EXP_ROOT, PREFILL_*, DECODE_*, PROXY_* (read);
#            SERVER_PIDS (appended)
#######################################
launch_pd_proxy() {
  local -a cmd=(
    python3 "${EXP_ROOT}/toy_proxy_server.py"
    --prefiller-host "${PREFILL_HOST}" --prefiller-port "${PREFILL_PORT}"
    --decoder-host "${DECODE_HOST}" --decoder-port "${DECODE_PORT}"
    --host="${PROXY_HOST}" --port "${PROXY_PORT}"
  )

  log_cmd "${cmd[@]}"
  "${cmd[@]}" &
  SERVER_PIDS+=("$!")
  sleep 2
}

#######################################
# Runs test_disagg_accuracy.py against one endpoint.
# Arguments: $1 - base URL of the service under test
#            $2 - mode passed through to the test ("baseline" or "disagg")
# Globals:   EXP_ROOT, MODEL_NAME, OUTPUT_FILE (read)
#######################################
run_tests() {
  local service_url=$1
  local mode=$2

  python3 "${EXP_ROOT}/test_disagg_accuracy.py" \
    --service_url="${service_url}" \
    --model_name="${MODEL_NAME}" \
    --mode="${mode}" \
    --file_name="${OUTPUT_FILE}"
}

# Run the non-disaggregated baseline and save its outputs.
launch_baseline
run_tests "http://${BASELINE_HOST}:${BASELINE_PORT}" "baseline"
cleanup
sleep 10

# Run disaggregated serving and exact-match against the baseline outputs.
launch_pd
launch_pd_proxy
run_tests "http://${PROXY_HOST}:${PROXY_PORT}" "disagg"
echo "-----P/D success----"

# -f: a missing output file must not fail a run that has already passed.
rm -f -- "${OUTPUT_FILE}"
cleanup

exit 0