[Core][KVConnector] Support HMA+NixlConnector (#35758)
Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
@@ -12,6 +12,7 @@ tp_configs=(
|
||||
"GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case
|
||||
"GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
|
||||
"GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
|
||||
"GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it VLLM_SERVE_EXTRA_ARGS=--max-model-len,8192" # SW model
|
||||
)
|
||||
dp_ep_configs=(
|
||||
"DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1)
|
||||
@@ -26,6 +27,14 @@ else
|
||||
configs=("${tp_configs[@]}")
|
||||
fi
|
||||
|
||||
# Opt-in HMA run: when ENABLE_HMA_FLAG is set to any non-empty value, rewrite
# every entry of the selected `configs` array so that it starts with
# "ENABLE_HMA_FLAG=1 ". The assignment is *prepended* (not appended) to each
# space-separated config string; the rest of each entry is left untouched.
# NOTE(review): presumably each config string is later expanded into
# environment assignments for the per-config test run -- confirm in run_tests.
if [[ -n "${ENABLE_HMA_FLAG:-}" ]]; then
  echo "ENABLE_HMA_FLAG is set, appending ENABLE_HMA_FLAG=1 to each config"
  # Iterate over indices (not values) so each element can be rewritten in place.
  for i in "${!configs[@]}"; do
    configs[$i]="ENABLE_HMA_FLAG=1 ${configs[$i]}"
  done
fi
|
||||
|
||||
run_tests() {
|
||||
local label=$1
|
||||
local extra_args=$2
|
||||
|
||||
@@ -5,6 +5,12 @@ set -xe
|
||||
# Script-level defaults.
# NOTE(review): presumably the command-line parsing that follows can override
# these (a --kv_buffer_device option is handled there) -- confirm.
KV_BUFFER_DEVICE="cuda" # Default to cuda
ATTENTION_BACKEND=""    # Default to empty (use vllm default)
CROSS_LAYERS_BLOCKS="False"
ENABLE_HMA_VAR=""       # Default to empty (HMA disabled by default for kv connector)

# ENABLE_HMA_FLAG is an environment-variable switch: any non-empty value turns
# HMA on by selecting the CLI flag that is later appended to the server
# commands. The `:-` guard keeps the check safe when the variable is unset.
if [[ -n "${ENABLE_HMA_FLAG:-}" ]]; then
  ENABLE_HMA_VAR="--no-disable-hybrid-kv-cache-manager"
fi
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--kv_buffer_device)
|
||||
@@ -31,6 +37,12 @@ echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE"
|
||||
# Report the optional settings that are active for this run. Each message is
# printed only when the corresponding value is non-empty.
if [[ -n "$ATTENTION_BACKEND" ]]; then
  echo "Using attention backend: $ATTENTION_BACKEND"
fi
if [[ -n "$ENABLE_HMA_VAR" ]]; then
  echo "HMA (Hybrid KV Cache Manager) enabled"
fi
# VLLM_SERVE_EXTRA_ARGS only receives its default further down the script, so
# it may still be unset here; `:-` keeps this check safe even under `set -u`.
if [[ -n "${VLLM_SERVE_EXTRA_ARGS:-}" ]]; then
  echo "vLLM serve extra args: ${VLLM_SERVE_EXTRA_ARGS}"
fi
|
||||
|
||||
DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD
|
||||
if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then
|
||||
@@ -70,6 +82,8 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
|
||||
# Environment-overridable tunables: a value already present in the environment
# wins; otherwise the default after `:-` is used.
GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2} # passed as --gpu-memory-utilization
PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-128}
DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-128}           # passed as --block-size to the decoder
# Comma-separated extra args for vllm serve (e.g. --max-model-len,2048).
# The value is split on "," and each piece is appended to the serve command,
# so an individual argument cannot itself contain a comma.
VLLM_SERVE_EXTRA_ARGS=${VLLM_SERVE_EXTRA_ARGS:-}
|
||||
|
||||
# Find the git repository root directory
|
||||
GIT_ROOT=$(git rev-parse --show-toplevel)
|
||||
@@ -151,14 +165,24 @@ run_tests_for_model() {
|
||||
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
|
||||
--tensor-parallel-size $PREFILLER_TP_SIZE \
|
||||
--kv-transfer-config '$KV_CONFIG'"
|
||||
if [[ -n "$VLLM_SERVE_EXTRA_ARGS" ]]; then
|
||||
IFS=',' read -r -a extra_args <<< "$VLLM_SERVE_EXTRA_ARGS"
|
||||
for arg in "${extra_args[@]}"; do
|
||||
BASE_CMD="${BASE_CMD} $arg"
|
||||
done
|
||||
fi
|
||||
|
||||
# Add attention backend config if specified
|
||||
if [[ -n "$ATTENTION_BACKEND" ]]; then
|
||||
BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
|
||||
fi
|
||||
|
||||
# Add HMA flag if specified
|
||||
if [[ -n "$ENABLE_HMA_VAR" ]]; then
|
||||
BASE_CMD="${BASE_CMD} $ENABLE_HMA_VAR"
|
||||
fi
|
||||
|
||||
FULL_CMD="$BASE_CMD"
|
||||
|
||||
eval "$FULL_CMD &"
|
||||
|
||||
# Store host and port for proxy configuration
|
||||
@@ -193,12 +217,23 @@ run_tests_for_model() {
|
||||
--block-size ${DECODE_BLOCK_SIZE} \
|
||||
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
|
||||
--kv-transfer-config '$KV_CONFIG'"
|
||||
if [[ -n "$VLLM_SERVE_EXTRA_ARGS" ]]; then
|
||||
IFS=',' read -r -a extra_args <<< "$VLLM_SERVE_EXTRA_ARGS"
|
||||
for arg in "${extra_args[@]}"; do
|
||||
BASE_CMD="${BASE_CMD} $arg"
|
||||
done
|
||||
fi
|
||||
|
||||
# Add attention backend config if specified
|
||||
if [[ -n "$ATTENTION_BACKEND" ]]; then
|
||||
BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
|
||||
fi
|
||||
|
||||
# Add HMA flag if specified
|
||||
if [[ -n "$ENABLE_HMA_VAR" ]]; then
|
||||
BASE_CMD="${BASE_CMD} $ENABLE_HMA_VAR"
|
||||
fi
|
||||
|
||||
# DP-EP attention mode
|
||||
if [[ -z "$DP_EP" ]]; then
|
||||
BASE_CMD="${BASE_CMD} --tensor-parallel-size $DECODER_TP_SIZE"
|
||||
|
||||
@@ -17,6 +17,7 @@ EXPECTED_VALUES = {
|
||||
"deepseek-ai/deepseek-vl2-small": 0.59,
|
||||
"deepseek-ai/deepseek-vl2-tiny": 0.19,
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": 0.65,
|
||||
"google/gemma-3-4b-it": 0.74,
|
||||
}
|
||||
|
||||
SIMPLE_PROMPT = (
|
||||
|
||||
Reference in New Issue
Block a user