Compare commits
v0.15.2rc0...v0.16.0rc0
201 commits
SHA1 of each commit in the range:

bbe0574d8e 4d9513537d 439afa4eea fa4e0fb028 ce498a6d61 9f14c9224d 535de06cb1 4292c90a2a 6e98f6d8b6 2f6d17cb2f
192ad4648b 0e92298622 87d9a26166 80f921ba4b 711edaf0d0 1d367a738e 32a02c7ca2 f67ee8b859 e57ef99b40 f8516a1ab9
824058076c 8e32690869 a208439537 bcd2f74c0d f79f777803 4c8d1bf361 061da6bcf7 4403e3ed4c 08e094997e d88a1df699
90d74ebaa4 45f8fd6f97 5e1e0a0fbd eb5ed20743 2647163674 9fb27dd3b3 4dffc5e044 e1bf04b6c2 02080179a3 1b8fe6f7c4
52ee21021a 655efb3e69 bd8da29a66 2a99c5a6c8 3f7662d650 a372f3f40a 61e632aea1 b1bb18de8d 2267cb1cfd 0d6ccf68fa
18e7cbbb15 f0d5251715 5c4f2dd6ef f3d8a34671 4bc913aeec fbb3cf6981 2df2b3499d 2a8d84e66d a3acfa1071 be8168ff88
f6af34626d ceab70c89d 52683ccbe1 e346e2d056 83449a5ff0 dad2d6a590 32e84fa1ff fd9c83d0e0 b95cc5014d 61397891ce
ef248ff740 e10604480b bf001da4bf a0a984ac2e f1cb9b5544 4c4b6f7a97 10546f925a e69c990c21 5eac9a1b34 1b60b45d0d
4b3803d180 5019c59dd2 089cd4f002 0130223bd9 5d1aef3004 ffe1fc7a28 8b7346d5f1 6141ebe0dd 199e3cb476 9f8cb81b44
d7e17aaacd 528e9b1490 d95b4be47a 4061dcf4c5 0aca8b8c62 9eb58f8cf1 b10d05b8a8 b398e5c819 78061ef584 528b3076af
a502831d36 ba871fb788 ab374786c7 808dd87b30 beb8899482 ce88756b96 a3154a6092 7c036432fc 318b120766 c3b40dc3e7
a01ef3fa51 7320ca3942 cf0a99f84d e535d90deb 0b225fb7b2 46b4a02794 8869cd8ec1 cd86fff38f b5f8c3092d 21997f45b1
672023877b 754a8ca942 302ecf64ff b6bb2842cf 79b6ec6aab d6416fdde9 0fb3157267 a358e4dffe 079781177a 63c0889416
1e86c802d4 fedf64332e 2238a12c13 ce0afe2451 88c3e114d8 92924b2ddd 27cb2f678f 22d9a056d5 13b842f271 15f40b20aa
793af538a3 6f5e7cda57 68feb76a6f 4cb59dea6a 608b556507 f0a1c8453a 8980001c93 527bcd14d4 f68e3ea4e1 d5c41db35b
1618e25492 f3888aca83 f0bca83ee4 73419abfae e77f162cf5 8ecd213c0b 5b55c0bea7 15e0bb9c42 6c64c41b4a a2ef06e1b3
0a3c71e7e5 29fba76781 9df152bbf6 876a16f4fb aaa901ad55 010ec0c30e 64a40a7ab4 31aedfe7d6 67ebaff528 2b465570e6
9ca66ecc10 c3a9752b0c f451b4558b 3f96fcf646 6c1f9e4c18 67239c4c42 8ece60768f fd0e377244 f857a03f6b 74898a7015
8f5d51203b ae5b7aff2b a11bc12d53 58cb55e4de cf896ae0e3 c5113f60f2 174f16700b 8e2ad97ad0 10152d2194 1a7894dbdf
c87eac18f7 f45870b53f ba45bedfd1 9432ed8c7e 726d89720c d334dd26c4 070c811d6f 8bfc8d5600 ec51831a22 80b918f2bd
c46b0cd0af
@@ -0,0 +1,15 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.695
  - name: "exact_match,flexible-extract"
    value: 0.447
limit: 1319
num_fewshot: 5
max_model_len: 262144
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true

@@ -0,0 +1,19 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.7142
  - name: "exact_match,flexible-extract"
    value: 0.4579
env_vars:
  VLLM_USE_FLASHINFER_MOE_FP8: "1"
  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
limit: 1319
num_fewshot: 5
max_model_len: 262144
kv_cache_dtype: fp8
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true
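For orientation, here is a minimal sketch (not part of the diff) of how a config file like the two above could be turned into an lm-eval invocation. Only the YAML keys come from the configs; the helper itself, the file path, and the exact flag mapping are illustrative assumptions.

```python
import os
import yaml  # pip install pyyaml

def build_lm_eval_command(config_path: str) -> list[str]:
    """Sketch: turn one of the accuracy configs above into an lm_eval command."""
    with open(config_path) as f:
        cfg = yaml.safe_load(f)

    # Env vars such as VLLM_USE_FLASHINFER_MOE_FP8 are exported before launching.
    for key, value in cfg.get("env_vars", {}).items():
        os.environ[key] = str(value)

    model_args = f"pretrained={cfg['model_name']},max_model_len={cfg['max_model_len']}"
    if "kv_cache_dtype" in cfg:
        model_args += f",kv_cache_dtype={cfg['kv_cache_dtype']}"

    cmd = [
        "lm_eval", "--model", "vllm",
        "--model_args", model_args,
        "--tasks", ",".join(t["name"] for t in cfg["tasks"]),
        "--num_fewshot", str(cfg["num_fewshot"]),
        "--limit", str(cfg["limit"]),
    ]
    if cfg.get("apply_chat_template"):
        cmd.append("--apply_chat_template")
    if cfg.get("fewshot_as_multiturn"):
        cmd.append("--fewshot_as_multiturn")
    return cmd

print(build_lm_eval_command("NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml"))
```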
@@ -1 +1,2 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml

@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
@@ -393,6 +393,11 @@ run_serving_tests() {
    fi
  fi

  # save the compilation mode and optimization level on the serving results
  # whenever they are set
  compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
  optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')

  # iterate over different QPS
  for qps in $qps_list; do
    # remove the surrounding single quote from qps

@@ -406,15 +411,15 @@ run_serving_tests() {
    for max_concurrency in $max_concurrency_list; do
      new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
      echo " new test name $new_test_name"
      # pass the tensor parallel size to the client so that it can be displayed
      # on the benchmark dashboard
      # pass the tensor parallel size, the compilation mode, and the optimization
      # level to the client so that they can be used on the benchmark dashboard
      client_command="vllm bench serve \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        --max-concurrency $max_concurrency \
        --metadata "tensor_parallel_size=$tp" \
        --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
        $client_args $client_remote_args "

      echo "Running test case $test_name with qps $qps"
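As a side note, the two jq expressions above fall back to an empty string when the key is missing from the server params. A small sketch of the same lookup in Python (the example values are made up, not taken from any real test config):

```python
import json

def optional_field(server_params: dict, key: str) -> str:
    """Equivalent of jq's '."<key>" // empty': the value if present, else ""."""
    value = server_params.get(key)
    return "" if value is None else str(value)

server_params = json.loads('{"compilation_config.mode": 3}')
print(optional_field(server_params, "compilation_config.mode"))  # "3"
print(optional_field(server_params, "optimization_level"))       # ""
```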
@@ -86,5 +86,27 @@ docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130

# CPU images (vllm/vllm-openai-cpu)
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}

docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai-cpu:latest-x86_64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64

docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker push vllm/vllm-openai-cpu:latest-arm64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64

docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker manifest push vllm/vllm-openai-cpu:latest
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
\`\`\`
EOF
EOF
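A quick way to sanity-check the resulting multi-arch CPU manifests, once the pushes above have run, is to inspect the pushed tag and list the architectures it covers. This is not part of the release script; it is a sketch that assumes the Docker CLI is installed and the tag is a manifest list.

```python
import json
import subprocess

def manifest_architectures(image: str) -> list[str]:
    """List the architectures behind a multi-arch tag via `docker manifest inspect`."""
    out = subprocess.run(
        ["docker", "manifest", "inspect", image],
        check=True, capture_output=True, text=True,
    ).stdout
    manifest = json.loads(out)
    return [m["platform"]["architecture"] for m in manifest.get("manifests", [])]

# Expected to report both amd64 and arm64 entries after the pushes above.
print(manifest_architectures("vllm/vllm-openai-cpu:latest"))
```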
@@ -87,7 +87,7 @@ mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

commands=$@
echo "Commands:$commands"
echo "Raw commands: $commands"

commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}

@@ -169,6 +169,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
    --ignore=entrypoints/llm/test_prompt_validation.py "}
fi

commands=$(echo "$commands" | sed 's/ \\ / /g')
echo "Final commands: $commands"

# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py

@@ -176,7 +179,6 @@ fi
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13

PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."

# Test that we're launching on the machine that has

@@ -187,56 +189,7 @@ if [[ -z "$render_gid" ]]; then
  exit 1
fi

# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used
  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
    # assign shard-id for each shard
    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    docker run \
      --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
      --network=host \
      --shm-size=16gb \
      --group-add "$render_gid" \
      --rm \
      -e HIP_VISIBLE_DEVICES="${GPU}" \
      -e HF_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e "HF_HOME=${HF_MOUNT}" \
      -e "PYTHONPATH=${MYPYTHONPATH}" \
      --name "${container_name}_${GPU}" \
      "${image_name}" \
      /bin/bash -c "${commands_gpu}" \
      |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done
  #wait for all processes to finish and collect exit codes
  for pid in "${PIDS[@]}"; do
    wait "${pid}"
    STATUS+=($?)
  done
  at_least_one_shard_with_tests=0
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    elif [[ ${st} -eq 5 ]]; then
      echo "Shard exited with status 5 (no tests collected) - treating as success"
    else # This means st is 0
      at_least_one_shard_with_tests=1
    fi
  done
  if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
    echo "All shards reported no tests collected. Failing the build."
    exit 1
  fi

elif [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then

export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
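The shard bookkeeping above leans on pytest's convention that exit code 5 means "no tests were collected". A compact sketch of the same aggregation logic, for reference (the status lists are illustrative):

```python
def aggregate_shard_statuses(statuses: list[int]) -> int:
    """Mirror the shell logic: fail on real errors, tolerate exit code 5
    (pytest: no tests collected), but fail if *every* shard collected nothing."""
    collected_somewhere = False
    for status in statuses:
        if status not in (0, 5):
            return status          # a shard genuinely failed
        if status == 0:
            collected_somewhere = True
    if statuses and not collected_somewhere:
        return 1                   # all shards reported "no tests collected"
    return 0

assert aggregate_shard_statuses([0, 5, 0]) == 0
assert aggregate_shard_statuses([5, 5]) == 1
assert aggregate_shard_statuses([0, 2]) == 2
```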
@@ -5,7 +5,9 @@
set -exuo pipefail

# Try building the docker image
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
cat <<EOF | docker build -t ${image_name} -f - .
FROM gaudi-base-image:latest

COPY ./ /workspace/vllm

@@ -15,7 +17,8 @@ WORKDIR /workspace/vllm
ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

RUN VLLM_TARGET_DEVICE=empty pip install .
RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

# install development dependencies (for testing)

@@ -36,15 +39,20 @@ EOF
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
remove_docker_containers() { docker rm -f ${container_name} || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
remove_docker_containers

echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
docker run --rm --runtime=habana --name=${container_name} --network=host \
  -e HABANA_VISIBLE_DEVICES=all \
  hpu-plugin-v1-test-env \
  /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
  -e VLLM_SKIP_WARMUP=true \
  -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
  -e PT_HPU_LAZY_MODE=1 \
  "${image_name}" \
  /bin/bash -c '
    cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
  '

EXITCODE=$?
if [ $EXITCODE -eq 0 ]; then
@@ -38,10 +38,11 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
@@ -43,7 +43,6 @@ trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \

@@ -52,6 +51,7 @@ for BACK in "${BACKENDS[@]}"; do
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
    --all2all-backend $BACK \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT
@@ -542,7 +542,7 @@ steps:
- label: LoRA Test %N # 20min each
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/lora

@@ -604,9 +604,11 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting -k string with a -
  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
  # # Limit to no custom ops to reduce running time
  # # Wrap with quotes to escape yaml and avoid starting -k string with a -
  # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
  # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
  # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

- label: Cudagraph test
  timeout_in_minutes: 20

@@ -636,7 +638,7 @@ steps:
- label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/attention/

@@ -651,7 +653,7 @@ steps:
- label: Kernels Quantization Test %N # 64min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/quantization/

@@ -664,7 +666,7 @@ steps:
- label: Kernels MoE Test %N # 40min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/

@@ -742,7 +744,7 @@ steps:
- label: Benchmarks # 11min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:

@@ -753,7 +755,7 @@ steps:
- label: Benchmarks CLI Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/

@@ -827,7 +829,7 @@ steps:
- label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:

@@ -888,7 +890,7 @@ steps:
- label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:

@@ -909,7 +911,7 @@ steps:
- label: Language Models Tests (Hybrid) %N
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:

@@ -1181,7 +1183,6 @@ steps:
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
  - nvidia-smi

@@ -1189,33 +1190,16 @@ steps:
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # Wrap with quotes to escape yaml
  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"

  # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # # Wrap with quotes to escape yaml
  # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
  # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
  # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true
  num_gpus: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
  - nvidia-smi
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"

@@ -1566,7 +1550,10 @@ steps:
  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
  # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
  # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
  # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
  - pytest -v -s tests/distributed/test_context_parallel.py
  - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
@@ -537,9 +537,11 @@ steps:
  commands:
  # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting -k string with a -
  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
  # # Limit to no custom ops to reduce running time
  # # Wrap with quotes to escape yaml and avoid starting -k string with a -
  # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
  # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
  # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

- label: Cudagraph test
  timeout_in_minutes: 20

@@ -1069,7 +1071,6 @@ steps:
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
  - nvidia-smi

@@ -1077,75 +1078,15 @@ steps:
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # Wrap with quotes to escape yaml
  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
  # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # # Wrap with quotes to escape yaml
  # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
  # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
  # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true
  num_gpus: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
  - nvidia-smi
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

- label: Hopper Fusion E2E Tests (H100) # 10min
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/"
  gpu: h100
  optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/test_fusion_attn.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  # skip Llama-4 since it does not fit on this device
  - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'

- label: Hopper Fusion Distributed E2E Tests (2xH100) # 70min
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/"
  gpu: h100
  optional: true
  num_gpus: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py

- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
@@ -2,56 +2,196 @@ group: Compile
depends_on:
- image-build
steps:
- label: Fusion and Compile Tests (B200)
- label: Sequence Parallel Tests (2 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
  num_devices: 2
  source_file_dependencies:
  - vllm/model_executor/layers/
  - vllm/compilation/
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
  - tests/distributed/test_sequence_parallel.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/distributed/test_sequence_parallel.py

- label: Sequence Parallel Tests (2xH100)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
  device: h100
  optional: true
  num_devices: 2
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/distributed/test_sequence_parallel.py

- label: Distributed Compile Unit Tests (2xH100)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/model_executor/layers
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_sequence_parallelism.py
  - tests/compile/distributed/test_async_tp.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
  - pytest -v -s tests/compile/distributed/test_async_tp.py

- label: Fusion and Compile Unit Tests (B200)
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
  device: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/quantization/
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
  # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
  - nvidia-smi
  - pytest -v -s tests/compile/test_fusion_attn.py
  - pytest -v -s tests/compile/test_fusion_attn.py -k FLASHINFER
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_devices=2 is not set
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # Wrap with quotes to escape yaml
  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
  # TODO(luka) move to H100 once pass tests run on H100
  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Fusion E2E (2 GPUs)(B200)
  timeout_in_minutes: 40
- label: Fusion E2E Quick (H100)
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/"
  device: b200
  optional: true
  num_devices: 2
  device: h100
  num_devices: 1
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  - csrc/quantization/
  - vllm/model_executor/
  - vllm/v1/attention/
  - vllm/compilation/
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
  # Run all models and attn backends but only Inductor partition and native custom ops
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
  # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"

- label: Fusion E2E Config Sweep (H100)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 1
  source_file_dependencies:
  - csrc/quantization/
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run just llama3 (fp8) for all config combinations
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"

- label: Fusion E2E Config Sweep (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  device: b200
  num_devices: 1
  optional: true
  commands:
  - nvidia-smi
  # Run all models and attn backends but only Inductor partition and native custom ops
  # -k "inductor_partition and not +rms_norm and not +quant_fp8"
  # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
  # -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
  # Run just llama3 (fp8 & fp4) for all config combinations
  # -k "llama-3"
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"

- label: Fusion E2E TP2 Quick (H100)
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/
  - vllm/v1/attention/
  - vllm/compilation/
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run all models and attn backends but only Inductor partition and native custom ops
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"

- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run just llama3 (fp4 & fp8 & bf16) for all config combinations
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"

- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run just llama3 (fp8 & bf16) for all config combinations
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"

- label: Fusion E2E TP2 (B200)
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
  device: b200
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/
  - vllm/v1/attention/
  - vllm/compilation/
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run all models and attn backends but only Inductor partition and native custom ops
  # for ar-rms-quant-fp4, also sweep llama3
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4"
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
@@ -16,7 +16,7 @@ steps:
  - pytest -v -s distributed/test_shm_storage.py

- label: Distributed (2 GPUs)
  timeout_in_minutes: 90
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_devices: 2
  source_file_dependencies:

@@ -47,7 +47,6 @@ steps:
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

@@ -133,25 +132,13 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: Sequence Parallel Tests (H100)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  device: h100
  optional: true
  num_devices: 2
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  # Run sequence parallel tests
  - pytest -v -s tests/distributed/test_sequence_parallel.py
  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py

- label: Distributed Tests (2 GPUs)(H100)
  timeout_in_minutes: 15
  device: h100
  optional: true
  working_dir: "/vllm-workspace/"
  num_devices: 2
  commands:
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
  - pytest -v -s tests/distributed/test_context_parallel.py
  - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
  - pytest -v -s tests/v1/distributed/test_dbo.py

@@ -217,45 +204,3 @@ steps:
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py

- label: Hopper Fusion E2E Tests (H100)
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/"
  device: h100
  optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/test_fusion_attn.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  # skip Llama-4 since it does not fit on this device
  - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'

- label: Hopper Fusion Distributed E2E Tests (2xH100)
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/"
  device: h100
  optional: true
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
@@ -16,7 +16,7 @@ steps:
  - pytest -v -s v1/sample
  - pytest -v -s v1/logits_processors
  - pytest -v -s v1/worker
  - pytest -v -s v1/spec_decode
  - pytest -v -s -m 'not slow_test' v1/spec_decode
  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'not cpu_test' v1/metrics
  - pytest -v -s v1/test_oracle.py

@@ -166,4 +166,18 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pip install pytest-timeout pytest-forked
  - pytest -v -s v1/determinism/test_batch_invariance.py
  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py

- label: Acceptance Length Test (Large Models) # optional
  timeout_in_minutes: 25
  gpu: h100
  optional: true
  num_gpus: 1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/model_executor/models/mlp_speculator.py
  - tests/v1/spec_decode/test_acceptance_length.py
  commands:
  - export VLLM_ALLOW_INSECURE_SERIALIZATION=1
  - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
@@ -18,7 +18,7 @@ steps:
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

- label: PyTorch Fullgraph Smoke Test
  timeout_in_minutes: 30
  timeout_in_minutes: 35
  source_file_dependencies:
  - vllm/
  - tests/compile

@@ -30,16 +30,13 @@ steps:
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"

- label: PyTorch Fullgraph
  timeout_in_minutes: 40
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting -k string with a -
  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

- label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
@@ -842,6 +842,7 @@ class BenchmarkTensors:
    "sorted_token_ids": sorted_token_ids,
    "expert_ids": expert_ids,
    "num_tokens_post_padded": num_tokens_post_padded,
    "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
    "top_k_num": ctx.top_k_num,
    "device": self.input.device,
    "N": lora_rank,

@@ -915,6 +916,7 @@ class BenchmarkTensors:
    "sorted_token_ids": sorted_token_ids,
    "expert_ids": expert_ids,
    "num_tokens_post_padded": num_tokens_post_padded,
    "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
    "top_k_num": ctx.top_k_num,
    "device": self.input.device,
    "N": lora_rank,
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
    TritonOrDeepGemmExperts,
)
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser

@@ -482,6 +481,8 @@ class BenchmarkWorker:
        block_quant_shape: list[int] = None,
        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
        # local import to allow serialization by ray

        set_random_seed(self.seed)
        dtype_str = _get_config_dtype_str(
            dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8

@@ -535,6 +536,9 @@ class BenchmarkWorker:
        block_quant_shape: list[int],
        use_deep_gemm: bool,
    ) -> dict[str, int]:
        # local import to allow serialization by ray
        from vllm.platforms import current_platform

        best_config = None
        best_time = float("inf")
        if current_platform.is_rocm():

@@ -646,20 +650,28 @@ def save_configs(
        f.write("\n")


def get_compressed_tensors_block_structure(config, default_value=None):
    config_groups = config.get("config_groups", {})
    if len(config_groups) != 1:
        return default_value
    group = next(iter(config_groups.values()))
    weights = group.get("weights", {})
    block_structure = weights.get("block_structure", default_value)
    return block_structure


def get_weight_block_size_safety(config, default_value=None):
    quantization_config = getattr(config, "quantization_config", {})
    if isinstance(quantization_config, dict):
        return quantization_config.get("weight_block_size", default_value)
        if "weight_block_size" in quantization_config:
            return quantization_config["weight_block_size"]
        return get_compressed_tensors_block_structure(
            quantization_config, default_value
        )
    return default_value


def main(args: argparse.Namespace):
    print(args)

    config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
    if args.model_prefix:
        config = getattr(config, args.model_prefix)

    def get_model_params(config):
        if config.architectures[0] == "DbrxForCausalLM":
            E = config.ffn_config.moe_num_experts
            topk = config.ffn_config.moe_top_k
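For context, a small usage sketch of the two helpers added in the hunk above, assuming the new version of `get_weight_block_size_safety` (the one that falls back to `get_compressed_tensors_block_structure`) is in scope. The module path in the import and the config values are illustrative stand-ins, not real model configs.

```python
from types import SimpleNamespace

# Assumed import path for this benchmark script; adjust to where the helpers live.
from benchmark_moe import get_weight_block_size_safety

# Checkpoints that expose weight_block_size directly.
fp8_cfg = SimpleNamespace(quantization_config={"weight_block_size": [128, 128]})
print(get_weight_block_size_safety(fp8_cfg))  # [128, 128]

# compressed-tensors style: block info nested under config_groups/.../weights.
ct_cfg = SimpleNamespace(
    quantization_config={
        "config_groups": {"group_0": {"weights": {"block_structure": [128, 128]}}}
    }
)
print(get_weight_block_size_safety(ct_cfg))  # falls through to block_structure

# No quantization config at all -> the default value.
print(get_weight_block_size_safety(SimpleNamespace(), default_value=None))  # None
```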
@@ -677,6 +689,7 @@ def main(args: argparse.Namespace):
            "Glm4MoeForCausalLM",
            "Glm4MoeLiteForCausalLM",
            "NemotronHForCausalLM",
            "MistralLarge3ForCausalLM",
        ):
            E = config.n_routed_experts
            topk = config.num_experts_per_tok

@@ -697,16 +710,20 @@ def main(args: argparse.Namespace):
            topk = text_config.num_experts_per_tok
            intermediate_size = text_config.moe_intermediate_size
            hidden_size = text_config.hidden_size
        elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
        elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
            E = config.num_experts
            topk = config.moe_topk[0]
            intermediate_size = config.moe_intermediate_size[0]
            hidden_size = config.hidden_size
        elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
        elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
            E = config.thinker_config.text_config.num_experts
            topk = config.thinker_config.text_config.num_experts_per_tok
            intermediate_size = config.thinker_config.text_config.moe_intermediate_size
            hidden_size = config.thinker_config.text_config.hidden_size
        elif config.architectures[0] == "PixtralForConditionalGeneration":
            # Pixtral can contain different LLM architectures,
            # recurse to get their parameters
            return get_model_params(config.get_text_config())
        else:
            # Support for llama4
            config = config.get_text_config()

@@ -715,6 +732,16 @@ def main(args: argparse.Namespace):
            topk = config.num_experts_per_tok
            intermediate_size = config.intermediate_size
            hidden_size = config.hidden_size
        return E, topk, intermediate_size, hidden_size


def main(args: argparse.Namespace):
    print(args)

    config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
    if args.model_prefix:
        config = getattr(config, args.model_prefix)
    E, topk, intermediate_size, hidden_size = get_model_params(config)
    enable_ep = bool(args.enable_expert_parallel)
    if enable_ep:
        ensure_divisibility(E, args.tp_size, "Number of experts")
@@ -14,7 +14,7 @@ from vllm._custom_ops import (
)
from vllm.platforms import CpuArchEnum, current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa


@@ -58,7 +58,7 @@ def main(
    seed: int = 0,
    iters: int = 20,
) -> None:
    current_platform.seed_everything(seed)
    set_random_seed(seed)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    kv_lens = [x[1] for x in seq_lens]
@@ -7,8 +7,8 @@ import time
import numpy as np
import torch

from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import set_random_seed

# Check if CPU MoE operations are available
try:

@@ -41,7 +41,7 @@ def main(
    seed: int = 0,
    iters: int = 20,
) -> None:
    current_platform.seed_everything(seed)
    set_random_seed(seed)
    # up_dim = 2 * intermediate_size for gate + up projection
    up_dim = 2 * intermediate_size
@@ -359,6 +359,19 @@ else()
  add_compile_definitions(-DVLLM_NUMA_DISABLED)
endif()

#
# Generate CPU attention dispatch header
#
message(STATUS "Generating CPU attention dispatch header")
execute_process(
  COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/csrc/cpu/generate_cpu_attn_dispatch.py
  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/csrc/cpu
  RESULT_VARIABLE GEN_RESULT
)
if(NOT GEN_RESULT EQUAL 0)
  message(FATAL_ERROR "Failed to generate CPU attention dispatch header")
endif()

#
# _C extension
#
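The build step above runs csrc/cpu/generate_cpu_attn_dispatch.py to produce the cpu_attn_dispatch_generated.h header that replaces the hand-written dispatch macros later in this diff. The generator itself is not shown here; purely as an illustration of the idea (emitting one entry per supported head size), a hypothetical generator could look like this:

```python
# Hypothetical sketch of a dispatch-header generator; the real
# csrc/cpu/generate_cpu_attn_dispatch.py is not shown in this diff.
# The head-dim list matches the cases in the removed hand-written macros.
HEAD_DIMS = [32, 64, 80, 96, 112, 128, 160, 192, 224, 256]

def emit_header() -> str:
    # X-macro style: the consumer defines CPU_ATTN_CASE(dim) before using this.
    cases = " \\\n".join(f"  CPU_ATTN_CASE({dim})" for dim in HEAD_DIMS)
    return (
        "// Auto-generated by generate_cpu_attn_dispatch.py. Do not edit.\n"
        "#pragma once\n"
        f"#define CPU_ATTN_FOR_EACH_HEAD_DIM \\\n{cases}\n"
    )

if __name__ == "__main__":
    with open("cpu_attn_dispatch_generated.h", "w") as f:
        f.write(emit_header())
```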
@@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
    vllm-flash-attn
    GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
    GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2
    GIT_TAG 2adfc8c2177c5b0e8ddeedfd5a8990d80eb496ff
    GIT_PROGRESS TRUE
    # Don't share the vllm-flash-attn build between build types
    BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -1,79 +1,4 @@
#include "cpu_attn_vec.hpp"
#include "cpu_attn_vec16.hpp"

#ifdef CPU_CAPABILITY_AMXBF16
#include "cpu_attn_amx.hpp"
#define AMX_DISPATCH(...)                                                    \
  case cpu_attention::ISA::AMX: {                                            \
    using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::AMX,  \
                                                   scalar_t, head_dim>;      \
    return __VA_ARGS__();                                                    \
  }
#else
#define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
#endif

#ifdef __aarch64__
#include "cpu_attn_neon.hpp"
// NEON requires head_dim to be a multiple of 32
#define NEON_DISPATCH(...)                                                   \
  case cpu_attention::ISA::NEON: {                                           \
    using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
                                                   scalar_t, head_dim>;      \
    return __VA_ARGS__();                                                    \
  }
#else
#define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
#endif  // #ifdef __aarch64__

#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...)                                \
  case HEAD_DIM: {                                                           \
    constexpr size_t head_dim = HEAD_DIM;                                    \
    return __VA_ARGS__();                                                    \
  }

#define CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, ...)                        \
  [&] {                                                                      \
    switch (HEAD_DIM) {                                                      \
      CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__)                                \
      CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__)                                \
      CPU_ATTN_DISPATCH_CASE(80, __VA_ARGS__)                                \
      CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__)                                \
      CPU_ATTN_DISPATCH_CASE(112, __VA_ARGS__)                               \
      CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__)                               \
      CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__)                               \
      CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__)                               \
      CPU_ATTN_DISPATCH_CASE(224, __VA_ARGS__)                               \
      CPU_ATTN_DISPATCH_CASE(256, __VA_ARGS__)                               \
      default: {                                                             \
        TORCH_CHECK(false, "Invalid CPU attention head_dim: " +              \
                               std::to_string(HEAD_DIM));                    \
      }                                                                      \
    }                                                                        \
  }()

#define CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, ...)                                \
  [&] {                                                                      \
    switch (ISA_TYPE) {                                                      \
      AMX_DISPATCH(__VA_ARGS__)                                              \
      NEON_DISPATCH(__VA_ARGS__)                                             \
      case cpu_attention::ISA::VEC: {                                        \
        using attn_impl =                                                    \
            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t,  \
                                         head_dim>;                          \
        return __VA_ARGS__();                                                \
      }                                                                      \
      case cpu_attention::ISA::VEC16: {                                      \
        using attn_impl =                                                    \
            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC16, scalar_t,\
                                         head_dim>;                          \
        return __VA_ARGS__();                                                \
      }                                                                      \
      default: {                                                             \
        TORCH_CHECK(false, "Invalid CPU attention ISA type.");               \
      }                                                                      \
    }                                                                        \
  }()
#include "cpu_attn_dispatch_generated.h"

torch::Tensor get_scheduler_metadata(
    const int64_t num_req, const int64_t num_heads_q,

@@ -122,16 +47,14 @@ torch::Tensor get_scheduler_metadata(
  input.enable_kv_split = enable_kv_split;

  VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
    CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
      CPU_ATTN_DISPATCH_IMPL(isa, [&]() {
        input.elem_size = sizeof(scalar_t);
        input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
        input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
        input.output_buffer_elem_size =
            sizeof(attn_impl::partial_output_buffer_t);
        input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
        input.kv_block_alignment = attn_impl::BlockSizeAlignment;
      });
    CPU_ATTN_DISPATCH(head_dim, isa, [&]() {
      input.elem_size = sizeof(scalar_t);
      input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
      input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
      input.output_buffer_elem_size =
          sizeof(attn_impl::partial_output_buffer_t);
      input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
      input.kv_block_alignment = attn_impl::BlockSizeAlignment;
    });
  });

@@ -184,18 +107,14 @@ void cpu_attn_reshape_and_cache(

  VLLM_DISPATCH_FLOATING_TYPES(
      key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
        CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
          CPU_ATTN_DISPATCH_IMPL(isa_tag, [&]() {
            attn_impl::reshape_and_cache(
                key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
                key_cache.data_ptr<scalar_t>(),
                value_cache.data_ptr<scalar_t>(),
                slot_mapping.data_ptr<int64_t>(), token_num,
                key_token_num_stride, value_token_num_stride, head_num,
                key_head_num_stride, value_head_num_stride, num_blocks,
                num_blocks_stride, cache_head_num_stride, block_size,
                block_size_stride);
          });
        CPU_ATTN_DISPATCH(head_dim, isa_tag, [&]() {
          attn_impl::reshape_and_cache(
              key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
              key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
              slot_mapping.data_ptr<int64_t>(), token_num, key_token_num_stride,
              value_token_num_stride, head_num, key_head_num_stride,
              value_head_num_stride, num_blocks, num_blocks_stride,
              cache_head_num_stride, block_size, block_size_stride);
        });
      });
}

@@ -257,12 +176,10 @@ void cpu_attention_with_kv_cache(

  VLLM_DISPATCH_FLOATING_TYPES(
      query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
        CPU_ATTN_DISPATCH_CASE_HEADDIM(query.size(2), [&] {
          CPU_ATTN_DISPATCH_IMPL(input.metadata->isa, [&]() {
            TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
            cpu_attention::AttentionMainLoop<attn_impl> mainloop;
            mainloop(&input);
          });
        CPU_ATTN_DISPATCH(query.size(2), input.metadata->isa, [&]() {
          TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
          cpu_attention::AttentionMainLoop<attn_impl> mainloop;
          mainloop(&input);
        });
      });
}

@@ -377,7 +377,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
      const int32_t q_heads_per_kv, const int64_t q_num_stride,
      const int64_t q_head_stride, const float scale) {
    constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
    // static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
    static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
    constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
    constexpr int64_t head_elem_num_pre_block =
        AMX_TILE_ROW_BYTES / sizeof(scalar_t);

@@ -816,14 +816,10 @@ struct VecTypeTrait<float> {
  using vec_t = vec_op::FP32Vec16;
};

// ARM only supports BF16 with ARMv8.6-A extension
#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT))
#else
template <>
struct VecTypeTrait<c10::BFloat16> {
  using vec_t = vec_op::BF16Vec16;
};
#endif

#if !defined(__powerpc__) && !defined(__s390x__)
|
||||
template <>
|
||||
@@ -1585,17 +1581,10 @@ class AttentionMainLoop {
|
||||
|
||||
if (use_sink) {
|
||||
alignas(64) float s_aux_fp32[16];
|
||||
#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
|
||||
// ARM without native BF16 support: manual conversion
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
s_aux_fp32[i] = static_cast<float>(curr_s_aux[i]);
|
||||
}
|
||||
#else
|
||||
// All other platforms have BF16Vec16 available
|
||||
vec_op::BF16Vec16 vec_bf16(curr_s_aux);
|
||||
vec_op::FP32Vec16 vec_fp32(vec_bf16);
|
||||
vec_fp32.save(s_aux_fp32);
|
||||
#endif
|
||||
|
||||
float* __restrict__ curr_sum_buffer = sum_buffer;
|
||||
float* __restrict__ curr_max_buffer = max_buffer;
|
||||
|
||||
@@ -264,7 +264,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
|
||||
constexpr static ISA ISAType = ISA::NEON;
|
||||
constexpr static bool scale_on_logits = false; // apply scale on q_buffer
|
||||
|
||||
// static_assert(HeadDim % HeadDimAlignment == 0);
|
||||
static_assert(HeadDim % HeadDimAlignment == 0);
|
||||
// the gemm micro kernel is Mx8
|
||||
static_assert(HeadDimAlignment % 8 == 0);
|
||||
static_assert(BlockSizeAlignment % 8 == 0);
|
||||
|
||||
File diff suppressed because it is too large
@@ -14,13 +14,11 @@ struct KernelVecType<float> {
|
||||
using cvt_vec_type = vec_op::FP32Vec16;
|
||||
};
|
||||
|
||||
#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
|
||||
template <>
|
||||
struct KernelVecType<c10::BFloat16> {
|
||||
using load_vec_type = vec_op::BF16Vec16;
|
||||
using cvt_vec_type = vec_op::FP32Vec16;
|
||||
};
|
||||
#endif
|
||||
|
||||
template <>
|
||||
struct KernelVecType<c10::Half> {
|
||||
|
||||
203
csrc/cpu/generate_cpu_attn_dispatch.py
Normal file
@@ -0,0 +1,203 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Generate CPU attention dispatch switch cases and kernel instantiations.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Head dimensions divisible by 32 (support all ISAs)
|
||||
HEAD_DIMS_32 = [32, 64, 96, 128, 160, 192, 224, 256]
|
||||
|
||||
# Head dimensions divisible by 16 but not 32 (VEC16 only)
|
||||
HEAD_DIMS_16 = [80, 112]
|
||||
|
||||
# ISA types
|
||||
ISA_TYPES = {
|
||||
"AMX": 0,
|
||||
"VEC": 1,
|
||||
"VEC16": 2,
|
||||
"NEON": 3,
|
||||
}
|
||||
|
||||
# ISAs supported for head_dims divisible by 32
|
||||
ISA_FOR_32 = ["AMX", "NEON", "VEC", "VEC16"]
|
||||
|
||||
# ISAs supported for head_dims divisible by 16 only
|
||||
ISA_FOR_16 = ["VEC16"]
|
||||
|
||||
|
||||
def encode_params(head_dim: int, isa_type: str) -> int:
|
||||
"""Encode head_dim and ISA type into a single int64_t."""
|
||||
isa_val = ISA_TYPES[isa_type]
|
||||
# Encoding: (head_dim << 8) | isa_type
|
||||
# This allows head_dim up to 2^56 - 1 and 256 ISA types
|
||||
return (head_dim << 8) | isa_val
|
||||
|
||||
|
||||
def generate_cases_for_isa_group(isa_list: list[str]) -> str:
|
||||
"""Generate switch cases for a specific ISA group."""
|
||||
cases = []
|
||||
|
||||
# Generate cases for head_dims divisible by 32
|
||||
for head_dim in HEAD_DIMS_32:
|
||||
for isa in isa_list:
|
||||
if isa not in ISA_FOR_32:
|
||||
continue
|
||||
encoded = encode_params(head_dim, isa)
|
||||
case_str = (
|
||||
f""" case {encoded}LL: {{ """
|
||||
f"""/* head_dim={head_dim}, isa={isa} */ \\"""
|
||||
f"""
|
||||
constexpr size_t head_dim = {head_dim}; \\"""
|
||||
f"""
|
||||
using attn_impl = cpu_attention::AttentionImpl<"""
|
||||
f"""cpu_attention::ISA::{isa}, \\"""
|
||||
f"""
|
||||
"""
|
||||
f"""scalar_t, head_dim>; \\"""
|
||||
f"""
|
||||
return __VA_ARGS__(); \\"""
|
||||
f"""
|
||||
}} \\"""
|
||||
)
|
||||
cases.append(case_str)
|
||||
|
||||
# Generate cases for head_dims divisible by 16 only
|
||||
for head_dim in HEAD_DIMS_16:
|
||||
for isa in isa_list:
|
||||
encoded = encode_params(head_dim, isa)
|
||||
case_str = (
|
||||
f""" case {encoded}LL: {{ """
|
||||
f"""/* head_dim={head_dim}, isa={isa} """
|
||||
f"""(using VEC16) */ \\"""
|
||||
f"""
|
||||
constexpr size_t head_dim = {head_dim}; \\"""
|
||||
f"""
|
||||
using attn_impl = cpu_attention::AttentionImpl<"""
|
||||
f"""cpu_attention::ISA::VEC16, \\"""
|
||||
f"""
|
||||
"""
|
||||
f"""scalar_t, head_dim>; \\"""
|
||||
f"""
|
||||
return __VA_ARGS__(); \\"""
|
||||
f"""
|
||||
}} \\"""
|
||||
)
|
||||
cases.append(case_str)
|
||||
|
||||
return "\n".join(cases)
|
||||
|
||||
|
||||
def generate_helper_function() -> str:
|
||||
"""Generate helper function to encode parameters."""
|
||||
return """
|
||||
inline int64_t encode_cpu_attn_params(int64_t head_dim, cpu_attention::ISA isa) {
|
||||
return (head_dim << 8) | static_cast<int64_t>(isa);
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
def generate_header_file() -> str:
|
||||
"""Generate the complete header file content."""
|
||||
header = """// auto generated by generate_cpu_attn_dispatch.py
|
||||
// clang-format off
|
||||
|
||||
#ifndef CPU_ATTN_DISPATCH_GENERATED_H
|
||||
#define CPU_ATTN_DISPATCH_GENERATED_H
|
||||
|
||||
#include "cpu_attn_vec.hpp"
|
||||
#include "cpu_attn_vec16.hpp"
|
||||
|
||||
#ifdef CPU_CAPABILITY_AMXBF16
|
||||
#include "cpu_attn_amx.hpp"
|
||||
#endif
|
||||
|
||||
#ifdef __aarch64__
|
||||
#include "cpu_attn_neon.hpp"
|
||||
#endif
|
||||
|
||||
"""
|
||||
|
||||
header += generate_helper_function()
|
||||
|
||||
# Generate dispatch macro with conditional compilation for different ISA sets
|
||||
header += """
|
||||
// Dispatch macro using encoded parameters
|
||||
"""
|
||||
|
||||
# x86_64 with AMX
|
||||
header += """#if defined(CPU_CAPABILITY_AMXBF16)
|
||||
#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
|
||||
[&] { \\
|
||||
int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
|
||||
switch (encoded_params) { \\
|
||||
"""
|
||||
header += generate_cases_for_isa_group(["AMX", "VEC", "VEC16"])
|
||||
header += """
|
||||
default: { \\
|
||||
TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
|
||||
std::to_string(HEAD_DIM) + " isa=" + \\
|
||||
std::to_string(static_cast<int>(ISA_TYPE))); \\
|
||||
} \\
|
||||
} \\
|
||||
}()
|
||||
|
||||
"""
|
||||
|
||||
# ARM64 with NEON
|
||||
header += """#elif defined(__aarch64__)
|
||||
#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
|
||||
[&] { \\
|
||||
int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
|
||||
switch (encoded_params) { \\
|
||||
"""
|
||||
header += generate_cases_for_isa_group(["NEON", "VEC", "VEC16"])
|
||||
header += """
|
||||
default: { \\
|
||||
TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
|
||||
std::to_string(HEAD_DIM) + " isa=" + \\
|
||||
std::to_string(static_cast<int>(ISA_TYPE))); \\
|
||||
} \\
|
||||
} \\
|
||||
}()
|
||||
|
||||
"""
|
||||
|
||||
# Fallback: VEC and VEC16 only
|
||||
header += """#else
|
||||
#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
|
||||
[&] { \\
|
||||
int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
|
||||
switch (encoded_params) { \\
|
||||
"""
|
||||
header += generate_cases_for_isa_group(["VEC", "VEC16"])
|
||||
header += """
|
||||
default: { \\
|
||||
TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
|
||||
std::to_string(HEAD_DIM) + " isa=" + \\
|
||||
std::to_string(static_cast<int>(ISA_TYPE))); \\
|
||||
} \\
|
||||
} \\
|
||||
}()
|
||||
|
||||
#endif /* CPU_CAPABILITY_AMXBF16 / __aarch64__ */
|
||||
|
||||
#endif // CPU_ATTN_DISPATCH_GENERATED_H
|
||||
"""
|
||||
|
||||
return header
|
||||
|
||||
|
||||
def main():
|
||||
output_path = os.path.join(
|
||||
os.path.dirname(__file__), "cpu_attn_dispatch_generated.h"
|
||||
)
|
||||
|
||||
with open(output_path, "w") as f:
|
||||
f.write(generate_header_file())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
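As a quick illustration of the `(head_dim << 8) | isa` packing shared by `encode_params` above and the generated `encode_cpu_attn_params` helper, here is a standalone round-trip sketch; the helper names below are illustrative only and not part of the patch:

```python
# Illustrative round-trip of the (head_dim << 8) | isa packing; not part of the patch.
ISA_IDS = {"AMX": 0, "VEC": 1, "VEC16": 2, "NEON": 3}  # mirrors ISA_TYPES above

def pack(head_dim: int, isa: str) -> int:
    return (head_dim << 8) | ISA_IDS[isa]

def unpack(code: int) -> tuple[int, int]:
    return code >> 8, code & 0xFF

assert pack(128, "AMX") == 32768             # 128 << 8 | 0, i.e. the generated "case 32768LL"
assert unpack(pack(80, "VEC16")) == (80, 2)  # 80 is a VEC16-only head_dim
```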
|
||||
@@ -38,9 +38,7 @@ struct KernelVecType<c10::BFloat16> {
|
||||
using qk_vec_type = vec_op::BF16Vec32;
|
||||
using v_load_vec_type = vec_op::BF16Vec16;
|
||||
};
|
||||
#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
|
||||
// pass
|
||||
#else
|
||||
#elif defined(__aarch64__)
|
||||
template <>
|
||||
struct KernelVecType<c10::BFloat16> {
|
||||
using qk_load_vec_type = vec_op::BF16Vec16;
|
||||
|
||||
@@ -265,7 +265,7 @@ void tinygemm_kernel(
|
||||
// mb_size = 4
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
|
||||
case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -324,7 +324,7 @@ void tinygemm_kernel(
|
||||
case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break;
|
||||
case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break;
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,7 +180,7 @@ void tinygemm_kernel(
|
||||
// mb_size = 4
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
|
||||
case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -398,7 +398,7 @@ void tinygemm_kernel(
|
||||
case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break;
|
||||
// mb_size = 4
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -511,7 +511,7 @@ void tinygemm_kernel(
|
||||
case 0x32: LAUNCH_TINYGEMM_KERNEL_NN2(3, 32); break;
|
||||
// mb_size = 4
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN2(4, 32); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -271,7 +271,7 @@ void tinygemm_kernel(
|
||||
case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI(2, 32); break;
|
||||
case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI(3, 32); break;
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI(4, 32); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -401,7 +401,7 @@ void tinygemm_kernel(
|
||||
case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI2(2, 32); break;
|
||||
case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI2(3, 32); break;
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI2(4, 32); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,12 +30,10 @@ struct VecTypeTrait<float> {
|
||||
using vec_t = vec_op::FP32Vec16;
|
||||
};
|
||||
|
||||
#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
|
||||
template <>
|
||||
struct VecTypeTrait<c10::BFloat16> {
|
||||
using vec_t = vec_op::BF16Vec16;
|
||||
};
|
||||
#endif
|
||||
|
||||
#if !defined(__powerpc__)
|
||||
template <>
|
||||
|
||||
@@ -115,11 +115,28 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
|
||||
if (flag) { // support GPUDirect RDMA if possible
|
||||
prop.allocFlags.gpuDirectRDMACapable = 1;
|
||||
}
|
||||
int fab_flag = 0;
|
||||
CUDA_CHECK(cuDeviceGetAttribute(
|
||||
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
|
||||
if (fab_flag) { // support fabric handle if possible
|
||||
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef USE_ROCM
|
||||
// Allocate memory using cuMemCreate
|
||||
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
|
||||
CUresult ret = (CUresult)cuMemCreate(p_memHandle, size, &prop, 0);
|
||||
if (ret) {
|
||||
if (fab_flag &&
|
||||
(ret == CUDA_ERROR_NOT_PERMITTED || ret == CUDA_ERROR_NOT_SUPPORTED)) {
|
||||
// Fabric allocation may fail without multi-node nvlink,
|
||||
// fallback to POSIX file descriptor
|
||||
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
|
||||
} else {
|
||||
CUDA_CHECK(ret);
|
||||
}
|
||||
}
|
||||
if (error_code != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
#include "cutlass/cutlass.h"
|
||||
#include <climits>
|
||||
#include "cuda_runtime.h"
|
||||
#include <iostream>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
/**
|
||||
* Helper function for checking CUTLASS errors
|
||||
@@ -31,12 +32,63 @@ int32_t get_sm_version_num();
|
||||
* __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
|
||||
* into code that will be executed on the device where it is defined.
|
||||
*/
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm75_to_sm80 : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
|
||||
Kernel::invoke(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm[75, 80).\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm80_to_sm89 : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
|
||||
Kernel::invoke(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm[80, 89).\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm89_to_sm90 : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
|
||||
Kernel::invoke(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm[89, 90).\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm90_or_later : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE void operator()(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ >= 900
|
||||
Kernel::operator()(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm >= 90.\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
@@ -45,18 +97,43 @@ template <typename Kernel>
|
||||
struct enable_sm90_only : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE void operator()(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ == 900
|
||||
Kernel::operator()(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm90.\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm100_only : Kernel {
|
||||
struct enable_sm100f_only : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE void operator()(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1000
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030
|
||||
Kernel::operator()(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm100f.\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm100a_only : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE void operator()(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ == 1000
|
||||
Kernel::operator()(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm100a.\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
@@ -65,8 +142,13 @@ template <typename Kernel>
|
||||
struct enable_sm120_only : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE void operator()(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1200
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ == 1200
|
||||
Kernel::operator()(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm120.\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@@ -770,7 +770,7 @@ torch::Tensor moe_wna16_marlin_gemm(
|
||||
b_bias = b_bias_or_none.value();
|
||||
TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU");
|
||||
TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not contiguous");
|
||||
TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(0) != size_n");
|
||||
TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(1) != size_n");
|
||||
TORCH_CHECK(b_bias.stride(1) == 1, "b_bias.stride(1) != 1");
|
||||
} else {
|
||||
b_bias = torch::empty({0}, options);
|
||||
|
||||
@@ -141,8 +141,8 @@ struct cutlass_3x_gemm_sm100 {
|
||||
sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
|
||||
using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
|
||||
};
|
||||
|
||||
template <typename ElementAB_, typename ElementD_,
|
||||
@@ -202,8 +202,8 @@ struct cutlass_3x_gemm_sm120 {
|
||||
sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
|
||||
using GemmKernel = enable_sm120_only<cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
|
||||
};
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
@@ -123,7 +123,7 @@ struct cutlass_3x_gemm_fp8_blockwise {
|
||||
MainloopScheduler
|
||||
>::CollectiveOp>;
|
||||
|
||||
using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
|
||||
using KernelType = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
|
||||
|
||||
struct GemmKernel : public KernelType {};
|
||||
|
||||
@@ -90,8 +90,8 @@ struct cutlass_3x_gemm_sm100_fp8 {
|
||||
// -----------------------------------------------------------
|
||||
// Kernel definition
|
||||
// -----------------------------------------------------------
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
|
||||
using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
|
||||
};
|
||||
|
||||
template <typename InType, typename OutType, bool EnableBias>
|
||||
|
||||
@@ -36,41 +36,6 @@ using namespace cute;
|
||||
*/
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Wrappers for the GEMM kernel that is used to guard against compilation on
|
||||
// architectures that will never use the kernel. The purpose of this is to
|
||||
// reduce the size of the compiled binary.
|
||||
// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
|
||||
// into code that will be executed on the device where it is defined.
|
||||
template <typename Kernel>
|
||||
struct enable_sm75_to_sm80 : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
|
||||
Kernel::invoke(std::forward<Args>(args)...);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm80_to_sm89 : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
|
||||
Kernel::invoke(std::forward<Args>(args)...);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm89_to_sm90 : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
|
||||
Kernel::invoke(std::forward<Args>(args)...);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
template <typename Arch, template <typename> typename ArchGuard,
|
||||
typename ElementAB_, typename ElementD_,
|
||||
template <typename, typename> typename Epilogue_, typename TileShape,
|
||||
|
||||
@@ -50,7 +50,7 @@ struct sm89_fp8_config_default {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -58,7 +58,7 @@ struct sm89_fp8_config_default {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<256, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -67,7 +67,7 @@ struct sm89_fp8_config_default {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -100,7 +100,7 @@ struct sm89_fp8_config_M256 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -108,7 +108,7 @@ struct sm89_fp8_config_M256 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -141,7 +141,7 @@ struct sm89_fp8_config_M128 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -150,7 +150,7 @@ struct sm89_fp8_config_M128 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -158,7 +158,7 @@ struct sm89_fp8_config_M128 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<128, 64, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -191,7 +191,7 @@ struct sm89_fp8_config_M64 {
|
||||
using FP8MathOperator = typename cutlass::arch::OpMultiplyAdd;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -201,7 +201,7 @@ struct sm89_fp8_config_M64 {
|
||||
using FP8MathOperator = typename cutlass::arch::OpMultiplyAddFastAccum;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -211,7 +211,7 @@ struct sm89_fp8_config_M64 {
|
||||
using FP8MathOperator = typename cutlass::arch::OpMultiplyAdd;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -244,7 +244,7 @@ struct sm89_fp8_config_M32 {
|
||||
using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -253,7 +253,7 @@ struct sm89_fp8_config_M32 {
|
||||
using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 4, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -262,7 +262,7 @@ struct sm89_fp8_config_M32 {
|
||||
using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -296,7 +296,7 @@ struct sm89_fp8_config_M16 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, MainLoopStages,
|
||||
FP8MathOperator>,
|
||||
@@ -305,7 +305,7 @@ struct sm89_fp8_config_M16 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<16, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, MainLoopStages,
|
||||
FP8MathOperator>,
|
||||
@@ -314,7 +314,7 @@ struct sm89_fp8_config_M16 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, MainLoopStages,
|
||||
FP8MathOperator>,
|
||||
|
||||
@@ -48,7 +48,7 @@ struct sm89_int8_config_default {
|
||||
using TileShape = cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -56,7 +56,7 @@ struct sm89_int8_config_default {
|
||||
using TileShape = cutlass::gemm::GemmShape<256, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -64,7 +64,7 @@ struct sm89_int8_config_default {
|
||||
using TileShape = cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -72,7 +72,7 @@ struct sm89_int8_config_default {
|
||||
using TileShape = cutlass::gemm::GemmShape<256, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -104,7 +104,7 @@ struct sm89_int8_config_M256 {
|
||||
using TileShape = cutlass::gemm::GemmShape<64, 128, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -112,7 +112,7 @@ struct sm89_int8_config_M256 {
|
||||
using TileShape = cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -120,7 +120,7 @@ struct sm89_int8_config_M256 {
|
||||
using TileShape = cutlass::gemm::GemmShape<256, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -128,7 +128,7 @@ struct sm89_int8_config_M256 {
|
||||
using TileShape = cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -160,7 +160,7 @@ struct sm89_int8_config_M128 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -169,7 +169,7 @@ struct sm89_int8_config_M128 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -178,7 +178,7 @@ struct sm89_int8_config_M128 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -210,7 +210,7 @@ struct sm89_int8_config_M64 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -219,7 +219,7 @@ struct sm89_int8_config_M64 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -251,7 +251,7 @@ struct sm89_int8_config_M32 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -260,7 +260,7 @@ struct sm89_int8_config_M32 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 4>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -292,7 +292,7 @@ struct sm89_int8_config_M16 {
|
||||
using TileShape = cutlass::gemm::GemmShape<16, 64, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -300,7 +300,7 @@ struct sm89_int8_config_M16 {
|
||||
using TileShape = cutlass::gemm::GemmShape<16, 128, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 4>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
|
||||
@@ -97,9 +97,7 @@ ARG PYTHON_VERSION
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Install system dependencies including build tools
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
ccache \
|
||||
software-properties-common \
|
||||
@@ -502,9 +500,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||
|
||||
# Install Python and system dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
software-properties-common \
|
||||
curl \
|
||||
@@ -586,7 +582,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
# This is ~1.1GB and only changes when FlashInfer version bumps
|
||||
# https://docs.flashinfer.ai/installation.html
|
||||
# From versions.json: .flashinfer.version
|
||||
ARG FLASHINFER_VERSION=0.6.1
|
||||
ARG FLASHINFER_VERSION=0.6.2
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
|
||||
&& uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
|
||||
@@ -713,9 +709,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y git
|
||||
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
|
||||
@@ -20,9 +20,7 @@ ARG PYTHON_VERSION=3.12
|
||||
ARG TARGETPLATFORM
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
# Install Python and other dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl sudo \
|
||||
&& for i in 1 2 3; do \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
||||
@@ -172,9 +170,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||
|
||||
# Install Python and other dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||
&& for i in 1 2 3; do \
|
||||
@@ -221,13 +217,13 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
|
||||
|
||||
|
||||
# build flashinfer for torch nightly from source around 10 mins
|
||||
# release version: v0.6.1
|
||||
# release version: v0.6.2
|
||||
# todo(elainewy): cache flashinfer build result for faster build
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
echo "git clone flashinfer..." \
|
||||
&& git clone --depth 1 --branch v0.6.1 --recursive https://github.com/flashinfer-ai/flashinfer.git \
|
||||
&& git clone --depth 1 --branch v0.6.2 --recursive https://github.com/flashinfer-ai/flashinfer.git \
|
||||
&& cd flashinfer \
|
||||
&& git submodule update --init --recursive \
|
||||
&& echo "finish git clone flashinfer..." \
|
||||
|
||||
@@ -15,8 +15,6 @@ FROM ${BASE_IMAGE} AS base
|
||||
|
||||
ARG ARG_PYTORCH_ROCM_ARCH
|
||||
ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
|
||||
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||
ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
|
||||
|
||||
# Install some basic utilities
|
||||
RUN apt-get update -q -y && apt-get install -q -y \
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Base UBI image for s390x architecture
|
||||
ARG BASE_UBI_IMAGE_TAG=9.5-1736404155
|
||||
ARG BASE_UBI_IMAGE_TAG=9.6
|
||||
ARG PYTHON_VERSION=3.12
|
||||
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
|
||||
|
||||
@@ -14,12 +14,18 @@ ENV LANG=C.UTF-8 \
|
||||
|
||||
# Install development utilities
|
||||
RUN microdnf install -y \
|
||||
which procps findutils tar vim git gcc-toolset-14 gcc-toolset-14-libatomic-devel patch zlib-devel \
|
||||
which procps findutils tar vim git gcc-toolset-14 gcc-toolset-14-binutils gcc-toolset-14-libatomic-devel patch zlib-devel \
|
||||
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
|
||||
openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \
|
||||
clang llvm-devel llvm-static clang-devel && \
|
||||
microdnf clean all
|
||||
|
||||
ENV GCC_TOOLSET_ROOT=/opt/rh/gcc-toolset-14/root \
|
||||
PATH=/opt/rh/gcc-toolset-14/root/usr/bin:/usr/local/bin:/usr/bin:/bin \
|
||||
LD_LIBRARY_PATH=/opt/rh/gcc-toolset-14/root/usr/lib64:/usr/local/lib:/usr/lib64 \
|
||||
LIBRARY_PATH=/opt/rh/gcc-toolset-14/root/usr/lib64 \
|
||||
PKG_CONFIG_PATH=/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig
|
||||
|
||||
# Python Installation
|
||||
FROM base AS python-install
|
||||
ARG PYTHON_VERSION
|
||||
@@ -87,13 +93,13 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
|
||||
|
||||
FROM python-install AS torch-vision
|
||||
# Install torchvision
|
||||
ARG TORCH_VISION_VERSION=v0.23.0
|
||||
ARG TORCH_VISION_VERSION=v0.25.0
|
||||
WORKDIR /tmp
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
git clone https://github.com/pytorch/vision.git && \
|
||||
cd vision && \
|
||||
git checkout $TORCH_VISION_VERSION && \
|
||||
uv pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cpu && \
|
||||
uv pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cpu && \
|
||||
python setup.py bdist_wheel
|
||||
|
||||
FROM python-install AS hf-xet-builder
|
||||
@@ -174,7 +180,19 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
|
||||
sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
|
||||
fi && python setup.py bdist_wheel
|
||||
|
||||
|
||||
# Build OpenCV from source for s390x
|
||||
FROM python-install AS opencv-builder
|
||||
WORKDIR /tmp
|
||||
ARG MAX_JOBS
|
||||
ARG OPENCV_VERSION=90
|
||||
ARG ENABLE_HEADLESS=1
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install numpy setuptools wheel scikit_build build && \
|
||||
git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
|
||||
cd opencv-python && \
|
||||
python -m build --wheel --installer=uv --outdir /tmp/opencv-python/dist
|
||||
|
||||
# Build Outlines Core
|
||||
FROM python-install AS outlines-core-builder
|
||||
WORKDIR /tmp
|
||||
@@ -198,7 +216,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
# Final build stage
|
||||
FROM python-install AS vllm-cpu
|
||||
ARG PYTHON_VERSION
|
||||
|
||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||
# Set correct library path for torch and numactl
|
||||
ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:/opt/rh/gcc-toolset-14/root/usr/lib64:$LD_LIBRARY_PATH"
|
||||
ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH"
|
||||
@@ -209,7 +227,8 @@ ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
|
||||
ENV PCP_DIR=/opt/rh/gcc-toolset-14/root
|
||||
ENV PKG_CONFIG_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:/usr/local/lib/pkgconfig/"
|
||||
ENV PATH="${VIRTUAL_ENV:+${VIRTUAL_ENV}/bin}:/opt/rh/gcc-toolset-14/root/usr/bin:/usr/local/bin:$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
|
||||
|
||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
COPY . /workspace/vllm
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
@@ -225,23 +244,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
|
||||
--mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \
|
||||
--mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \
|
||||
--mount=type=bind,from=opencv-builder,source=/tmp/opencv-python/dist,target=/tmp/opencv-wheels/ \
|
||||
--mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \
|
||||
sed -i '/^torch/d' requirements/build.txt && \
|
||||
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \
|
||||
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \
|
||||
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl) && \
|
||||
LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \
|
||||
NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \
|
||||
OPENCV_WHL_FILE=$(ls /tmp/opencv-wheels/*.whl) && \
|
||||
OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \
|
||||
uv pip install -v \
|
||||
uv pip install -v \
|
||||
$ARROW_WHL_FILE \
|
||||
$VISION_WHL_FILE \
|
||||
$HF_XET_WHL_FILE \
|
||||
$LLVM_WHL_FILE \
|
||||
$NUMBA_WHL_FILE \
|
||||
$OPENCV_WHL_FILE \
|
||||
$OUTLINES_CORE_WHL_FILE \
|
||||
--index-strategy unsafe-best-match \
|
||||
-r requirements/build.txt \
|
||||
-r requirements/cpu-build.txt \
|
||||
-r requirements/cpu.txt
|
||||
|
||||
|
||||
@@ -252,7 +273,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
|
||||
# setup non-root user for vllm
|
||||
RUN umask 002 && \
|
||||
useradd --uid 2000 --gid 0 vllm && \
|
||||
/usr/sbin/useradd --uid 2000 --gid 0 vllm && \
|
||||
mkdir -p /home/vllm && \
|
||||
chmod g+rwx /home/vllm
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
|
||||
FROM intel/deep-learning-essentials:2025.3.2-0-devel-ubuntu24.04 AS vllm-base
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
add-apt-repository -y ppa:kobuk-team/intel-graphics-staging
|
||||
add-apt-repository -y ppa:kobuk-team/intel-graphics
|
||||
|
||||
RUN apt clean && apt-get update -y && \
|
||||
apt-get install -y --no-install-recommends --fix-missing \
|
||||
@@ -25,10 +25,13 @@ RUN apt clean && apt-get update -y && \
|
||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
|
||||
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
|
||||
|
||||
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc
|
||||
RUN apt update && apt upgrade -y && \
|
||||
apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc && \
|
||||
apt install -y intel-oneapi-compiler-dpcpp-cpp-2025.3
|
||||
|
||||
|
||||
# This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
|
||||
ARG ONECCL_INSTALLER="intel-oneccl-2021.15.7.6_offline.sh"
|
||||
ARG ONECCL_INSTALLER="intel-oneccl-2021.15.7.8_offline.sh"
|
||||
RUN wget "https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.7/${ONECCL_INSTALLER}" && \
|
||||
bash "${ONECCL_INSTALLER}" -a --silent --eula accept && \
|
||||
rm "${ONECCL_INSTALLER}" && \
|
||||
@@ -85,6 +88,9 @@ RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
ENV NIXL_VERSION=0.7.0
|
||||
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
|
||||
|
||||
# FIX triton
|
||||
RUN --mount=type=cache,target=/root/.cache/pip pip uninstall triton triton-xpu -y && pip install triton-xpu==3.6.0 --extra-index-url=https://download.pytorch.org/whl/xpu
|
||||
|
||||
# PyJWT-2.7.0 will influence some wheel behaviors, remove its dist-info to avoid conflicts
|
||||
RUN rm /usr/lib/python3/dist-packages/PyJWT-2.7.0.dist-info/ -rf
|
||||
|
||||
|
||||
@@ -68,7 +68,7 @@
|
||||
"default": "true"
|
||||
},
|
||||
"FLASHINFER_VERSION": {
|
||||
"default": "0.6.1"
|
||||
"default": "0.6.2"
|
||||
},
|
||||
"GDRCOPY_CUDA_VERSION": {
|
||||
"default": "12.8"
|
||||
|
||||
@@ -32,6 +32,7 @@ th {
| HuggingFace-Blazedit | ✅ | ✅ | `vdaita/edit_5k_char`, `vdaita/edit_10k_char` |
| Spec Bench | ✅ | ✅ | `wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl` |
| Custom | ✅ | ✅ | Local file: `data.jsonl` |
| Custom MM | ✅ | ✅ | Local file: `mm_data.jsonl` |

Legend:
|
||||
|
||||
@@ -133,6 +134,33 @@ vllm bench serve --port 9001 --save-result --save-detailed \

You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.

#### Custom multimodal dataset

If the multimodal dataset you want to benchmark is not supported yet in vLLM, then you can benchmark on it using `CustomMMDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" and "image_files" fields per entry, e.g., `mm_data.jsonl`:

```json
{"prompt": "How many animals are present in the given image?", "image_files": ["/path/to/image/folder/horsepony.jpg"]}
{"prompt": "What colour is the bird shown in the image?", "image_files": ["/path/to/image/folder/flycatcher.jpeg"]}
```

```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct
```

```bash
# run benchmarking script
vllm bench serve --save-result --save-detailed \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name custom_mm \
  --dataset-path <path-to-your-mm-data-jsonl> \
  --allowed-local-media-path /path/to/image/folder
```

Note that we need to use the `openai-chat` backend and `/v1/chat/completions` endpoint for multimodal inputs.
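If you want to create the `mm_data.jsonl` file programmatically, a minimal sketch follows; the prompts and image paths are placeholders:

```python
# Minimal sketch: write a CustomMMDataset-style JSONL file.
# The prompts and image paths below are placeholders.
import json

entries = [
    {"prompt": "How many animals are present in the given image?",
     "image_files": ["/path/to/image/folder/horsepony.jpg"]},
    {"prompt": "What colour is the bird shown in the image?",
     "image_files": ["/path/to/image/folder/flycatcher.jpeg"]},
]

with open("mm_data.jsonl", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")
```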
|
||||
|
||||
#### VisionArena Benchmark for Vision Language Models
|
||||
|
||||
```bash
|
||||
|
||||
@@ -82,7 +82,7 @@ vllm bench sweep serve \
|
||||
You can use `--dry-run` to preview the commands to be run.
|
||||
|
||||
We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
|
||||
Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
|
||||
Between each benchmark run, we call all `/reset_*_cache` endpoints to get a clean slate for the next run.
|
||||
In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
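
For example, a hedged sketch of such an invocation (the parameter files, launcher command, and port below are placeholders, not files in the repository; check `vllm bench sweep serve --help` for the exact flag semantics):

```bash
# Preview the sweep first with --dry-run; the custom --after-bench-cmd resets
# the prefix cache between runs instead of the default reset endpoints.
vllm bench sweep serve \
    --serve-cmd "my-launcher --port 8000" \
    --serve-params serve_params.json \
    --bench-params bench_params.json \
    --after-bench-cmd "curl -X POST http://localhost:8000/reset_prefix_cache" \
    --dry-run
```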
|
||||
|
||||
!!! note
|
||||
|
||||
@@ -251,6 +251,7 @@ No extra registration is required beyond having your model class available via t
|
||||
- Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
|
||||
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`.
|
||||
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
|
||||
- Qwen3-Omni multimodal with audio embeddings: [vllm/model_executor/models/qwen3_omni_moe_thinker.py](../../../vllm/model_executor/models/qwen3_omni_moe_thinker.py)
|
||||
|
||||
## Test with the API
|
||||
|
||||
|
||||
@@ -59,11 +59,15 @@ First, create a Kubernetes PVC and Secret for downloading and storing Hugging Fa
|
||||
Here, the `token` field stores your **Hugging Face access token**. For details on how to generate a token,
|
||||
see the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens).
|
||||
|
||||
Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||
Next, start the vLLM server as a Kubernetes Deployment and Service.
|
||||
|
||||
Note that you will want to configure your vLLM image based on your processor arch:
|
||||
|
||||
??? console "Config"
|
||||
|
||||
```bash
|
||||
VLLM_IMAGE=public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest # use this for x86_64
|
||||
VLLM_IMAGE=public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest # use this for arm64
|
||||
cat <<EOF | kubectl apply -f -
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
@@ -81,7 +85,7 @@ Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm
|
||||
image: vllm/vllm-openai:latest
|
||||
image: $VLLM_IMAGE
|
||||
command: ["/bin/sh", "-c"]
|
||||
args: [
|
||||
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
|
||||
|
||||
@@ -168,7 +168,7 @@ Priority is **1 = highest** (tried first).
|
||||
| `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ❌ | All | 9.x |
|
||||
| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | Decoder | Any |
|
||||
| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | Decoder, Encoder Only | Any |
|
||||
| `ROCM_AITER_FA` | | fp16, bf16 | `auto` | %16 | 64, 128, 256 | ❌ | ❌ | Decoder | N/A |
|
||||
| `ROCM_AITER_FA` | | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | Decoder | N/A |
|
||||
| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | Decoder | N/A |
|
||||
| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | Decoder | N/A |
|
||||
| `TREE_ATTN` | | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | Decoder | Any |
|
||||
|
||||
@@ -282,6 +282,15 @@ If vLLM's compile cache is wrong, this usually means that a factor is missing.
|
||||
Please see [this example](https://github.com/vllm-project/vllm/blob/18b39828d90413d05d770dfd2e2f48304f4ca0eb/vllm/config/model.py#L310)
|
||||
of how vLLM computes part of the cache key.
|
||||
|
||||
vLLM's compilation cache requires that the code being compiled ends up being serializable.
|
||||
If this is not the case, then it will error out on save. Usually the fixes are to either:
|
||||
|
||||
- rewrite the non-serializable pieces (this can be tricky, because right now it is hard to tell what is serializable and what isn't)
|
||||
- file a bug report
|
||||
- ignore the error by setting `VLLM_DISABLE_COMPILE_CACHE=1`, as shown below (note that this will make warm server starts a lot slower).
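
For the last option, a minimal sketch (the model name is a placeholder):

```bash
# Disable the compile cache for this run; expect slower warm starts.
VLLM_DISABLE_COMPILE_CACHE=1 vllm serve <MODEL>
```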
|
||||
|
||||
## Debugging CUDAGraphs
|
||||
|
||||
CUDAGraphs is a feature that allows one to:
|
||||
|
||||
@@ -154,4 +154,4 @@ The interface for the model/module may change during vLLM's development. If you
|
||||
!!! warning "Deprecations"
|
||||
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
|
||||
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
|
||||
- `seed_everything` platform interface is deprecated. It will be removed in v0.15.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead.
|
||||
- `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
|
||||
|
||||
@@ -2,6 +2,6 @@
|
||||
|
||||
vLLM's examples are split into three categories:
|
||||
|
||||
- If you are using vLLM from within Python code, see the *Offline Inference* section.
|
||||
- If you are using vLLM from an HTTP application or client, see the *Online Serving* section.
|
||||
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the *Others* section.
|
||||
- If you are using vLLM from within Python code, see the [Offline Inference](../../examples/offline_inference) section.
|
||||
- If you are using vLLM from an HTTP application or client, see the [Online Serving](../../examples/online_serving) section.
|
||||
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the [Others](../../examples/others) section.
|
||||
|
||||
@@ -108,6 +108,7 @@ Batch invariance has been tested and verified on the following models:
|
||||
- **Qwen3 (MoE)**: `Qwen/Qwen3-30B-A3B`, `Qwen/Qwen3-Next-80B-A3B-Instruct`
|
||||
- **Qwen2.5**: `Qwen/Qwen2.5-0.5B-Instruct`, `Qwen/Qwen2.5-1.5B-Instruct`, `Qwen/Qwen2.5-3B-Instruct`, `Qwen/Qwen2.5-7B-Instruct`, `Qwen/Qwen2.5-14B-Instruct`, `Qwen/Qwen2.5-32B-Instruct`
|
||||
- **Llama 3**: `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`
|
||||
- **GPT-OSS**: `openai/gpt-oss-20b`, `openai/gpt-oss-120b`
|
||||
|
||||
Other models may also work, but these have been explicitly validated. If you encounter issues with a specific model, please report them on the [GitHub issue tracker](https://github.com/vllm-project/vllm/issues/new/choose).
|
||||
|
||||
|
||||
@@ -19,12 +19,13 @@ Two main reasons:
|
||||
|
||||
Please refer to [examples/online_serving/disaggregated_prefill.sh](../../examples/online_serving/disaggregated_prefill.sh) for the example usage of disaggregated prefilling.
|
||||
|
||||
Now supports 5 types of connectors:
|
||||
Now supports 6 types of connectors:
|
||||
|
||||
- **ExampleConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of ExampleConnector disaggregated prefilling.
|
||||
- **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
|
||||
- **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling, which supports fully async send/recv. For a detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
|
||||
- **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
|
||||
- **MooncakeConnector**: refer to [examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh](../../examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh) for the example usage of MooncakeConnector disaggregated prefilling. For a detailed usage guide, see [MooncakeConnector Usage Guide](mooncake_connector_usage.md).
|
||||
- **MultiConnector**: takes advantage of the `kv_connector_extra_config: dict[str, Any]` already present in `KVTransferConfig` to stash all the connectors we want in an ordered list of kwargs, such as:
|
||||
|
||||
```bash
|
||||
|
||||
@@ -31,11 +31,9 @@ vllm serve Qwen/Qwen2.5-7B-Instruct --port 8020 --kv-transfer-config '{"kv_conne
|
||||
### Proxy
|
||||
|
||||
```bash
|
||||
python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --prefiller-host 192.168.0.2 --prefiller-port 8010 --decoder-host 192.168.0.3 --decoder-port 8020
|
||||
python examples/online_serving/disaggregated_serving/mooncake_connector/mooncake_connector_proxy.py --prefill http://192.168.0.2:8010 --decode http://192.168.0.3:8020
|
||||
```
|
||||
|
||||
> NOTE: The Mooncake Connector currently uses the proxy from nixl_integration. This will be replaced with a self-developed proxy in the future.
|
||||
|
||||
Now you can send requests to the proxy server through port 8000.
|
||||
|
||||
## Environment Variables
|
||||
@@ -43,16 +41,29 @@ Now you can send requests to the proxy server through port 8000.
|
||||
- `VLLM_MOONCAKE_BOOTSTRAP_PORT`: Port for Mooncake bootstrap server
|
||||
- Default: 8998
|
||||
- Required only for prefiller instances
|
||||
- Each vLLM worker needs a unique port on its host; using the same port number across different hosts is fine
|
||||
- For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank * tp_size + tp_rank
|
||||
- Used for the decoder notifying the prefiller
|
||||
- For headless instances, must be the same as the master instance
|
||||
- Each instance needs a unique port on its host; using the same port number across different hosts is fine
|
||||
|
||||
- `VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
|
||||
- Default: 480
|
||||
- If a request is aborted and the decoder has not yet notified the prefiller, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
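
As an illustrative sketch only (model, host, and ports are placeholders, and the `--kv-transfer-config` shape follows the pattern used for the other connectors in these docs), a prefiller instance might be launched as:

```bash
# Prefiller with an explicit bootstrap port and abort timeout.
VLLM_MOONCAKE_BOOTSTRAP_PORT=8998 \
VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT=480 \
vllm serve <MODEL> --port 8010 \
    --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_producer"}'
```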
|
||||
|
||||
## KV Role Options
|
||||
## KV Transfer Config
|
||||
|
||||
### KV Role Options
|
||||
|
||||
- **kv_producer**: For prefiller instances that generate KV caches
|
||||
- **kv_consumer**: For decoder instances that consume KV caches from prefiller
|
||||
- **kv_both**: Enables symmetric functionality where the connector can act as both producer and consumer. This provides flexibility for experimental setups and scenarios where the role distinction is not predetermined.
|
||||
|
||||
### kv_connector_extra_config
|
||||
|
||||
- **num_workers**: Size of the thread pool used by one prefiller worker to transfer KV caches via Mooncake (default: 10)
- **mooncake_protocol**: Mooncake connector protocol (default: "rdma"); see the sketch below for how these options are passed
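
A hedged sketch of passing these options (the values are examples only and `<MODEL>` is a placeholder):

```bash
vllm serve <MODEL> \
    --kv-transfer-config '{
        "kv_connector": "MooncakeConnector",
        "kv_role": "kv_producer",
        "kv_connector_extra_config": {"num_workers": 16, "mooncake_protocol": "rdma"}
    }'
```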
|
||||
|
||||
## Example Scripts/Code
|
||||
|
||||
Refer to these example scripts in the vLLM repository:
|
||||
|
||||
- [run_mooncake_connector.sh](../../examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh)
|
||||
- [mooncake_connector_proxy.py](../../examples/online_serving/disaggregated_serving/mooncake_connector/mooncake_connector_proxy.py)
|
||||
|
||||
@@ -36,6 +36,35 @@ export UCX_NET_DEVICES=all # or specify network devices like "mlx5_0:1,mlx5_1:1
|
||||
!!! tip
|
||||
When using UCX as the transport backend, NCCL environment variables (like `NCCL_IB_HCA`, `NCCL_SOCKET_IFNAME`) are not applicable to NixlConnector, so configure UCX-specific environment variables instead of NCCL variables.
|
||||
|
||||
#### Selecting a NIXL transport backend (plugin)
|
||||
|
||||
NixlConnector can use different NIXL transport backends (plugins). By default, NixlConnector uses UCX as the transport backend.
|
||||
|
||||
To select a different backend, set `kv_connector_extra_config.backends` in `--kv-transfer-config`.
|
||||
|
||||
### Example: using LIBFABRIC backend
|
||||
|
||||
```bash
|
||||
vllm serve <MODEL> \
|
||||
--kv-transfer-config '{
|
||||
"kv_connector":"NixlConnector",
|
||||
"kv_role":"kv_both",
|
||||
"kv_connector_extra_config":{"backends":["LIBFABRIC"]}
|
||||
}'
|
||||
```
|
||||
|
||||
You can also pass JSON keys individually using dotted arguments, and you can append list elements using `+`:
|
||||
|
||||
```bash
|
||||
vllm serve <MODEL> \
|
||||
--kv-transfer-config.kv_connector NixlConnector \
|
||||
--kv-transfer-config.kv_role kv_both \
|
||||
--kv-transfer-config.kv_connector_extra_config.backends+ LIBFABRIC
|
||||
```
|
||||
|
||||
!!! note
|
||||
Backend availability depends on how NIXL was built and what plugins are present in your environment. Refer to the [NIXL repository](https://github.com/ai-dynamo/nixl) for available backends and build instructions.
|
||||
|
||||
## Basic Usage (on the same host)
|
||||
|
||||
### Producer (Prefiller) Configuration
|
||||
|
||||
@@ -6,34 +6,38 @@
|
||||
!!! warning
|
||||
Currently, vLLM only supports loading single-file GGUF models. If you have a multi-file GGUF model, you can use the [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge the files into a single-file model.
|
||||
|
||||
To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
|
||||
To run a GGUF model with vLLM, you can use the `repo_id:quant_type` format to load directly from HuggingFace. For example, to load a Q4_K_M quantized model from [unsloth/Qwen3-0.6B-GGUF](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF):
|
||||
|
||||
```bash
|
||||
wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
|
||||
# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
||||
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
|
||||
vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B
|
||||
```
|
||||
|
||||
You can also add `--tensor-parallel-size 2` to enable tensor-parallel inference with 2 GPUs:
|
||||
|
||||
```bash
|
||||
# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
||||
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
|
||||
vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
|
||||
--tokenizer Qwen/Qwen3-0.6B \
|
||||
--tensor-parallel-size 2
|
||||
```
|
||||
|
||||
Alternatively, you can download and use a local GGUF file:
|
||||
|
||||
```bash
|
||||
wget https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf
|
||||
vllm serve ./Qwen3-0.6B-Q4_K_M.gguf --tokenizer Qwen/Qwen3-0.6B
|
||||
```
|
||||
|
||||
!!! warning
|
||||
We recommend using the tokenizer from the base model instead of the GGUF model, because tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocab size.
|
||||
|
||||
GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path
|
||||
GGUF assumes that HuggingFace can convert the metadata to a config file. In case HuggingFace doesn't support your model, you can manually create a config and pass it via `--hf-config-path`:
|
||||
|
||||
```bash
|
||||
# If you model is not supported by huggingface you can manually provide a huggingface compatible config path
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
||||
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
|
||||
--hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0
|
||||
# If your model is not supported by HuggingFace you can manually provide a HuggingFace compatible config path
|
||||
vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
|
||||
--tokenizer Qwen/Qwen3-0.6B \
|
||||
--hf-config-path Qwen/Qwen3-0.6B
|
||||
```
|
||||
|
||||
You can also use the GGUF model directly through the LLM entrypoint:
|
||||
@@ -66,10 +70,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
# Create an LLM using repo_id:quant_type format.
|
||||
llm = LLM(
|
||||
model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
||||
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
model="unsloth/Qwen3-0.6B-GGUF:Q4_K_M",
|
||||
tokenizer="Qwen/Qwen3-0.6B",
|
||||
)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
|
||||
@@ -136,15 +136,31 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image.
|
||||
To pull the latest image:
|
||||
|
||||
Stable vLLM Docker images are being pre-built for Arm from version 0.12.0. Available image tags are here: [https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo).
|
||||
```bash
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest
|
||||
```
|
||||
|
||||
To pull an image with a specific vLLM version:
|
||||
|
||||
```bash
|
||||
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${VLLM_VERSION}
|
||||
```
|
||||
|
||||
All available image tags are here: [https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo).
|
||||
|
||||
You can run these images via:
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
-p 8000:8000 \
|
||||
--env "HF_TOKEN=<secret>" \
|
||||
public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:<tag> <args...>
|
||||
```
|
||||
|
||||
You can also access the latest code with Docker images. These are not intended for production use and are meant for CI and testing only. They will expire after several days.
|
||||
|
||||
The latest code can contain bugs and may not be stable. Please use it with caution.
|
||||
|
||||
@@ -161,7 +161,23 @@ uv pip install dist/*.whl
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
[https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
|
||||
You can pull the latest available CPU image via:
|
||||
|
||||
```bash
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
|
||||
```
|
||||
|
||||
If you want a more specific build, you can find all published CPU-based images here: [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
|
||||
|
||||
You can run these images via:
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
-p 8000:8000 \
|
||||
--env "HF_TOKEN=<secret>" \
|
||||
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:<tag> <args...>
|
||||
```
|
||||
|
||||
!!! warning
|
||||
If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. See the build-image-from-source section below for build arguments to match your target CPU capabilities.
|
||||
|
||||
@@ -31,7 +31,7 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/
|
||||
To install a specific version and ROCm variant of vLLM wheel.
|
||||
|
||||
```bash
|
||||
uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700
|
||||
uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.15.0/rocm700
|
||||
```
|
||||
|
||||
!!! warning "Caveats for using `pip`"
|
||||
@@ -41,7 +41,7 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700
|
||||
If you insist on using `pip`, you have to specify the exact vLLM version and full URL of the wheel path `https://wheels.vllm.ai/rocm/<version>/<rocm-variant>` (which can be obtained from the web page).
|
||||
|
||||
```bash
|
||||
pip install vllm==0.14.1+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700
|
||||
pip install vllm==0.15.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.15.0/rocm700
|
||||
```
|
||||
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
|
||||
@@ -307,6 +307,62 @@ An OpenAI client example can be found here: [examples/pooling/embed/openai_embed
|
||||
|
||||
## Specific models
|
||||
|
||||
### ColBERT Late Interaction Models
|
||||
|
||||
[ColBERT](https://arxiv.org/abs/2004.12832) (Contextualized Late Interaction over BERT) is a retrieval model that uses per-token embeddings and MaxSim scoring for document ranking. Unlike single-vector embedding models, ColBERT retains token-level representations and computes relevance scores through late interaction, providing better accuracy while being more efficient than cross-encoders.
|
||||
|
||||
vLLM supports ColBERT models for reranking tasks, automatically applying MaxSim scoring for query-document relevance:
|
||||
|
||||
```shell
|
||||
vllm serve answerdotai/answerai-colbert-small-v1
|
||||
```
|
||||
|
||||
Currently supports ColBERT models with standard BERT encoders (e.g., `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0`).
|
||||
|
||||
ColBERT models with modified encoder architectures are not yet supported, including BERT variants with rotary embeddings (e.g., `jinaai/jina-colbert-v2`) or other custom encoders (e.g., `LiquidAI/LFM2-ColBERT-350M`).
|
||||
|
||||
If your standard BERT ColBERT model's config doesn't specify the architecture as `HF_ColBERT`, override it with:
|
||||
|
||||
```shell
|
||||
vllm serve your-colbert-model --hf-overrides '{"architectures": ["HF_ColBERT"]}'
|
||||
```
|
||||
|
||||
Then you can use the rerank endpoint:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"query": "What is machine learning?",
|
||||
"documents": [
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Python is a programming language.",
|
||||
"Deep learning uses neural networks."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Or the score endpoint:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"text_1": "What is machine learning?",
|
||||
"text_2": ["Machine learning is a subset of AI.", "The weather is sunny."]
|
||||
}'
|
||||
```
|
||||
|
||||
You can also get the raw token embeddings using the pooling endpoint with `token_embed` task:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"input": "What is machine learning?",
|
||||
"task": "token_embed"
|
||||
}'
|
||||
```
|
||||
|
||||
An example can be found here: [examples/pooling/score/colbert_rerank_online.py](../../examples/pooling/score/colbert_rerank_online.py)
|
||||
|
||||
### BAAI/bge-m3
|
||||
|
||||
The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json`
|
||||
@@ -352,15 +408,6 @@ We have split the `encode` task into two more specific token-wise tasks: `token_
|
||||
- `token_embed` is the same as `embed`, using normalization as the activation.
|
||||
- `token_classify` is the same as `classify`, by default using softmax as the activation.
|
||||
|
||||
### Remove softmax from PoolingParams
|
||||
|
||||
We are going to remove `softmax` and `activation` from `PoolingParams` in v0.15. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.
|
||||
|
||||
### as_reward_model
|
||||
|
||||
!!! warning
|
||||
We are going to remove `--convert reward` in v0.15; use `--convert embed` instead.
|
||||
|
||||
Pooling models now support all pooling tasks by default; you can use them without any extra settings.
|
||||
|
||||
- Extracting hidden states is best done with the `token_embed` task.
|
||||
|
||||
@@ -365,6 +365,7 @@ th {
|
||||
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
|
||||
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `thu-coai/ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
|
||||
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ |
|
||||
| `CwmForCausalLM` | CWM | `facebook/cwm`, etc. | ✅︎ | ✅︎ |
|
||||
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ |
|
||||
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ |
|
||||
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ |
|
||||
@@ -375,7 +376,7 @@ th {
|
||||
| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ |
|
||||
| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ |
|
||||
| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `ExaoneMoeCausalLM` | K-EXAONE | `LGAI-EXAONE/K-EXAONE-236B-A23B`, etc. | | |
|
||||
| `ExaoneMoEForCausalLM` | K-EXAONE | `LGAI-EXAONE/K-EXAONE-236B-A23B`, etc. | | |
|
||||
| `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ |
|
||||
| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ |
|
||||
@@ -389,6 +390,7 @@ th {
|
||||
| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6, GLM-4.7 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4MoeLiteForCausalLM` | GLM-4.7-Flash | `zai-org/GLM-4.7-Flash`, etc. | ✅︎ | ✅︎ |
|
||||
| `GPT2LMHeadModel` | GPT-2 | `openai-community/gpt2`, `openai-community/gpt2-xl`, etc. | | ✅︎ |
|
||||
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ |
|
||||
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ |
|
||||
@@ -403,7 +405,6 @@ th {
|
||||
| `Grok1ForCausalLM` | Grok2 | `xai-org/grok-2` | ✅︎ | ✅︎ |
|
||||
| `HunYuanDenseV1ForCausalLM` | Hunyuan Dense | `tencent/Hunyuan-7B-Instruct` | ✅︎ | ✅︎ |
|
||||
| `HunYuanMoEV1ForCausalLM` | Hunyuan-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ |
|
||||
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
|
||||
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
@@ -416,12 +417,14 @@ th {
|
||||
| `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ |
|
||||
| `Lfm2MoeForCausalLM` | LFM2MoE | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ |
|
||||
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ |
|
||||
| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ |
|
||||
| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ |
|
||||
| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | | ✅︎ |
|
||||
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniMaxForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01-hf`, etc. | | |
|
||||
| `MiniMaxM2ForCausalLM` | MiniMax-M2, MiniMax-M2.1 |`MiniMaxAI/MiniMax-M2`, etc. | ✅︎ | ✅︎ |
|
||||
| `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
|
||||
@@ -429,10 +432,10 @@ th {
|
||||
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
|
||||
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ |
|
||||
| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMo3ForCausalLM` | OLMo3 | `allenai/Olmo-3-7B-Instruct`, `allenai/Olmo-3-32B-Think`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ |
|
||||
| `OlmoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `Olmo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Olmo3ForCausalLM` | OLMo3 | `allenai/Olmo-3-7B-Instruct`, `allenai/Olmo-3-32B-Think`, etc. | ✅︎ | ✅︎ |
|
||||
| `OlmoeForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ |
|
||||
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ |
|
||||
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ |
|
||||
| `OuroForCausalLM` | ouro | `ByteDance/Ouro-1.4B`, `ByteDance/Ouro-2.6B`, etc. | ✅︎ | |
|
||||
@@ -451,18 +454,21 @@ th {
|
||||
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `RWForCausalLM` | Falcon RW | `tiiuae/falcon-40b`, etc. | | ✅︎ |
|
||||
| `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |
|
||||
| `StableLMEpochForCausalLM` | StableLM Epoch | `stabilityai/stablelm-zephyr-3b`, etc. | | ✅︎ |
|
||||
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ |
|
||||
| `Step1ForCausalLM` | Step-Audio | `stepfun-ai/Step-Audio-EditX`, etc. | ✅︎ | ✅︎ |
|
||||
| `Step3p5ForCausalLM` | Step-3.5-flash | `stepfun-ai/step-3.5-flash`, etc. | | ✅︎ |
|
||||
| `TeleChatForCausalLM` | TeleChat | `chuhac/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
|
||||
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
|
||||
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ |
|
||||
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | |
|
||||
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | |
|
||||
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | |
|
||||
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ |
|
||||
|
||||
!!! note
|
||||
Grok2 requires `tokenizer.tok.json` with `tiktoken` installed. You can optionally override MoE router renormalization with `moe_router_renormalize`.
|
||||
@@ -657,7 +663,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|
|
||||
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
|
||||
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-hf` | ✅︎ | ✅︎ |
|
||||
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-2601-hf` | ✅︎ | ✅︎ |
|
||||
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
|
||||
| `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
|
||||
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
|
||||
@@ -666,6 +672,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
|
||||
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
|
||||
| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | ✅︎ | ✅︎ |
|
||||
| `DeepseekOCR2ForCausalLM` | DeepSeek-OCR-2 | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR-2`, etc. | ✅︎ | ✅︎ |
|
||||
| `Eagle2_5_VLForConditionalGeneration` | Eagle2.5-VL | T + I<sup>E+</sup> | `nvidia/Eagle2.5-8B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
|
||||
@@ -676,11 +683,13 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ |
|
||||
| `GlmOcrForConditionalGeneration` | GLM-OCR | T + I<sup>E+</sup> | `zai-org/GLM-OCR`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
|
||||
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
|
||||
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
|
||||
| `IsaacForConditionalGeneration` | Isaac | T + I<sup>+</sup> | `PerceptronAI/Isaac-0.1` | ✅︎ | ✅︎ |
|
||||
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternS1ProForConditionalGeneration` | Intern-S1-Pro | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1-Pro`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `KananaVForConditionalGeneration` | Kanana-V | T + I<sup>+</sup> | `kakaocorp/kanana-1.5-v-3b-instruct`, etc. | | ✅︎ |
|
||||
@@ -705,6 +714,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `Molmo2ForConditionalGeneration` | Molmo2 | T + I<sup>+</sup> / V | `allenai/Molmo2-4B`, `allenai/Molmo2-8B`, `allenai/Molmo2-O-7B` | ✅︎ | ✅︎ |
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
|
||||
| `OpenCUAForConditionalGeneration` | OpenCUA-7B | T + I<sup>E+</sup> | `xlangai/OpenCUA-7B` | ✅︎ | ✅︎ |
|
||||
| `OpenPanguVLForConditionalGeneration` | openpangu-VL | T + I<sup>E+</sup> + V<sup>E+</sup> |`FreedomIntelligence/openPangu-VL-7B` | ✅︎ | ✅︎ |
|
||||
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
|
||||
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
|
||||
| `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
|
||||
@@ -771,6 +781,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
|
||||
| `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3ASRForConditionalGeneration` | Qwen3-ASR | `Qwen/Qwen3-ASR-1.7B`, etc. | | ✅︎ |
|
||||
| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, etc. | | ✅︎ |
|
||||
| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ |
|
||||
| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | |
|
||||
|
||||
|
||||
@@ -59,6 +59,8 @@ We currently support the following OpenAI APIs:
|
||||
- Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
|
||||
- [Translation API](#translations-api) (`/v1/audio/translations`)
|
||||
- Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
|
||||
- [Realtime API](#realtime-api) (`/v1/realtime`)
|
||||
- Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
|
||||
|
||||
In addition, we have the following custom APIs:
|
||||
|
||||
@@ -567,6 +569,52 @@ The following extra parameters are supported:
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
|
||||
```
|
||||
|
||||
### Realtime API
|
||||
|
||||
The Realtime API provides WebSocket-based streaming audio transcription, allowing real-time speech-to-text as audio is being recorded.
|
||||
|
||||
!!! note
|
||||
To use the Realtime API, please install with extra audio dependencies using `uv pip install vllm[audio]`.
|
||||
|
||||
#### Audio Format
|
||||
|
||||
Audio must be sent as base64-encoded PCM16 audio at 16kHz sample rate, mono channel.
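
One way to produce audio in this format, assuming `ffmpeg` and the GNU `base64` utility are available (neither is a vLLM dependency), is:

```bash
# Convert any input file to 16 kHz, mono, signed 16-bit little-endian PCM,
# then base64-encode it for the input_audio_buffer.append payload.
ffmpeg -i input.wav -f s16le -acodec pcm_s16le -ac 1 -ar 16000 audio.pcm
base64 -w0 audio.pcm > audio.b64
```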
|
||||
|
||||
#### Protocol Overview
|
||||
|
||||
1. Client connects to `ws://host/v1/realtime`
|
||||
2. Server sends `session.created` event
|
||||
3. Client optionally sends `session.update` with model/params
|
||||
4. Client sends `input_audio_buffer.commit` when ready
|
||||
5. Client sends `input_audio_buffer.append` events with base64 PCM16 chunks
|
||||
6. Server sends `transcription.delta` events with incremental text
|
||||
7. Server sends `transcription.done` with final text + usage
|
||||
8. Repeat from step 5 for the next utterance
9. Optionally, the client sends `input_audio_buffer.commit` with `"final": true` to signal that audio input is finished. Useful when streaming audio files.
|
||||
|
||||
#### Client → Server Events
|
||||
|
||||
| Event | Description |
|
||||
|-------|-------------|
|
||||
| `input_audio_buffer.append` | Send base64-encoded audio chunk: `{"type": "input_audio_buffer.append", "audio": "<base64>"}` |
|
||||
| `input_audio_buffer.commit` | Trigger transcription processing or end: `{"type": "input_audio_buffer.commit", "final": bool}` |
|
||||
| `session.update` | Configure session: `{"type": "session.update", "model": "model-name"}` |
|
||||
|
||||
#### Server → Client Events
|
||||
|
||||
| Event | Description |
|
||||
|-------|-------------|
|
||||
| `session.created` | Connection established with session ID and timestamp |
|
||||
| `transcription.delta` | Incremental transcription text: `{"type": "transcription.delta", "delta": "text"}` |
|
||||
| `transcription.done` | Final transcription with usage stats |
|
||||
| `error` | Error notification with message and optional code |
|
||||
|
||||
#### Example Clients
|
||||
|
||||
- [openai_realtime_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_client.py) - Upload and transcribe an audio file
|
||||
- [openai_realtime_microphone_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_microphone_client.py) - Gradio demo for live microphone transcription
|
||||
|
||||
### Tokenizer API
|
||||
|
||||
Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer).
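
As a hedged sketch (the exact request and response fields may differ between vLLM versions, and the model name is a placeholder), a tokenization request might look like:

```bash
curl -s http://localhost:8000/tokenize \
    -H "Content-Type: application/json" \
    -d '{"model": "meta-llama/Llama-3.2-1B-Instruct", "prompt": "Hello, world!"}'
```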
|
||||
|
||||
@@ -172,12 +172,13 @@ These endpoints are **only available when the environment variable `VLLM_SERVER_
|
||||
- `/server_info` - Get detailed server configuration
|
||||
- `/reset_prefix_cache` - Reset prefix cache (can disrupt service)
|
||||
- `/reset_mm_cache` - Reset multimodal cache (can disrupt service)
|
||||
- `/reset_encoder_cache` - Reset encoder cache (can disrupt service)
|
||||
- `/sleep` - Put engine to sleep (causes denial of service)
|
||||
- `/wake_up` - Wake engine from sleep
|
||||
- `/is_sleeping` - Check if engine is sleeping
|
||||
- `/collective_rpc` - Execute arbitrary RPC methods on the engine (extremely dangerous)
|
||||
|
||||
**Profiler endpoints (only when `VLLM_TORCH_PROFILER_DIR` or `VLLM_TORCH_CUDA_PROFILE` are set):**
|
||||
**Profiler endpoints (only when profiling is enabled via `--profiler-config`):**
|
||||
|
||||
These endpoints are only available when profiling is enabled and should only be used for local development:
|
||||
|
||||
@@ -206,7 +207,7 @@ An attacker who can reach the vLLM HTTP server can:
|
||||
- Cache manipulation that can disrupt service
|
||||
- Detailed server configuration disclosure
|
||||
|
||||
Similarly, never enable profiler endpoints (`VLLM_TORCH_PROFILER_DIR` or `VLLM_TORCH_CUDA_PROFILE`) in production.
|
||||
Similarly, never enable profiler endpoints in production.
|
||||
|
||||
**Be cautious with `--enable-tokenizer-info-endpoint`:** Only enable the `/tokenizer_info` endpoint if you need to expose tokenizer configuration information. This endpoint reveals chat templates and tokenizer settings that may contain sensitive implementation details or prompt engineering strategies.
|
||||
|
||||
|
||||
@@ -70,6 +70,34 @@ def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# MusicFlamingo
|
||||
def run_musicflamingo(question: str, audio_count: int) -> ModelRequestData:
|
||||
model_name = "nvidia/music-flamingo-2601-hf"
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
# MusicFlamingo uses <sound> token for audio
|
||||
audio_placeholder = "<sound>" * audio_count
|
||||
|
||||
prompt = (
|
||||
"<|im_start|>system\n"
|
||||
"You are a helpful assistant.<|im_end|>\n"
|
||||
"<|im_start|>user\n"
|
||||
f"{audio_placeholder}{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n"
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
)
|
||||
|
||||
|
||||
# Gemma3N
|
||||
def run_gemma3n(question: str, audio_count: int) -> ModelRequestData:
|
||||
model_name = "google/gemma-3n-E2B-it"
|
||||
@@ -452,6 +480,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
|
||||
|
||||
model_example_map = {
|
||||
"audioflamingo3": run_audioflamingo3,
|
||||
"musicflamingo": run_musicflamingo,
|
||||
"gemma3n": run_gemma3n,
|
||||
"glmasr": run_glmasr,
|
||||
"funaudiochat": run_funaudiochat,
|
||||
|
||||
@@ -56,17 +56,10 @@ Try it yourself with the following argument:
|
||||
|
||||
vLLM supports models that are quantized using GGUF.
|
||||
|
||||
Try one yourself by downloading a quantized GGUF model and using the following arguments:
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download
|
||||
repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
|
||||
filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
|
||||
print(hf_hub_download(repo_id, filename=filename))
|
||||
```
|
||||
Try one yourself using the `repo_id:quant_type` format to load directly from HuggingFace:
|
||||
|
||||
```bash
|
||||
--model {local-path-printed-above} --tokenizer microsoft/Phi-3-medium-4k-instruct
|
||||
--model unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B
|
||||
```
|
||||
|
||||
### CPU offload
|
||||
|
||||
@@ -38,8 +38,8 @@ def get_prompt_embeds(
|
||||
embedding_layer: torch.nn.Module,
|
||||
):
|
||||
token_ids = tokenizer.apply_chat_template(
|
||||
chat, add_generation_prompt=True, return_tensors="pt"
|
||||
)
|
||||
chat, add_generation_prompt=True, return_tensors="pt", return_dict=True
|
||||
).input_ids
|
||||
prompt_embeds = embedding_layer(token_ids).squeeze(0)
|
||||
return prompt_embeds
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@ https://docs.ray.io/en/latest/placement-groups.html
|
||||
|
||||
import gc
|
||||
import os
|
||||
import sys
|
||||
|
||||
import ray
|
||||
import torch
|
||||
@@ -40,6 +41,10 @@ from torch.multiprocessing.reductions import reduce_tensor
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
if torch.version.hip is not None:
|
||||
print("Skipping test for ROCm. Ray is unsupported on vLLM ROCm.")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
class MyLLM(LLM):
|
||||
"""Configure the vLLM worker for Ray placement group execution.
|
||||
|
||||
@@ -5,7 +5,6 @@ from transformers import AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.benchmarks.datasets import add_dataset_parser, get_samples
|
||||
from vllm.inputs import TokensPrompt
|
||||
from vllm.v1.metrics.reader import Counter, Vector
|
||||
|
||||
try:
|
||||
@@ -56,6 +55,7 @@ def parse_args():
|
||||
default="eagle",
|
||||
choices=["ngram", "eagle", "eagle3", "mtp", "draft_model"],
|
||||
)
|
||||
parser.add_argument("--backend", type=str, default="openai")
|
||||
parser.add_argument("--num-spec-tokens", type=int, default=2)
|
||||
parser.add_argument("--prompt-lookup-max", type=int, default=5)
|
||||
parser.add_argument("--prompt-lookup-min", type=int, default=2)
|
||||
@@ -75,12 +75,11 @@ def parse_args():
|
||||
parser.add_argument("--gpu-memory-utilization", type=float, default=0.9)
|
||||
parser.add_argument("--disable-padded-drafter-batch", action="store_true")
|
||||
parser.add_argument("--max-num-seqs", type=int, default=None)
|
||||
parser.add_argument("--allowed-local-media-path", type=str, default="")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
args.endpoint_type = "openai-chat"
|
||||
|
||||
model_dir = args.model_dir
|
||||
if args.model_dir is None:
|
||||
if args.custom_mm_prompts:
|
||||
@@ -91,19 +90,25 @@ def main(args):
|
||||
)
|
||||
model_dir = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_dir)
|
||||
args.custom_skip_chat_template = True
|
||||
|
||||
if not args.custom_mm_prompts:
|
||||
prompts = get_samples(args, tokenizer)
|
||||
# add_special_tokens is False to avoid adding bos twice
|
||||
# when using chat templates
|
||||
prompt_ids = [
|
||||
tokenizer.encode(prompt.prompt, add_special_tokens=False)
|
||||
for prompt in prompts
|
||||
]
|
||||
if args.custom_mm_prompts:
|
||||
prompts = llm_prompts = get_custom_mm_prompts(args.num_prompts)
|
||||
else:
|
||||
prompts = get_custom_mm_prompts(args.num_prompts)
|
||||
|
||||
prompts = get_samples(args, tokenizer)
|
||||
if args.enable_multimodal_chat:
|
||||
llm_prompts = [p.prompt for p in prompts]
|
||||
else:
|
||||
# add_special_tokens is False to avoid adding bos twice
|
||||
# when using chat templates
|
||||
llm_prompts = [
|
||||
{
|
||||
"prompt_token_ids": tokenizer.encode(
|
||||
prompt.prompt, add_special_tokens=False
|
||||
),
|
||||
"multi_modal_data": prompt.multi_modal_data,
|
||||
}
|
||||
for prompt in prompts
|
||||
]
|
||||
if args.method == "eagle" or args.method == "eagle3":
|
||||
eagle_dir = args.eagle_dir
|
||||
if args.method == "eagle" and eagle_dir is None:
|
||||
@@ -154,16 +159,17 @@ def main(args):
|
||||
limit_mm_per_prompt={"image": 5},
|
||||
disable_chunked_mm_input=True,
|
||||
max_num_seqs=args.max_num_seqs,
|
||||
allowed_local_media_path=args.allowed_local_media_path,
|
||||
)
|
||||
|
||||
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
|
||||
if not args.custom_mm_prompts:
|
||||
if args.backend == "openai-chat":
|
||||
outputs = llm.chat(llm_prompts, sampling_params=sampling_params)
|
||||
else:
|
||||
outputs = llm.generate(
|
||||
[TokensPrompt(prompt_token_ids=x) for x in prompt_ids],
|
||||
llm_prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
else:
|
||||
outputs = llm.chat(prompts, sampling_params=sampling_params)
|
||||
|
||||
# print the generated text
|
||||
if args.print_output:
|
||||
@@ -219,6 +225,8 @@ def main(args):
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
args.enable_multimodal_chat = args.backend == "openai-chat"
|
||||
|
||||
acceptance_length = main(args)
|
||||
|
||||
if args.test:
|
||||
|
||||
@@ -270,6 +270,49 @@ def run_deepseek_ocr(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_deepseek_ocr2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
|
||||
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "deepseek-ai/DeepSeek-OCR-2"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
logits_processors=[NGramPerReqLogitsProcessor],
|
||||
)
|
||||
|
||||
# deepseek-ocr use plain prompt template
|
||||
prompts = [f"<image>\n{question}" for question in questions]
|
||||
|
||||
# The following sampling params config is taken from
|
||||
# the official Deepseek-OCR inference example.
|
||||
# (IMPORTANT) Use the custom logits processor and avoid skipping
|
||||
# special tokens for this model for the optimal OCR performance.
|
||||
sampling_params = [
|
||||
SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=8192,
|
||||
# ngram logit processor args
|
||||
extra_args=dict(
|
||||
ngram_size=30,
|
||||
window_size=90,
|
||||
# whitelist: <td>, </td>
|
||||
whitelist_token_ids={128821, 128822},
|
||||
),
|
||||
skip_special_tokens=False,
|
||||
)
|
||||
for _ in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
|
||||
# Dots-OCR
|
||||
def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@@ -799,6 +842,40 @@ def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Intern-S1-Pro
|
||||
def run_interns1_pro(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "internlm/Intern-S1-Pro"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=4,
|
||||
)
|
||||
|
||||
if modality == "image":
|
||||
placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
elif modality == "video":
|
||||
placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
||||
messages = [
|
||||
[{"role": "user", "content": f"{placeholder}\n{question}"}]
|
||||
for question in questions
|
||||
]
|
||||
prompts = tokenizer.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# InternVL
|
||||
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "OpenGVLab/InternVL3-2B"
|
||||
@@ -952,6 +1029,31 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Kimi-K2.5
|
||||
def run_kimi_k25(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "vision_chunk"
|
||||
|
||||
prompts = [
|
||||
"<|im_user|>user<|media_begin|>image<|media_content|>"
|
||||
f"<|media_pad|><|media_end|>{question}<|im_end|>"
|
||||
"<|im_assistant|>assistant<|im_middle|>"
|
||||
for question in questions
|
||||
]
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="moonshotai/Kimi-K2.5",
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
tensor_parallel_size=4,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# LightOnOCR
|
||||
def run_lightonocr(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@@ -1394,6 +1496,37 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    )


# OpenPangu
def run_openpangu_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "FreedomIntelligence/openPangu-VL-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=4,
        trust_remote_code=True,
        enforce_eager=True,
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "[unused19]"
    elif modality == "video":
        placeholder = "[unused32]"

    # Template role markers: 系统 = system, 用户 = user, 助手 = assistant
    prompts = [
        (
            f"<s>[unused9]系统:[unused10][unused9]用户:[unused18]{placeholder}[unused20]{question}[unused10][unused9]助手:"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Ovis
def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -2014,6 +2147,7 @@ model_example_map = {
    "command_a_vision": run_command_a_vision,
    "deepseek_vl_v2": run_deepseek_vl2,
    "deepseek_ocr": run_deepseek_ocr,
    "deepseek_ocr2": run_deepseek_ocr2,
    "dots_ocr": run_dots_ocr,
    "eagle2_5": run_eagle2_5,
    "ernie45_vl": run_ernie45_vl,
@@ -2030,11 +2164,13 @@ model_example_map = {
    "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
    "idefics3": run_idefics3,
    "interns1": run_interns1,
    "interns1_pro": run_interns1_pro,
    "internvl_chat": run_internvl,
    "kanana_v": run_kanana_v,
    "keye_vl": run_keye_vl,
    "keye_vl1_5": run_keye_vl1_5,
    "kimi_vl": run_kimi_vl,
    "kimi_k25": run_kimi_k25,
    "lightonocr": run_lightonocr,
    "lfm2_vl": run_lfm2_vl,
    "llama4": run_llama4,
@@ -2051,6 +2187,7 @@ model_example_map = {
    "molmo2": run_molmo2,
    "nemotron_vl": run_nemotron_vl,
    "NVLM_D": run_nvlm_d,
    "openpangu_vl": run_openpangu_vl,
    "ovis": run_ovis,
    "ovis2_5": run_ovis2_5,
    "paddleocr_vl": run_paddleocr_vl,
@@ -2120,6 +2257,19 @@ def get_multi_modal_input(args):
            "questions": vid_questions,
        }

    if args.modality == "vision_chunk":
        # Input vision chunks and question
        image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
        vision_chunk_questions = [
            "What is the content of this image chunk?",
            "Describe the content of this image chunk in detail.",
        ]

        return {
            "data": {"type": "image", "image": image},
            "questions": vision_chunk_questions,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)

@@ -2202,7 +2352,7 @@ def parse_args():
        "--modality",
        type=str,
        default="image",
        choices=["image", "video"],
        choices=["image", "video", "vision_chunk"],
        help="Modality of the input.",
    )
    parser.add_argument(
@@ -2279,7 +2429,7 @@ def main(args):
    req_data = model_example_map[model](questions, modality)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    default_limits = {"image": 0, "video": 0, "audio": 0, "vision_chunk": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )
@@ -765,6 +765,32 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    )


# OpenPangu
def load_openpangu_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "FreedomIntelligence/openPangu-VL-7B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "[unused18][unused19][unused20]" * len(image_urls)
    # Template role markers: 系统 = system, 用户 = user, 助手 = assistant
    prompt = (
        f"<s>[unused9]系统:[unused10][unused9]用户:{question}{placeholders}"
        "[unused10][unused9]助手:"
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "AIDC-AI/Ovis2-1B"
@@ -1257,6 +1283,42 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
    )


# GLM-4.1V
def load_glm4_1v(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "zai-org/GLM-4.1V-9B-Thinking"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=45082,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_data = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )


# GLM-4.5V
def load_glm4_5v(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "zai-org/GLM-4.5V"
@@ -1388,6 +1450,7 @@ model_example_map = {
    "mistral3": load_mistral3,
    "molmo2": load_molmo2,
    "NVLM_D": load_nvlm_d,
    "openpangu_vl": load_openpangu_vl,
    "ovis": load_ovis,
    "ovis2_5": load_ovis2_5,
    "paddleocr_vl": load_paddleocr_vl,
@@ -1403,6 +1466,7 @@ model_example_map = {
    "stepvl": load_step_vl,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
    "glm4_1v": load_glm4_1v,
    "glm4_5v": load_glm4_5v,
    "glm4_5v_fp8": load_glm4_5v_fp8,
}

@@ -6,3 +6,4 @@ This example contains scripts that demonstrate the disaggregated serving feature

- `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances).
- `kv_events.sh` - Demonstrates KV cache event publishing.
- `mooncake_connector` - A proxy demo for MooncakeConnector (see the example invocation below).
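A minimal sketch of launching the Mooncake proxy by hand, assuming one prefill and one
decode vLLM server are already running locally; the ports and bootstrap port below are
illustrative and mirror the defaults used in the accompanying launch script:

    python3 mooncake_connector_proxy.py \
        --prefill http://127.0.0.1:8010 8998 \
        --decode http://127.0.0.1:8020 \
        --port 8000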
@@ -0,0 +1,376 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import ipaddress
|
||||
import itertools
|
||||
import os
|
||||
import urllib
|
||||
import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
|
||||
def maybe_wrap_ipv6_address(address: str) -> str:
|
||||
try:
|
||||
ipaddress.IPv6Address(address)
|
||||
return f"[{address}]"
|
||||
except ValueError:
|
||||
return address
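# Illustrative note (added for clarity, not part of the original file): the helper
# leaves IPv4 addresses and hostnames untouched and only brackets IPv6 literals, e.g.
#   maybe_wrap_ipv6_address("10.0.0.1") -> "10.0.0.1"
#   maybe_wrap_ipv6_address("::1")      -> "[::1]"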
|
||||
|
||||
|
||||
def make_http_path(host: str, port: int) -> str:
|
||||
return f"http://{host}:{port}"
|
||||
|
||||
|
||||
def prefiller_cycle(prefill_clients: list[Any]):
|
||||
while True:
|
||||
for prefill_client in prefill_clients:
|
||||
for i in range(prefill_client["dp_size"]):
|
||||
yield prefill_client, i
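# Illustrative note (added for clarity, not part of the original file): with two
# prefill clients p0 (dp_size=2) and p1 (dp_size=1), the generator yields
#   (p0, 0), (p0, 1), (p1, 0), (p0, 0), ...
# i.e. round-robin over clients and their data-parallel ranks.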
|
||||
|
||||
|
||||
async def get_prefiller_info(prefill_clients: list, ready: asyncio.Event):
|
||||
for prefill_client in prefill_clients:
|
||||
while True:
|
||||
try:
|
||||
# Wait for prefill service to be ready
|
||||
response = await prefill_client["client"].get("/health")
|
||||
response.raise_for_status()
|
||||
except Exception:
|
||||
await asyncio.sleep(1)
|
||||
continue
|
||||
|
||||
response = await prefill_client["client"].get(
|
||||
prefill_client["bootstrap_addr"] + "/query"
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
break
|
||||
|
||||
for dp_rank, dp_entry in data.items():
|
||||
prefill_client["dp_engine_id"][int(dp_rank)] = dp_entry["engine_id"]
|
||||
dp_size = len(data)
|
||||
prefill_client["dp_size"] = dp_size
|
||||
print(f"Inited prefiller {prefill_client['url']} with dp_size={dp_size}")
|
||||
|
||||
ready.set()
|
||||
print("All prefiller instances are ready.")
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""
|
||||
Lifespan context manager to handle startup and shutdown events.
|
||||
"""
|
||||
# Startup: Initialize client pools for prefiller and decoder services
|
||||
app.state.prefill_clients = []
|
||||
app.state.decode_clients = []
|
||||
app.state.ready = asyncio.Event()
|
||||
|
||||
# Create prefill clients
|
||||
for i, (url, bootstrap_port) in enumerate(global_args.prefill):
|
||||
parsed_url = urllib.parse.urlparse(url)
|
||||
hostname = maybe_wrap_ipv6_address(parsed_url.hostname)
|
||||
app.state.prefill_clients.append(
|
||||
{
|
||||
"client": httpx.AsyncClient(
|
||||
timeout=None,
|
||||
base_url=url,
|
||||
limits=httpx.Limits(
|
||||
max_connections=None,
|
||||
max_keepalive_connections=None,
|
||||
),
|
||||
),
|
||||
"url": url,
|
||||
"bootstrap_addr": make_http_path(hostname, bootstrap_port or 8998),
|
||||
"dp_engine_id": {},
|
||||
}
|
||||
)
|
||||
|
||||
# Create decode clients
|
||||
for i, url in enumerate(global_args.decode):
|
||||
parsed_url = urllib.parse.urlparse(url)
|
||||
hostname = maybe_wrap_ipv6_address(parsed_url.hostname)
|
||||
app.state.decode_clients.append(
|
||||
{
|
||||
"client": httpx.AsyncClient(
|
||||
timeout=None,
|
||||
base_url=url,
|
||||
limits=httpx.Limits(
|
||||
max_connections=None,
|
||||
max_keepalive_connections=None,
|
||||
),
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
asyncio.create_task(get_prefiller_info(app.state.prefill_clients, app.state.ready))
|
||||
|
||||
# Initialize round-robin iterators
|
||||
app.state.prefill_iterator = prefiller_cycle(app.state.prefill_clients)
|
||||
app.state.decode_iterator = itertools.cycle(range(len(app.state.decode_clients)))
|
||||
|
||||
print(
|
||||
f"Got {len(app.state.prefill_clients)} prefill clients "
|
||||
f"and {len(app.state.decode_clients)} decode clients."
|
||||
)
|
||||
|
||||
yield
|
||||
|
||||
# Shutdown: Close all clients
|
||||
for client_info in app.state.prefill_clients:
|
||||
await client_info["client"].aclose()
|
||||
|
||||
for client_info in app.state.decode_clients:
|
||||
await client_info["client"].aclose()
|
||||
|
||||
|
||||
# Update FastAPI app initialization to use lifespan
|
||||
app = FastAPI(lifespan=lifespan)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
# Always use 127.0.0.1 as localhost binds to IPv6 which is blocked on CI
|
||||
parser.add_argument("--host", type=str, default="127.0.0.1")
|
||||
|
||||
# For prefiller instances
|
||||
parser.add_argument(
|
||||
"--prefill",
|
||||
nargs="+",
|
||||
action="append",
|
||||
dest="prefill_raw",
|
||||
metavar=("URL", "bootstrap_port"),
|
||||
help=(
|
||||
"Prefill server URL and optional bootstrap port. "
|
||||
"Can be specified multiple times. "
|
||||
"Format: --prefill URL [BOOTSTRAP_PORT]. "
|
||||
"BOOTSTRAP_PORT can be a port number, "
|
||||
"'none', or omitted (defaults to none)."
|
||||
),
|
||||
)
|
||||
|
||||
# For decoder instances
|
||||
parser.add_argument(
|
||||
"--decode",
|
||||
nargs=1,
|
||||
action="append",
|
||||
dest="decode_raw",
|
||||
metavar=("URL",),
|
||||
help="Decode server URL. Can be specified multiple times.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
args.prefill = _parse_prefill_urls(args.prefill_raw)
|
||||
args.decode = _parse_decode_urls(args.decode_raw)
|
||||
|
||||
return args
|
||||
|
||||
|
||||
# From sglang router_args.py
|
||||
def _parse_prefill_urls(prefill_list):
|
||||
"""Parse prefill URLs from --prefill arguments.
|
||||
|
||||
Format: --prefill URL [BOOTSTRAP_PORT]
|
||||
Example:
|
||||
--prefill http://prefill1:8080 9000 # With bootstrap port
|
||||
--prefill http://prefill2:8080 none # Explicitly no bootstrap port
|
||||
--prefill http://prefill3:8080 # Defaults to no bootstrap port
|
||||
"""
|
||||
if not prefill_list:
|
||||
return []
|
||||
|
||||
prefill_urls = []
|
||||
for prefill_args in prefill_list:
|
||||
url = prefill_args[0]
|
||||
|
||||
# Handle optional bootstrap port
|
||||
if len(prefill_args) >= 2:
|
||||
bootstrap_port_str = prefill_args[1]
|
||||
# Handle 'none' as None
|
||||
if bootstrap_port_str.lower() == "none":
|
||||
bootstrap_port = None
|
||||
else:
|
||||
try:
|
||||
bootstrap_port = int(bootstrap_port_str)
|
||||
except ValueError as e:
|
||||
raise ValueError(
|
||||
f"Invalid bootstrap port: {bootstrap_port_str}. Must be a number or 'none'" # noqa: E501
|
||||
) from e
|
||||
else:
|
||||
# No bootstrap port specified, default to None
|
||||
bootstrap_port = None
|
||||
|
||||
prefill_urls.append((url, bootstrap_port))
|
||||
|
||||
return prefill_urls
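# Example (added for clarity, not part of the original file): given
#   --prefill http://prefill1:8080 9000 --prefill http://prefill2:8080
# argparse collects [["http://prefill1:8080", "9000"], ["http://prefill2:8080"]],
# and _parse_prefill_urls returns
#   [("http://prefill1:8080", 9000), ("http://prefill2:8080", None)].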
|
||||
|
||||
|
||||
def _parse_decode_urls(decode_list):
|
||||
"""Parse decode URLs from --decode arguments.
|
||||
|
||||
Format: --decode URL
|
||||
Example: --decode http://decode1:8081 --decode http://decode2:8081
|
||||
"""
|
||||
if not decode_list:
|
||||
return []
|
||||
|
||||
# decode_list is a list of single-element lists due to nargs=1
|
||||
return [url[0] for url in decode_list]
|
||||
|
||||
|
||||
def get_next_client(app, service_type: str):
|
||||
"""
|
||||
Get the next client in round-robin fashion.
|
||||
|
||||
Args:
|
||||
app: The FastAPI app instance
|
||||
service_type: Either 'prefill' or 'decode'
|
||||
|
||||
Returns:
|
||||
The next client to use
|
||||
"""
|
||||
if service_type == "prefill":
|
||||
return next(app.state.prefill_iterator)
|
||||
elif service_type == "decode":
|
||||
client_idx = next(app.state.decode_iterator)
|
||||
return app.state.decode_clients[client_idx]
|
||||
else:
|
||||
raise ValueError(f"Unknown service type: {service_type}")
|
||||
|
||||
|
||||
async def send_request_to_service(
|
||||
client_info: dict, dp_rank: int, endpoint: str, req_data: dict, request_id: str
|
||||
):
|
||||
"""
|
||||
Send a request to a service using a client from the pool.
|
||||
"""
|
||||
req_data = req_data.copy()
|
||||
req_data["kv_transfer_params"] = {
|
||||
"do_remote_decode": True,
|
||||
"do_remote_prefill": False,
|
||||
"transfer_id": f"xfer-{request_id}",
|
||||
}
|
||||
req_data["stream"] = False
|
||||
req_data["max_tokens"] = 1
|
||||
if "max_completion_tokens" in req_data:
|
||||
req_data["max_completion_tokens"] = 1
|
||||
if "stream_options" in req_data:
|
||||
del req_data["stream_options"]
|
||||
headers = {
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||
"X-Request-Id": request_id,
|
||||
"X-data-parallel-rank": str(dp_rank),
|
||||
}
|
||||
|
||||
response = await client_info["client"].post(
|
||||
endpoint, json=req_data, headers=headers
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# CRITICAL: Release connection back to pool
|
||||
await response.aclose()
|
||||
|
||||
|
||||
async def stream_service_response(
|
||||
prefill_client_info: dict,
|
||||
prefill_dp_rank: int,
|
||||
decode_client_info: dict,
|
||||
endpoint: str,
|
||||
req_data: dict,
|
||||
request_id: str,
|
||||
):
|
||||
"""
|
||||
Asynchronously stream response from a service using a client from the pool.
|
||||
"""
|
||||
headers = {
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||
"X-Request-Id": request_id,
|
||||
}
|
||||
|
||||
req_data["kv_transfer_params"] = {
|
||||
"do_remote_decode": False,
|
||||
"do_remote_prefill": True,
|
||||
"remote_bootstrap_addr": prefill_client_info["bootstrap_addr"],
|
||||
"remote_engine_id": prefill_client_info["dp_engine_id"][prefill_dp_rank],
|
||||
"transfer_id": f"xfer-{request_id}",
|
||||
}
|
||||
|
||||
async with decode_client_info["client"].stream(
|
||||
"POST", endpoint, json=req_data, headers=headers
|
||||
) as response:
|
||||
response.raise_for_status()
|
||||
async for chunk in response.aiter_bytes():
|
||||
yield chunk
|
||||
|
||||
|
||||
async def _handle_completions(api: str, request: Request):
|
||||
if not app.state.ready.is_set():
|
||||
raise HTTPException(status_code=503, detail="Service Unavailable")
|
||||
|
||||
try:
|
||||
req_data = await request.json()
|
||||
request_id = str(uuid.uuid4())
|
||||
|
||||
# Get the next prefill client in round-robin fashion
|
||||
prefill_client_info, prefill_dp_rank = get_next_client(request.app, "prefill")
|
||||
|
||||
# Send request to prefill service
|
||||
asyncio.create_task(
|
||||
send_request_to_service(
|
||||
prefill_client_info, prefill_dp_rank, api, req_data, request_id
|
||||
)
|
||||
)
|
||||
|
||||
decode_client_info = get_next_client(request.app, "decode")
|
||||
|
||||
# Stream response from decode service
|
||||
async def generate_stream():
|
||||
async for chunk in stream_service_response(
|
||||
prefill_client_info,
|
||||
prefill_dp_rank,
|
||||
decode_client_info,
|
||||
api,
|
||||
req_data,
|
||||
request_id=request_id,
|
||||
):
|
||||
yield chunk
|
||||
|
||||
return StreamingResponse(generate_stream(), media_type="application/json")
|
||||
|
||||
except Exception as e:
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
exc_info = sys.exc_info()
|
||||
print(f"Error occurred in disagg prefill proxy server - {api} endpoint")
|
||||
print(e)
|
||||
print("".join(traceback.format_exception(*exc_info)))
|
||||
raise
|
||||
|
||||
|
||||
@app.post("/v1/completions")
|
||||
async def handle_completions(request: Request):
|
||||
return await _handle_completions("/v1/completions", request)
|
||||
|
||||
|
||||
@app.post("/v1/chat/completions")
|
||||
async def handle_chat_completions(request: Request):
|
||||
return await _handle_completions("/v1/chat/completions", request)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
global global_args
|
||||
global_args = parse_args()
|
||||
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(app, host=global_args.host, port=global_args.port)
|
||||
@@ -0,0 +1,222 @@
|
||||
#!/bin/bash
|
||||
|
||||
# =============================================================================
|
||||
# vLLM Disaggregated Serving Script for Mooncake Connector
|
||||
# =============================================================================
|
||||
# This script demonstrates disaggregated prefill and decode serving using
|
||||
# Mooncake Connector.
|
||||
#
|
||||
# Configuration can be customized via environment variables:
|
||||
# MODEL: Model to serve
|
||||
# PREFILL_GPUS: Comma-separated GPU IDs for prefill servers
|
||||
# DECODE_GPUS: Comma-separated GPU IDs for decode servers
|
||||
# PREFILL_PORTS: Comma-separated ports for prefill servers
|
||||
# BOOTSTRAP_PORTS: Comma-separated bootstrap server ports for the prefill servers
|
||||
# DECODE_PORTS: Comma-separated ports for decode servers
|
||||
# PROXY_PORT: Proxy server port used to set up the P/D disaggregated connection.
|
||||
# TIMEOUT_SECONDS: Server startup timeout
|
||||
# =============================================================================
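# A hypothetical invocation overriding the defaults below, e.g. to run two prefill
# and two decode servers (the script filename is illustrative; adjust to the actual name):
#   MODEL=Qwen/Qwen2.5-7B-Instruct PREFILL_GPUS=0,1 DECODE_GPUS=2,3 \
#   PREFILL_PORTS=8010,8011 BOOTSTRAP_PORTS=8998,8999 DECODE_PORTS=8020,8021 \
#   bash disagg_example_mooncake.sh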
|
||||
|
||||
# Configuration - can be overridden via environment variables
|
||||
MODEL=${MODEL:-Qwen/Qwen2.5-7B-Instruct}
|
||||
TIMEOUT_SECONDS=${TIMEOUT_SECONDS:-1200}
|
||||
PROXY_PORT=${PROXY_PORT:-8000}
|
||||
|
||||
PREFILL_GPUS=${PREFILL_GPUS:-0}
|
||||
DECODE_GPUS=${DECODE_GPUS:-1}
|
||||
PREFILL_PORTS=${PREFILL_PORTS:-8010}
|
||||
BOOTSTRAP_PORTS=${BOOTSTRAP_PORTS:-8998}
|
||||
DECODE_PORTS=${DECODE_PORTS:-8020}
|
||||
|
||||
echo "Warning: Mooncake Connector support for vLLM v1 is experimental and subject to change."
|
||||
echo ""
|
||||
echo "Architecture Configuration:"
|
||||
echo " Model: $MODEL"
|
||||
echo " Prefill GPUs: $PREFILL_GPUS, Ports: $PREFILL_PORTS, Bootstrap Port:$BOOTSTRAP_PORTS"
|
||||
echo " Decode GPUs: $DECODE_GPUS, Ports: $DECODE_PORTS"
|
||||
echo " Proxy Port: $PROXY_PORT"
|
||||
echo " Timeout: ${TIMEOUT_SECONDS}s"
|
||||
echo ""
|
||||
|
||||
PIDS=()
|
||||
|
||||
# Switch to the directory of the current script
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")"
|
||||
|
||||
check_required_files() {
|
||||
local files=("mooncake_connector_proxy.py")
|
||||
for file in "${files[@]}"; do
|
||||
if [[ ! -f "$file" ]]; then
|
||||
echo "Required file $file not found in $(pwd)"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
check_hf_token() {
|
||||
if [ -z "$HF_TOKEN" ]; then
|
||||
echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
|
||||
echo "Example: export HF_TOKEN=your_token_here"
|
||||
exit 1
|
||||
fi
|
||||
if [[ "$HF_TOKEN" != hf_* ]]; then
|
||||
echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
|
||||
exit 1
|
||||
fi
|
||||
echo "HF_TOKEN is set and valid."
|
||||
}
|
||||
|
||||
check_num_gpus() {
|
||||
# Check that at least 2 GPUs are available via nvidia-smi
|
||||
num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
|
||||
if [ "$num_gpus" -lt 2 ]; then
|
||||
echo "You need at least 2 GPUs to run disaggregated prefill."
|
||||
exit 1
|
||||
else
|
||||
echo "Found $num_gpus GPUs."
|
||||
fi
|
||||
}
|
||||
|
||||
ensure_python_library_installed() {
|
||||
echo "Checking if $1 is installed..."
|
||||
if ! python3 -c "import $1" > /dev/null 2>&1; then
|
||||
echo "$1 is not installed. Please install it via pip install $1."
|
||||
exit 1
|
||||
else
|
||||
echo "$1 is installed."
|
||||
fi
|
||||
}
|
||||
|
||||
cleanup() {
|
||||
echo "Stopping everything…"
|
||||
trap - INT TERM # prevent re-entrancy
|
||||
pkill -9 -f "mooncake_connector_proxy.py"
|
||||
kill -- -$$ # negative PID == "this whole process-group"
|
||||
wait # reap children so we don't leave zombies
|
||||
exit 0
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
local timeout_seconds=$TIMEOUT_SECONDS
|
||||
local start_time=$(date +%s)
|
||||
|
||||
echo "Waiting for server on port $port..."
|
||||
|
||||
while true; do
|
||||
if curl -s "localhost:${port}/v1/completions" > /dev/null; then
|
||||
echo "Server on port $port is ready."
|
||||
return 0
|
||||
fi
|
||||
|
||||
local now=$(date +%s)
|
||||
if (( now - start_time >= timeout_seconds )); then
|
||||
echo "Timeout waiting for server on port $port"
|
||||
return 1
|
||||
fi
|
||||
|
||||
sleep 1
|
||||
done
|
||||
}
|
||||
|
||||
main() {
|
||||
check_required_files
|
||||
check_hf_token
|
||||
check_num_gpus
|
||||
ensure_python_library_installed vllm
|
||||
ensure_python_library_installed mooncake.engine
|
||||
|
||||
trap cleanup INT
|
||||
trap cleanup USR1
|
||||
trap cleanup TERM
|
||||
|
||||
echo "Launching disaggregated serving components..."
|
||||
echo "Please check the log files for detailed output:"
|
||||
echo " - prefill*.log: Prefill server logs"
|
||||
echo " - decode*.log: Decode server logs"
|
||||
echo " - proxy.log: Proxy server log"
|
||||
|
||||
# Parse GPU and port arrays
|
||||
IFS=',' read -ra PREFILL_GPU_ARRAY <<< "$PREFILL_GPUS"
|
||||
IFS=',' read -ra DECODE_GPU_ARRAY <<< "$DECODE_GPUS"
|
||||
IFS=',' read -ra PREFILL_PORT_ARRAY <<< "$PREFILL_PORTS"
|
||||
IFS=',' read -ra BOOTSTRAP_PORT_ARRAY <<< "$BOOTSTRAP_PORTS"
|
||||
IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"
|
||||
|
||||
proxy_param=""
|
||||
|
||||
# =============================================================================
|
||||
# Launch Prefill Servers (X Producers)
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "Starting ${#PREFILL_GPU_ARRAY[@]} prefill server(s)..."
|
||||
for i in "${!PREFILL_GPU_ARRAY[@]}"; do
|
||||
local gpu_id=${PREFILL_GPU_ARRAY[$i]}
|
||||
local port=${PREFILL_PORT_ARRAY[$i]}
|
||||
local bootstrap_port=${BOOTSTRAP_PORT_ARRAY[$i]}
|
||||
|
||||
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, Bootstrap Port $bootstrap_port"
|
||||
VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
|
||||
--port $port \
|
||||
--kv-transfer-config \
|
||||
"{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_producer\"}" > prefill$((i+1)).log 2>&1 &
|
||||
PIDS+=($!)
|
||||
proxy_param="${proxy_param} --prefill http://0.0.0.0:${port} $bootstrap_port"
|
||||
done
|
||||
|
||||
# =============================================================================
|
||||
# Launch Decode Servers (Y Decoders)
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "Starting ${#DECODE_GPU_ARRAY[@]} decode server(s)..."
|
||||
for i in "${!DECODE_GPU_ARRAY[@]}"; do
|
||||
local gpu_id=${DECODE_GPU_ARRAY[$i]}
|
||||
local port=${DECODE_PORT_ARRAY[$i]}
|
||||
|
||||
echo " Decode server $((i+1)): GPU $gpu_id, Port $port"
|
||||
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
|
||||
--port $port \
|
||||
--kv-transfer-config \
|
||||
"{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_consumer\"}" > decode$((i+1)).log 2>&1 &
|
||||
PIDS+=($!)
|
||||
proxy_param="${proxy_param} --decode http://0.0.0.0:${port}"
|
||||
done
|
||||
|
||||
# =============================================================================
|
||||
# Launch Proxy Server
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "Starting proxy server on port $PROXY_PORT..."
|
||||
python3 mooncake_connector_proxy.py $proxy_param --port $PROXY_PORT > proxy.log 2>&1 &
|
||||
PIDS+=($!)
|
||||
|
||||
# =============================================================================
|
||||
# Wait for All Servers to Start
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "Waiting for all servers to start..."
|
||||
for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
|
||||
if ! wait_for_server $port; then
|
||||
echo "Failed to start server on port $port"
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "All servers are up. Starting benchmark..."
|
||||
|
||||
# =============================================================================
|
||||
# Run Benchmark
|
||||
# =============================================================================
|
||||
vllm bench serve --port $PROXY_PORT --seed $(date +%s) \
|
||||
--backend vllm --model $MODEL \
|
||||
--dataset-name random --random-input-len 7500 --random-output-len 200 \
|
||||
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
|
||||
|
||||
echo "Benchmarking done. Cleaning up..."
|
||||
|
||||
cleanup
|
||||
}
|
||||
|
||||
main
|
||||
151
examples/online_serving/openai_realtime_client.py
Normal file
@@ -0,0 +1,151 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This script demonstrates how to use the vLLM Realtime WebSocket API to perform
|
||||
audio transcription by uploading an audio file.
|
||||
|
||||
Before running this script, you must start the vLLM server with a realtime-capable
|
||||
model, for example:
|
||||
|
||||
vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 --enforce-eager
|
||||
|
||||
Requirements:
|
||||
- vllm with audio support
|
||||
- websockets
|
||||
- librosa
|
||||
- numpy
|
||||
|
||||
The script:
|
||||
1. Connects to the Realtime WebSocket endpoint
|
||||
2. Converts an audio file to PCM16 @ 16kHz
|
||||
3. Sends audio chunks to the server
|
||||
4. Receives and prints transcription as it streams
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import websockets
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
|
||||
def audio_to_pcm16_base64(audio_path: str) -> str:
|
||||
"""
|
||||
Load an audio file and convert it to base64-encoded PCM16 @ 16kHz.
|
||||
"""
|
||||
# Load audio and resample to 16kHz mono
|
||||
audio, _ = librosa.load(audio_path, sr=16000, mono=True)
|
||||
# Convert to PCM16
|
||||
pcm16 = (audio * 32767).astype(np.int16)
|
||||
# Encode as base64
|
||||
return base64.b64encode(pcm16.tobytes()).decode("utf-8")
|
||||
|
||||
|
||||
async def realtime_transcribe(audio_path: str, host: str, port: int, model: str):
|
||||
"""
|
||||
Connect to the Realtime API and transcribe an audio file.
|
||||
"""
|
||||
uri = f"ws://{host}:{port}/v1/realtime"
|
||||
|
||||
async with websockets.connect(uri) as ws:
|
||||
# Wait for session.created
|
||||
response = json.loads(await ws.recv())
|
||||
if response["type"] == "session.created":
|
||||
print(f"Session created: {response['id']}")
|
||||
else:
|
||||
print(f"Unexpected response: {response}")
|
||||
return
|
||||
|
||||
# Validate model
|
||||
await ws.send(json.dumps({"type": "session.update", "model": model}))
|
||||
|
||||
# Signal ready to start
|
||||
await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
|
||||
|
||||
# Convert audio file to base64 PCM16
|
||||
print(f"Loading audio from: {audio_path}")
|
||||
audio_base64 = audio_to_pcm16_base64(audio_path)
|
||||
|
||||
# Send audio in chunks (4KB of raw audio ≈ 5.5KB base64)
|
||||
chunk_size = 4096
|
||||
audio_bytes = base64.b64decode(audio_base64)
|
||||
total_chunks = (len(audio_bytes) + chunk_size - 1) // chunk_size
|
||||
|
||||
print(f"Sending {total_chunks} audio chunks...")
|
||||
for i in range(0, len(audio_bytes), chunk_size):
|
||||
chunk = audio_bytes[i : i + chunk_size]
|
||||
await ws.send(
|
||||
json.dumps(
|
||||
{
|
||||
"type": "input_audio_buffer.append",
|
||||
"audio": base64.b64encode(chunk).decode("utf-8"),
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
# Signal all audio is sent
|
||||
await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True}))
|
||||
print("Audio sent. Waiting for transcription...\n")
|
||||
|
||||
# Receive transcription
|
||||
print("Transcription: ", end="", flush=True)
|
||||
while True:
|
||||
response = json.loads(await ws.recv())
|
||||
if response["type"] == "transcription.delta":
|
||||
print(response["delta"], end="", flush=True)
|
||||
elif response["type"] == "transcription.done":
|
||||
print(f"\n\nFinal transcription: {response['text']}")
|
||||
if response.get("usage"):
|
||||
print(f"Usage: {response['usage']}")
|
||||
break
|
||||
elif response["type"] == "error":
|
||||
print(f"\nError: {response['error']}")
|
||||
break
|
||||
|
||||
|
||||
def main(args):
|
||||
if args.audio_path:
|
||||
audio_path = args.audio_path
|
||||
else:
|
||||
# Use default audio asset
|
||||
audio_path = str(AudioAsset("mary_had_lamb").get_local_path())
|
||||
print(f"No audio path provided, using default: {audio_path}")
|
||||
|
||||
asyncio.run(realtime_transcribe(audio_path, args.host, args.port, args.model))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Realtime WebSocket Transcription Client"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
default="mistralai/Voxtral-Mini-4B-Realtime-2602",
|
||||
help="Model that is served and should be pinged.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--audio_path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the audio file to transcribe.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--host",
|
||||
type=str,
|
||||
default="localhost",
|
||||
help="vLLM server host (default: localhost)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--port",
|
||||
type=int,
|
||||
default=8000,
|
||||
help="vLLM server port (default: 8000)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
183
examples/online_serving/openai_realtime_microphone_client.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Minimal Gradio demo for real-time speech transcription using the vLLM Realtime API.
|
||||
|
||||
Start the vLLM server first:
|
||||
|
||||
vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 --enforce-eager
|
||||
|
||||
Then run this script:
|
||||
|
||||
python openai_realtime_microphone_client.py --host localhost --port 8000
|
||||
|
||||
Use --share to create a public Gradio link.
|
||||
|
||||
Requirements: websockets, numpy, gradio
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import queue
|
||||
import threading
|
||||
|
||||
import gradio as gr
|
||||
import numpy as np
|
||||
import websockets
|
||||
|
||||
SAMPLE_RATE = 16_000
|
||||
|
||||
# Global state
|
||||
audio_queue: queue.Queue = queue.Queue()
|
||||
transcription_text = ""
|
||||
is_running = False
|
||||
ws_url = ""
|
||||
model = ""
|
||||
|
||||
|
||||
async def websocket_handler():
|
||||
"""Connect to WebSocket and handle audio streaming + transcription."""
|
||||
global transcription_text, is_running
|
||||
|
||||
async with websockets.connect(ws_url) as ws:
|
||||
# Wait for session.created
|
||||
await ws.recv()
|
||||
|
||||
# Validate model
|
||||
await ws.send(json.dumps({"type": "session.update", "model": model}))
|
||||
|
||||
# Signal ready
|
||||
await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
|
||||
|
||||
async def send_audio():
|
||||
while is_running:
|
||||
try:
|
||||
chunk = await asyncio.get_event_loop().run_in_executor(
|
||||
None, lambda: audio_queue.get(timeout=0.1)
|
||||
)
|
||||
await ws.send(
|
||||
json.dumps(
|
||||
{"type": "input_audio_buffer.append", "audio": chunk}
|
||||
)
|
||||
)
|
||||
except queue.Empty:
|
||||
continue
|
||||
|
||||
async def receive_transcription():
|
||||
global transcription_text
|
||||
async for message in ws:
|
||||
data = json.loads(message)
|
||||
if data.get("type") == "transcription.delta":
|
||||
transcription_text += data["delta"]
|
||||
|
||||
await asyncio.gather(send_audio(), receive_transcription())
|
||||
|
||||
|
||||
def start_websocket():
|
||||
"""Start WebSocket connection in background thread."""
|
||||
global is_running
|
||||
is_running = True
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
try:
|
||||
loop.run_until_complete(websocket_handler())
|
||||
except Exception as e:
|
||||
print(f"WebSocket error: {e}")
|
||||
|
||||
|
||||
def start_recording():
|
||||
"""Start the transcription service."""
|
||||
global transcription_text
|
||||
transcription_text = ""
|
||||
thread = threading.Thread(target=start_websocket, daemon=True)
|
||||
thread.start()
|
||||
return gr.update(interactive=False), gr.update(interactive=True), ""
|
||||
|
||||
|
||||
def stop_recording():
|
||||
"""Stop the transcription service."""
|
||||
global is_running
|
||||
is_running = False
|
||||
return gr.update(interactive=True), gr.update(interactive=False), transcription_text
|
||||
|
||||
|
||||
def process_audio(audio):
|
||||
"""Process incoming audio and queue for streaming."""
|
||||
global transcription_text
|
||||
|
||||
if audio is None or not is_running:
|
||||
return transcription_text
|
||||
|
||||
sample_rate, audio_data = audio
|
||||
|
||||
# Convert to mono if stereo
|
||||
if len(audio_data.shape) > 1:
|
||||
audio_data = audio_data.mean(axis=1)
|
||||
|
||||
# Normalize to float
|
||||
if audio_data.dtype == np.int16:
|
||||
audio_float = audio_data.astype(np.float32) / 32767.0
|
||||
else:
|
||||
audio_float = audio_data.astype(np.float32)
|
||||
|
||||
# Resample to 16kHz if needed
|
||||
if sample_rate != SAMPLE_RATE:
|
||||
num_samples = int(len(audio_float) * SAMPLE_RATE / sample_rate)
|
||||
audio_float = np.interp(
|
||||
np.linspace(0, len(audio_float) - 1, num_samples),
|
||||
np.arange(len(audio_float)),
|
||||
audio_float,
|
||||
)
|
||||
|
||||
# Convert to PCM16 and base64 encode
|
||||
pcm16 = (audio_float * 32767).astype(np.int16)
|
||||
b64_chunk = base64.b64encode(pcm16.tobytes()).decode("utf-8")
|
||||
audio_queue.put(b64_chunk)
|
||||
|
||||
return transcription_text
|
||||
|
||||
|
||||
# Gradio interface
|
||||
with gr.Blocks(title="Real-time Speech Transcription") as demo:
|
||||
gr.Markdown("# Real-time Speech Transcription")
|
||||
gr.Markdown("Click **Start** and speak into your microphone.")
|
||||
|
||||
with gr.Row():
|
||||
start_btn = gr.Button("Start", variant="primary")
|
||||
stop_btn = gr.Button("Stop", variant="stop", interactive=False)
|
||||
|
||||
audio_input = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
|
||||
transcription_output = gr.Textbox(label="Transcription", lines=5)
|
||||
|
||||
start_btn.click(
|
||||
start_recording, outputs=[start_btn, stop_btn, transcription_output]
|
||||
)
|
||||
stop_btn.click(stop_recording, outputs=[start_btn, stop_btn, transcription_output])
|
||||
audio_input.stream(
|
||||
process_audio, inputs=[audio_input], outputs=[transcription_output]
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Realtime WebSocket Transcription with Gradio"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
default="mistralai/Voxtral-Mini-4B-Realtime-2602",
|
||||
help="Model that is served and should be pinged.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--host", type=str, default="localhost", help="vLLM server host"
|
||||
)
|
||||
parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
|
||||
parser.add_argument(
|
||||
"--share", action="store_true", help="Create public Gradio link"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
ws_url = f"ws://{args.host}:{args.port}/v1/realtime"
|
||||
model = args.model
|
||||
demo.launch(share=args.share)
|
||||
@@ -49,8 +49,8 @@ def main():
|
||||
# Refer to the HuggingFace repo for the correct format to use
|
||||
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
|
||||
token_ids = tokenizer.apply_chat_template(
|
||||
chat, add_generation_prompt=True, return_tensors="pt"
|
||||
)
|
||||
chat, add_generation_prompt=True, return_tensors="pt", return_dict=True
|
||||
).input_ids
|
||||
|
||||
embedding_layer = transformers_model.get_input_embeddings()
|
||||
prompt_embeds = embedding_layer(token_ids).squeeze(0)
|
||||
@@ -60,9 +60,7 @@ def main():
|
||||
|
||||
completion = client.completions.create(
|
||||
model=model_name,
|
||||
# NOTE: The OpenAI client does not allow `None` as an input to
|
||||
# `prompt`. Use an empty string if you have no text prompts.
|
||||
prompt="",
|
||||
prompt=None,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
# NOTE: The OpenAI client allows passing in extra JSON body via the
|
||||
|
||||
@@ -27,7 +27,8 @@ def main(client):
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
enable_thinking=False,
|
||||
)
|
||||
return_dict=True,
|
||||
).input_ids
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
"token_ids": token_ids,
|
||||
|
||||
@@ -12,11 +12,7 @@ import base64
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from vllm.utils.serial_utils import (
|
||||
EMBED_DTYPE_TO_TORCH_DTYPE,
|
||||
ENDIANNESS,
|
||||
binary2tensor,
|
||||
)
|
||||
from vllm.utils.serial_utils import EMBED_DTYPES, ENDIANNESS, binary2tensor
|
||||
|
||||
|
||||
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
|
||||
@@ -45,7 +41,7 @@ def main(args):
|
||||
] * 2
|
||||
|
||||
# The OpenAI client does not support the embed_dtype and endianness parameters.
|
||||
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
|
||||
for embed_dtype in EMBED_DTYPES:
|
||||
for endianness in ENDIANNESS:
|
||||
prompt = {
|
||||
"model": model,
|
||||
|
||||
@@ -12,13 +12,12 @@ import json
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from vllm.utils.serial_utils import (
|
||||
EMBED_DTYPE_TO_TORCH_DTYPE,
|
||||
ENDIANNESS,
|
||||
from vllm.entrypoints.pooling.utils import (
|
||||
MetadataItem,
|
||||
build_metadata_items,
|
||||
decode_pooling_output,
|
||||
)
|
||||
from vllm.utils.serial_utils import EMBED_DTYPES, ENDIANNESS
|
||||
|
||||
|
||||
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
|
||||
@@ -51,7 +50,7 @@ def main(args):
|
||||
|
||||
# The OpenAI client does not support the bytes encoding_format.
|
||||
# The OpenAI client does not support the embed_dtype and endianness parameters.
|
||||
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
|
||||
for embed_dtype in EMBED_DTYPES:
|
||||
for endianness in ENDIANNESS:
|
||||
prompt = {
|
||||
"model": model,
|
||||
@@ -74,7 +73,7 @@ def main(args):
|
||||
# The vllm server always sorts the returned embeddings in the order of input. So
|
||||
# returning metadata is not necessary. You can set encoding_format to bytes_only
|
||||
# to let the server not return metadata.
|
||||
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
|
||||
for embed_dtype in EMBED_DTYPES:
|
||||
for endianness in ENDIANNESS:
|
||||
prompt = {
|
||||
"model": model,
|
||||
|
||||
@@ -12,6 +12,8 @@ on HuggingFace model repository.
|
||||
import argparse
|
||||
from dataclasses import asdict
|
||||
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
|
||||
@@ -20,17 +22,42 @@ text = "A cat standing in the snow."
|
||||
multi_modal_data = {"image": fetch_image(image_url)}
|
||||
|
||||
|
||||
def print_embeddings(embeds):
|
||||
def print_embeddings(embeds: list[float]):
|
||||
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
|
||||
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
|
||||
|
||||
|
||||
def run_qwen3_vl():
|
||||
try:
|
||||
from qwen_vl_utils import smart_resize
|
||||
except ModuleNotFoundError:
|
||||
print(
|
||||
"WARNING: `qwen-vl-utils` not installed, input images will not "
|
||||
"be automatically resized. This can cause different results "
|
||||
"comparing with HF repo's example. "
|
||||
"You can enable this functionality by `pip install qwen-vl-utils`."
|
||||
)
|
||||
smart_resize = None
|
||||
|
||||
if smart_resize is not None:
|
||||
|
||||
def post_process_image(image: Image) -> Image:
|
||||
width, height = image.size
|
||||
resized_height, resized_width = smart_resize(
|
||||
height,
|
||||
width,
|
||||
factor=32,
|
||||
)
|
||||
return image.resize((resized_width, resized_height))
|
||||
|
||||
multi_modal_data["image"] = post_process_image(multi_modal_data["image"])
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="Qwen/Qwen3-VL-Embedding-2B",
|
||||
runner="pooling",
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
|
||||
)
|
||||
default_instruction = "Represent the user's input."
|
||||
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
|
||||
57
examples/pooling/score/colbert_rerank_online.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Example of using ColBERT late interaction model for reranking.
|
||||
|
||||
ColBERT (Contextualized Late Interaction over BERT) uses per-token embeddings
|
||||
and MaxSim scoring for document reranking, providing better accuracy than
|
||||
single-vector models while being more efficient than cross-encoders.
|
||||
|
||||
Start the server with:
|
||||
vllm serve answerdotai/answerai-colbert-small-v1
|
||||
|
||||
Then run this script:
|
||||
python colbert_rerank_online.py
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import requests
|
||||
|
||||
url = "http://127.0.0.1:8000/rerank"
|
||||
|
||||
headers = {"accept": "application/json", "Content-Type": "application/json"}
|
||||
|
||||
data = {
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"query": "What is machine learning?",
|
||||
"documents": [
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Python is a programming language.",
|
||||
"Deep learning uses neural networks for complex tasks.",
|
||||
"The weather today is sunny.",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def main():
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
|
||||
if response.status_code == 200:
|
||||
print("ColBERT Rerank Request successful!")
|
||||
result = response.json()
|
||||
print(json.dumps(result, indent=2))
|
||||
|
||||
# Show ranked results
|
||||
print("\nRanked documents (most relevant first):")
|
||||
for item in result["results"]:
|
||||
doc_idx = item["index"]
|
||||
score = item["relevance_score"]
|
||||
print(f" Score {score:.4f}: {data['documents'][doc_idx]}")
|
||||
else:
|
||||
print(f"Request failed with status code: {response.status_code}")
|
||||
print(response.text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -89,6 +89,29 @@ def main(args):
|
||||
response = requests.post(rerank_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: text + image url")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"query": query,
|
||||
"documents": {"content": [documents[0], documents[1]]},
|
||||
}
|
||||
response = requests.post(rerank_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: list")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"query": query,
|
||||
"documents": [
|
||||
document,
|
||||
{"content": [documents[0]]},
|
||||
{"content": [documents[1]]},
|
||||
{"content": [documents[0], documents[1]]},
|
||||
],
|
||||
}
|
||||
response = requests.post(rerank_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
|
||||
@@ -92,6 +92,44 @@ def main(args):
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: text + image url")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"queries": query,
|
||||
"documents": {"content": [documents[0], documents[1]]},
|
||||
}
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: list")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"queries": query,
|
||||
"documents": [
|
||||
document,
|
||||
{"content": [documents[0]]},
|
||||
{"content": [documents[1]]},
|
||||
{"content": [documents[0], documents[1]]},
|
||||
],
|
||||
}
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: list & Document: list")
|
||||
data = [
|
||||
document,
|
||||
{"content": [documents[0]]},
|
||||
{"content": [documents[1]]},
|
||||
{"content": [documents[0], documents[1]]},
|
||||
]
|
||||
prompt = {
|
||||
"model": model,
|
||||
"queries": data,
|
||||
"documents": data,
|
||||
}
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
|
||||
@@ -11,7 +11,7 @@ transformers >= 4.56.0, < 5
|
||||
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
|
||||
protobuf # Required by LlamaTokenizer, gRPC.
|
||||
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
||||
aiohttp
|
||||
aiohttp >= 3.13.3
|
||||
openai >= 1.99.1 # For Responses API with reasoning content
|
||||
pydantic >= 2.12.0
|
||||
prometheus_client >= 0.18.0
|
||||
|
||||
@@ -4,10 +4,10 @@
|
||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
|
||||
# Dependencies for NVIDIA GPUs
|
||||
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
|
||||
ray[cgraph]>=2.48.0
|
||||
torch==2.9.1
|
||||
torchaudio==2.9.1
|
||||
# These must be updated alongside torch
|
||||
torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
# FlashInfer should be updated together with the Dockerfile
|
||||
flashinfer-python==0.6.1
|
||||
flashinfer-python==0.6.2
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
lmcache
|
||||
lmcache >= 0.3.9
|
||||
nixl >= 0.7.1 # Required for disaggregated prefill
|
||||
|
||||
@@ -14,7 +14,7 @@ pytest-shard==0.1.2
|
||||
# Async/HTTP dependencies
|
||||
anyio==4.6.2.post1
|
||||
# via httpx, starlette
|
||||
aiohttp==3.13.0
|
||||
aiohttp==3.13.3
|
||||
# via gpt-oss
|
||||
httpx==0.27.2
|
||||
# HTTP testing
|
||||
@@ -94,3 +94,5 @@ timm==1.0.17
|
||||
albumentations==1.4.6
|
||||
# Pin transformers version
|
||||
transformers==4.57.3
|
||||
# Pin HF Hub version
|
||||
huggingface-hub==0.36.1
|
||||
|
||||
@@ -5,7 +5,7 @@ numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
|
||||
# Dependencies for AMD GPUs
|
||||
datasets
|
||||
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
|
||||
ray[cgraph]>=2.48.0
|
||||
peft
|
||||
pytest-asyncio
|
||||
tensorizer==2.10.1
|
||||
|
||||
@@ -12,7 +12,7 @@ affine==2.4.0
|
||||
# via rasterio
|
||||
aiohappyeyeballs==2.6.1
|
||||
# via aiohttp
|
||||
aiohttp==3.13.0
|
||||
aiohttp==3.13.3
|
||||
# via
|
||||
# aiohttp-cors
|
||||
# datasets
|
||||
@@ -332,7 +332,7 @@ httpx==0.27.2
|
||||
# -r requirements/test.in
|
||||
# perceptron
|
||||
# schemathesis
|
||||
huggingface-hub==0.34.3
|
||||
huggingface-hub==0.36.1
|
||||
# via
|
||||
# accelerate
|
||||
# datasets
|
||||
|
||||
@@ -11,8 +11,8 @@ jinja2>=3.1.6
|
||||
datasets # for benchmark scripts
|
||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
--extra-index-url=https://download.pytorch.org/whl/xpu
|
||||
torch==2.9.0+xpu
|
||||
torch==2.10.0+xpu
|
||||
torchaudio
|
||||
torchvision
|
||||
|
||||
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.9.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
|
||||
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.1/vllm_xpu_kernels-0.1.1-cp312-cp312-linux_x86_64.whl
|
||||
@@ -1,321 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import regex as re
|
||||
|
||||
from tests.compile.fusion_test_utils import (
|
||||
CUSTOM_OPS_FP8,
|
||||
CUSTOM_OPS_QUANT_RMS_NORM,
|
||||
CUSTOM_OPS_RMS_NORM,
|
||||
MODELS,
|
||||
MODELS_FP4,
|
||||
MODELS_FP8,
|
||||
MODELS_GROUP_FP8,
|
||||
Matches,
|
||||
custom_ops_product,
|
||||
is_blackwell,
|
||||
run_model,
|
||||
)
|
||||
from tests.v1.attention.utils import AttentionBackendEnum
|
||||
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.flashinfer import has_flashinfer
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ...utils import flat_product, multi_gpu_test
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, model_kwargs, backend, matches, custom_ops",
|
||||
# Toggle RMSNorm and QuantFP8 for FP8 models
|
||||
list(
|
||||
flat_product(
|
||||
MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
|
||||
)
|
||||
)
|
||||
# Toggle RMSNorm for FP4 models and unquant models
|
||||
+ list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
|
||||
)
|
||||
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda()
|
||||
or not has_flashinfer()
|
||||
or not current_platform.has_device_capability(90),
|
||||
reason="allreduce+rmsnorm fusion requires flashinfer",
|
||||
)
|
||||
def test_tp2_attn_quant_allreduce_rmsnorm(
|
||||
model_name: str,
|
||||
model_kwargs: dict,
|
||||
backend: AttentionBackendEnum,
|
||||
matches: Matches,
|
||||
custom_ops: str,
|
||||
inductor_graph_partition: bool,
|
||||
caplog_mp_spawn,
|
||||
monkeypatch,
|
||||
):
|
||||
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("Inductor graph partition requires torch>=2.9")
|
||||
|
||||
if "fp4" in model_name.lower() and not is_blackwell():
|
||||
pytest.skip("NVFP4 quant requires Blackwell")
|
||||
|
||||
if backend == AttentionBackendEnum.FLASHINFER and not is_blackwell():
|
||||
# FlashInfer attn fusion requires Blackwell
|
||||
matches = matches._replace(attention_fusion=0)
|
||||
|
||||
custom_ops_list = custom_ops.split(",") if custom_ops else []
|
||||
|
||||
if inductor_graph_partition:
|
||||
mode = CUDAGraphMode.FULL_AND_PIECEWISE
|
||||
splitting_ops: list[str] | None = None
|
||||
else:
|
||||
mode = CUDAGraphMode.FULL_DECODE_ONLY
|
||||
splitting_ops = []
|
||||
|
||||
# Disable, compile cache to make sure custom passes run.
|
||||
# Otherwise, we can't verify fusion happened through the logs.
|
||||
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
|
||||
|
||||
# To capture subprocess logs, we need to know whether spawn or fork is used.
|
||||
# Force spawn as it is more general.
|
||||
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
|
||||
|
||||
model_kwargs["attention_config"] = {"backend": backend.name}
|
||||
|
||||
compilation_config = CompilationConfig(
|
||||
# Testing properties
|
||||
use_inductor_graph_partition=inductor_graph_partition,
|
||||
cudagraph_mode=mode,
|
||||
custom_ops=custom_ops_list,
|
||||
splitting_ops=splitting_ops,
|
||||
# Common
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
pass_config=PassConfig(
|
||||
fuse_attn_quant=True,
|
||||
eliminate_noops=True,
|
||||
fuse_allreduce_rms=True,
|
||||
),
|
||||
# Inductor caches custom passes by default as well via uuid
|
||||
inductor_compile_config={"force_disable_caches": True},
|
||||
)
|
||||
|
||||
with caplog_mp_spawn(logging.DEBUG) as log_holder:
|
||||
run_model(
|
||||
compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
|
||||
)
|
||||
log_matches = re.findall(
|
||||
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
|
||||
log_holder.text,
|
||||
)
|
||||
# 2 for each compile range
|
||||
# (global compile range can be split due to fuse_allreduce_rmsnorm)
|
||||
num_compile_ranges = len(compilation_config.get_compile_ranges())
|
||||
assert num_compile_ranges in [1, 2]
|
||||
|
||||
assert len(log_matches) == 2 * num_compile_ranges, log_holder.text
|
||||
|
||||
assert all(int(log_match) == matches.attention_fusion for log_match in log_matches)
|
||||
|
||||
log_matches = re.findall(
|
||||
r"collective_fusion.py:\d+] Replaced (\d+) patterns",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(log_matches) == 2, log_holder.text
|
||||
|
||||
assert int(log_matches[0]) == matches.allreduce_fusion
|
||||
assert int(log_matches[1]) == matches.allreduce_fusion
|
||||
|
||||
log_matches = re.findall(
|
||||
r"pass_manager.py:\d+] Skipping .*AllReduceFusionPass.* with compile range",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(log_matches) == 2 * (num_compile_ranges - 1), log_holder.text


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Toggle RMSNorm and QuantFP8 for FP8 models
    list(
        flat_product(
            MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
        )
    )
    # Toggle RMSNorm for FP4 models and unquant models
    + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="sequence parallel only tested on CUDA",
)
def test_tp2_attn_quant_async_tp(
    model_name: str,
    model_kwargs: dict,
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    if is_blackwell():
        # TODO: https://github.com/vllm-project/vllm/issues/27893
        pytest.skip("Blackwell is not supported for AsyncTP pass")

    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("Inductor graph partition requires torch>=2.9")

    if "fp4" in model_name.lower() and not is_blackwell():
        pytest.skip("NVFP4 quant requires Blackwell")

    if backend == AttentionBackendEnum.FLASHINFER:
        if not has_flashinfer():
            pytest.skip("FlashInfer backend requires flashinfer installed")
        if not is_blackwell():
            # FlashInfer attn fusion requires Blackwell
            matches = matches._replace(attention_fusion=0)

    custom_ops_list = custom_ops.split(",") if custom_ops else []

    if inductor_graph_partition:
        mode = CUDAGraphMode.FULL_AND_PIECEWISE
        splitting_ops: list[str] | None = None
    else:
        mode = CUDAGraphMode.FULL_DECODE_ONLY
        splitting_ops = []

    # Disable compile cache to make sure custom passes run.
    # Otherwise, we can't verify fusion happened through the logs.
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

    model_kwargs["attention_config"] = {"backend": backend.name}

    compilation_config = CompilationConfig(
        # Testing properties
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=mode,
        custom_ops=custom_ops_list,
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
        pass_config=PassConfig(
            fuse_attn_quant=True,
            eliminate_noops=True,
            enable_sp=True,
            fuse_gemm_comms=True,
        ),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(
            compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
        )
    log_matches = re.findall(
        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
        log_holder.text,
    )
    assert len(log_matches) == 2, log_holder.text

    assert int(log_matches[0]) == matches.attention_fusion
    assert int(log_matches[1]) == matches.attention_fusion

    log_matches = re.findall(
        r"sequence_parallelism.py:\d+] Replaced (\d+) patterns",
        log_holder.text,
    )
    assert len(log_matches) == 2, log_holder.text

    assert int(log_matches[0]) == matches.sequence_parallel
    assert int(log_matches[1]) == matches.sequence_parallel

    log_matches = re.findall(
        r"collective_fusion.py:\d+] Replaced (\d+) patterns",
        log_holder.text,
    )
    assert len(log_matches) == 2, log_holder.text

    assert int(log_matches[0]) == matches.async_tp
    assert int(log_matches[1]) == matches.async_tp


@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Test rms norm+group quant_fp8 fusion
    list[tuple[Any, ...]](flat_product(MODELS_GROUP_FP8, CUSTOM_OPS_QUANT_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
# TODO: remove skip after we fix the fusion thoroughly
@pytest.mark.skipif(is_blackwell(), reason="Temporarily disabled on Blackwell")
def test_rms_group_quant(
    model_name: str,
    model_kwargs: dict[str, Any],
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("Inductor graph partition requires torch>=2.9")

    custom_ops_list = custom_ops.split(",") if custom_ops else []

    if inductor_graph_partition:
        mode = CUDAGraphMode.FULL_AND_PIECEWISE
        splitting_ops: list[str] | None = None
    else:
        mode = CUDAGraphMode.FULL_DECODE_ONLY
        splitting_ops = []

    # Disable compile cache to make sure custom passes run.
    # Otherwise, we can't verify fusion happened through the logs.
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

    # TODO: remove this after fusion is fixed
    monkeypatch.setenv("VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES", "0")

    model_kwargs["attention_config"] = {"backend": backend.name}

    compilation_config = CompilationConfig(
        # Testing properties
        custom_ops=custom_ops_list,
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=mode,
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
        pass_config=PassConfig(
            fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
        ),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(compilation_config, model_name, **model_kwargs)

    log_matches = re.findall(
        r"\[fusion.py:\d+] Replaced (\d+) patterns",
        log_holder.text,
    )
    assert len(log_matches) == 1, log_holder.text
    assert int(log_matches[0]) == matches.rms_quant_norm_fusion

@@ -1,208 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Shared utilities for fusion tests (e.g. test_fusion_attn.py)."""

from __future__ import annotations

import itertools
from collections.abc import Iterable
from typing import Any, NamedTuple

from tests.v1.attention.utils import AttentionBackendEnum
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CUDAGraphMode
from vllm.platforms import current_platform

is_blackwell = lambda: current_platform.is_device_capability_family(100)
"""Whether we are running on Blackwell; a lot of tests depend on it."""


def has_cuda_graph_wrapper_metadata() -> bool:
    """Return True if torch._inductor.utils exposes CUDAGraphWrapperMetadata."""
    from importlib import import_module

    try:
        module = import_module("torch._inductor.utils")
        module.CUDAGraphWrapperMetadata  # noqa: B018
    except AttributeError:
        return False
    return True


class Matches(NamedTuple):
    """Expected per-pass match counts asserted against the fusion logs."""

    attention_fusion: int = 0
    allreduce_fusion: int = 0
    sequence_parallel: int = 0
    async_tp: int = 0
    rms_quant_norm_fusion: int = 0
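
# Illustrative example (not part of the original module): tests start from a
# per-model baseline and adjust it when a fusion is unavailable, e.g.
#   matches = Matches(attention_fusion=32, allreduce_fusion=65)
#   matches = matches._replace(attention_fusion=0)  # fusion unsupported on this GPU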


class ModelBackendTestCase(NamedTuple):
    model_name: str
    model_kwargs: dict[str, Any]
    backend: AttentionBackendEnum
    matches: Matches


# E2E model test cases
MODELS_FP8: list[ModelBackendTestCase] = []
MODELS_FP4: list[ModelBackendTestCase] = []
MODELS: list[ModelBackendTestCase] = []  # tp-only (unquantized)
MODELS_GROUP_FP8: list[ModelBackendTestCase] = []

if current_platform.is_cuda():
    MODELS_FP8 = [
        ModelBackendTestCase(
            # Use smaller model for L40s in CI
            model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                attention_fusion=32,
                allreduce_fusion=65,
                sequence_parallel=65,
                async_tp=128,
            ),
        ),
        ModelBackendTestCase(
            model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
            # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
            # https://github.com/vllm-project/vllm/issues/28568
            backend=AttentionBackendEnum.FLASHINFER
            if is_blackwell()
            else AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                attention_fusion=48,
                allreduce_fusion=96,
                sequence_parallel=96,
                async_tp=95,  # mlp is moe, no fusion there
            ),
        ),
    ]

    MODELS_FP4 = [
        ModelBackendTestCase(
            model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
            backend=AttentionBackendEnum.FLASHINFER,
            matches=Matches(
                attention_fusion=32,
                allreduce_fusion=65,
                sequence_parallel=65,
                async_tp=128,
            ),
        ),
    ]

    # TP only (unquantized models)
    MODELS = [
        ModelBackendTestCase(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            model_kwargs=dict(max_model_len=1024),
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                attention_fusion=0,
                allreduce_fusion=65,
                sequence_parallel=65,
                async_tp=128,
            ),
        ),
        ModelBackendTestCase(
            model_name="Qwen/Qwen3-30B-A3B",
            model_kwargs=dict(max_model_len=1024),
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                attention_fusion=0,
                allreduce_fusion=97,
                sequence_parallel=97,
                async_tp=96,  # MLP is MoE, half the fusions of dense
            ),
        ),
    ]

    MODELS_GROUP_FP8 = [
        ModelBackendTestCase(
            model_name="Qwen/Qwen3-30B-A3B-FP8",
            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                rms_quant_norm_fusion=48,
            ),
        ),
    ]

elif current_platform.is_rocm():
    MODELS_FP8 = [
        ModelBackendTestCase(
            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
            model_kwargs=dict(max_model_len=1024),
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(attention_fusion=32),
        ),
        ModelBackendTestCase(
            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
            model_kwargs=dict(max_model_len=1024),
            backend=AttentionBackendEnum.ROCM_ATTN,
            matches=Matches(attention_fusion=32),
        ),
        ModelBackendTestCase(
            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
            model_kwargs=dict(max_model_len=1024),
            backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
            matches=Matches(attention_fusion=32),
        ),
    ]


# Custom ops toggle lists for parametrization
CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"]


def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
    """Generate all combinations of custom ops for parametrization."""
    for op_list in itertools.product(*custom_ops_lists):
        yield ",".join(op_list)


def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
    """Run a model with the given compilation config for E2E fusion tests."""
    compilation_config = (
        compile_config
        if isinstance(compile_config, CompilationConfig)
        else CompilationConfig(mode=compile_config)
    )

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0)
    # Allow override from model_kwargs
    model_kwargs = {"tensor_parallel_size": 1, **model_kwargs}
    model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs}

    # No cudagraphs by default
    if compilation_config.cudagraph_mode is None:
        compilation_config.cudagraph_mode = CUDAGraphMode.NONE
    llm = LLM(
        model=model,
        compilation_config=compilation_config,
        **model_kwargs,
    )
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    # Get the compile ranges split points after vllm config post init
    # in order to compute compile ranges correctly
    compilation_config.compile_ranges_split_points = (
        llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points
    )
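
# Illustrative usage (mirrors the calls in the tests above):
#   run_model(compilation_config, model_name, tensor_parallel_size=2, **model_kwargs)
# An int may also be passed instead of a CompilationConfig, in which case it is
# treated as the compilation mode.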