Compare commits
225 Commits
v0.17.0rc0
...
v0.17.1rc0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f83b933b84 | ||
|
|
82f3f30e26 | ||
|
|
9095cbbfb6 | ||
|
|
721ae79f50 | ||
|
|
aefc59f088 | ||
|
|
d88f28da05 | ||
|
|
106ff69c4e | ||
|
|
ca5fb4bbd8 | ||
|
|
cf88b23749 | ||
|
|
a3189a08b0 | ||
|
|
409c4e632d | ||
|
|
8850738b70 | ||
|
|
234860399b | ||
|
|
c88510083b | ||
|
|
4ff8c3c8f9 | ||
|
|
507ddbe992 | ||
|
|
ddbb0d230a | ||
|
|
9efc3bdcd6 | ||
|
|
156e33553c | ||
|
|
d0cd736caa | ||
|
|
195c997203 | ||
|
|
04b67d8f62 | ||
|
|
7279374f91 | ||
|
|
006aea17d7 | ||
|
|
0836be3b03 | ||
|
|
4e95ec111c | ||
|
|
179547d62c | ||
|
|
f85b4eda3a | ||
|
|
2a194ddd72 | ||
|
|
203a7f27da | ||
|
|
483463f735 | ||
|
|
4e571ce643 | ||
|
|
4ff9b045fe | ||
|
|
3fd03f1ec2 | ||
|
|
10a5f4d53d | ||
|
|
fe0c085c28 | ||
|
|
8d6b3d5dda | ||
|
|
4b87ffbefb | ||
|
|
fa028207aa | ||
|
|
d460a18fc6 | ||
|
|
6e956d9eca | ||
|
|
1e0f917b34 | ||
|
|
c174d54f86 | ||
|
|
55d27cca55 | ||
|
|
580864d81e | ||
|
|
2b28b9b269 | ||
|
|
70485a11bd | ||
|
|
74a9f54cdb | ||
|
|
00c4cb5606 | ||
|
|
941e52c298 | ||
|
|
be292b7c14 | ||
|
|
77a73458e3 | ||
|
|
5578f2a4d3 | ||
|
|
3ec2115015 | ||
|
|
b0906d8b02 | ||
|
|
aaf5fa9abf | ||
|
|
f96c3ab08c | ||
|
|
dc6b578466 | ||
|
|
1bc9c77f6d | ||
|
|
65a4da1504 | ||
|
|
217f27598d | ||
|
|
fff3711a24 | ||
|
|
c4d859c274 | ||
|
|
747431044d | ||
|
|
d62856b928 | ||
|
|
bd2659a566 | ||
|
|
90512b2e8b | ||
|
|
dcf8862fd4 | ||
|
|
43aa389231 | ||
|
|
384425f84e | ||
|
|
a0f44bb616 | ||
|
|
fde4771bbd | ||
|
|
e5ff140216 | ||
|
|
0a6a3a1290 | ||
|
|
4497431df6 | ||
|
|
b7332b058c | ||
|
|
40077ea3de | ||
|
|
5d6aae4577 | ||
|
|
63298ee173 | ||
|
|
2dde535df1 | ||
|
|
379689d533 | ||
|
|
a6be75dbd2 | ||
|
|
ee54f9cdb9 | ||
|
|
fc4657756f | ||
|
|
eebd14651f | ||
|
|
ebb9cc5f2b | ||
|
|
85f50eb41f | ||
|
|
5261223c2d | ||
|
|
00b814ba5a | ||
|
|
ee8a29511f | ||
|
|
755356b3d1 | ||
|
|
58928475e4 | ||
|
|
1a9718085c | ||
|
|
7eb524e64c | ||
|
|
c7f32e08c2 | ||
|
|
b354686524 | ||
|
|
6a18d8789b | ||
|
|
24a03915f5 | ||
|
|
b5e34e1fca | ||
|
|
ce8546a12b | ||
|
|
c188749bcd | ||
|
|
225d1090a0 | ||
|
|
f3c6c9c9d7 | ||
|
|
26bd43b52d | ||
|
|
6b625a8807 | ||
|
|
54756b6109 | ||
|
|
39f9ea0da4 | ||
|
|
e4ae148a78 | ||
|
|
1d0c0d209c | ||
|
|
fcb73f306c | ||
|
|
e2090bf3af | ||
|
|
2a00d3241f | ||
|
|
10f4db4dbe | ||
|
|
5b3ba94ab4 | ||
|
|
90f3c01fa4 | ||
|
|
807d680337 | ||
|
|
5afb387bd4 | ||
|
|
43e77e59ab | ||
|
|
00bd08edee | ||
|
|
43f10573c9 | ||
|
|
86e1060b17 | ||
|
|
27066d1b2b | ||
|
|
57c84ff129 | ||
|
|
e68de8adc0 | ||
|
|
a1ffa56a1e | ||
|
|
0a208d1f54 | ||
|
|
03a49bb8f0 | ||
|
|
8e87cc57f1 | ||
|
|
6dd302653f | ||
|
|
de00ebeac4 | ||
|
|
639680d220 | ||
|
|
c5362c739f | ||
|
|
0a49676fb0 | ||
|
|
c012a8c477 | ||
|
|
ebed80a7c8 | ||
|
|
a73af584fe | ||
|
|
a97954b6a8 | ||
|
|
a911f4dd20 | ||
|
|
5395471d29 | ||
|
|
a57c877f18 | ||
|
|
f917020983 | ||
|
|
86483ca774 | ||
|
|
b93a9e6f6d | ||
|
|
d8839ef7d9 | ||
|
|
e998fa76b9 | ||
|
|
6a895197fa | ||
|
|
8c760b6ab6 | ||
|
|
3ee68590c7 | ||
|
|
7196348157 | ||
|
|
176c799f4c | ||
|
|
612e7729c2 | ||
|
|
ecde7af9c4 | ||
|
|
8df523351f | ||
|
|
b03ff6a96b | ||
|
|
ed81d5edd1 | ||
|
|
3c23ac840e | ||
|
|
a708ef5944 | ||
|
|
66a2209645 | ||
|
|
0bfa229bf1 | ||
|
|
7493c51c55 | ||
|
|
ac773bbe80 | ||
|
|
48e376a007 | ||
|
|
21eb2c3372 | ||
|
|
e2b31243c0 | ||
|
|
c3598d02fa | ||
|
|
57c629e9c1 | ||
|
|
d106bf39f5 | ||
|
|
b0651021e5 | ||
|
|
f600d5192e | ||
|
|
8e7820131e | ||
|
|
0a12cea25f | ||
|
|
dd6dbd93f8 | ||
|
|
26366009c5 | ||
|
|
16c472abe7 | ||
|
|
3b23d57c96 | ||
|
|
2f4226fe52 | ||
|
|
792cbd64ca | ||
|
|
2ed4722e26 | ||
|
|
a3299c3d1d | ||
|
|
6c21a0c2d7 | ||
|
|
562339abc3 | ||
|
|
d7adcadb9b | ||
|
|
f678c3f61a | ||
|
|
be0a3f7570 | ||
|
|
17dc9c7fc9 | ||
|
|
7eca859110 | ||
|
|
636ee223ac | ||
|
|
b7d59ffce2 | ||
|
|
5569f5218d | ||
|
|
138d891d7f | ||
|
|
d7166e74c1 | ||
|
|
417fd28fb1 | ||
|
|
7faba503c4 | ||
|
|
bc6be89d16 | ||
|
|
32224f568a | ||
|
|
f3dc292e9f | ||
|
|
138c5fa186 | ||
|
|
2f2c1d73a7 | ||
|
|
fb3e78ab09 | ||
|
|
fd3bfe74c9 | ||
|
|
bfdb512f11 | ||
|
|
d25c1ec3c9 | ||
|
|
7cc6058ac6 | ||
|
|
28028dff2f | ||
|
|
3417ba5648 | ||
|
|
58cfe0dc44 | ||
|
|
e86221deb6 | ||
|
|
289fc48ab7 | ||
|
|
2f2212e6cc | ||
|
|
18e01a0a10 | ||
|
|
6cb901093f | ||
|
|
ead7bde1ab | ||
|
|
6aa6ad8992 | ||
|
|
c8c3935b70 | ||
|
|
bb6888b8b1 | ||
|
|
1aaec59d79 | ||
|
|
1659b2e058 | ||
|
|
d6e04f4c43 | ||
|
|
a8f66cbde8 | ||
|
|
16d2ad1d38 | ||
|
|
5dc3538736 | ||
|
|
36bf213181 | ||
|
|
6f0dd93801 | ||
|
|
5d199ac8f2 | ||
|
|
9e0f44bec4 |
@@ -13,9 +13,10 @@ import os
|
|||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
import lm_eval
|
import lm_eval
|
||||||
import numpy as np
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
DEFAULT_RTOL = 0.08
|
DEFAULT_RTOL = 0.08
|
||||||
|
|
||||||
|
|
||||||
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
|
|||||||
"allow_deprecated_quantization=True,"
|
"allow_deprecated_quantization=True,"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
|
||||||
|
model_args += "attention_backend=TRITON_ATTN"
|
||||||
|
|
||||||
env_vars = eval_config.get("env_vars", None)
|
env_vars = eval_config.get("env_vars", None)
|
||||||
with scoped_env_vars(env_vars):
|
with scoped_env_vars(env_vars):
|
||||||
results = lm_eval.simple_evaluate(
|
results = lm_eval.simple_evaluate(
|
||||||
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
|||||||
f"ground_truth={ground_truth:.3f} | "
|
f"ground_truth={ground_truth:.3f} | "
|
||||||
f"measured={measured_value:.3f} | rtol={rtol}"
|
f"measured={measured_value:.3f} | rtol={rtol}"
|
||||||
)
|
)
|
||||||
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
|
|
||||||
|
min_acceptable = ground_truth * (1 - rtol)
|
||||||
|
success = success and measured_value >= min_acceptable
|
||||||
|
|
||||||
assert success
|
assert success
|
||||||
|
|||||||
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3-8B",
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -10,7 +10,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"max-model-len": 2048,
|
"max-model-len": 2048,
|
||||||
@@ -37,7 +36,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"max-model-len": 2048,
|
"max-model-len": 2048,
|
||||||
@@ -64,7 +62,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
"tensor_parallel_size": 2,
|
"tensor_parallel_size": 2,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"max-model-len": 2048,
|
"max-model-len": 2048,
|
||||||
@@ -91,7 +88,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "deepseek-ai/DeepSeek-R1",
|
"model": "deepseek-ai/DeepSeek-R1",
|
||||||
"tensor_parallel_size": 8,
|
"tensor_parallel_size": 8,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"max-model-len": 2048,
|
"max-model-len": 2048,
|
||||||
|
|||||||
@@ -5,7 +5,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
@@ -23,7 +22,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
@@ -41,7 +39,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
"tensor_parallel_size": 2,
|
"tensor_parallel_size": 2,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
@@ -59,7 +56,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"swap_space": 16,
|
|
||||||
"speculative_config": {
|
"speculative_config": {
|
||||||
"model": "turboderp/Qwama-0.5B-Instruct",
|
"model": "turboderp/Qwama-0.5B-Instruct",
|
||||||
"num_speculative_tokens": 4,
|
"num_speculative_tokens": 4,
|
||||||
|
|||||||
@@ -166,8 +166,15 @@ See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for contex
|
|||||||
EOF
|
EOF
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Notify Slack if webhook is configured.
|
# Notify Slack if webhook is configured and PR/branch are valid.
|
||||||
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
|
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
|
||||||
|
PR="${BUILDKITE_PULL_REQUEST:-}"
|
||||||
|
BRANCH="${BUILDKITE_BRANCH:-}"
|
||||||
|
|
||||||
|
# Skip notification if PR is invalid or branch is empty
|
||||||
|
if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
|
||||||
|
echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
|
||||||
|
else
|
||||||
echo ">>> Sending Slack notification"
|
echo ">>> Sending Slack notification"
|
||||||
# Single quotes are intentional: the f-string expressions are Python, not shell.
|
# Single quotes are intentional: the f-string expressions are Python, not shell.
|
||||||
# shellcheck disable=SC2016
|
# shellcheck disable=SC2016
|
||||||
@@ -198,6 +205,7 @@ print(json.dumps(data))
|
|||||||
-H 'Content-type: application/json' \
|
-H 'Content-type: application/json' \
|
||||||
-d "$PAYLOAD")
|
-d "$PAYLOAD")
|
||||||
echo " Slack webhook response: $HTTP_CODE"
|
echo " Slack webhook response: $HTTP_CODE"
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
|
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ function cpu_tests() {
|
|||||||
# offline inference
|
# offline inference
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
set -e
|
set -e
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
|
||||||
|
|
||||||
# Run model tests
|
# Run model tests
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ function cpu_tests() {
|
|||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
export TORCH_COMPILE_DISABLE=1
|
export TORCH_COMPILE_DISABLE=1
|
||||||
set -xve
|
set -xve
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
|
|||||||
@@ -25,5 +25,5 @@ remove_docker_container
|
|||||||
|
|
||||||
# Run the image and test offline inference
|
# Run the image and test offline inference
|
||||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
|
||||||
'
|
'
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
|
|||||||
-e PT_HPU_LAZY_MODE=1 \
|
-e PT_HPU_LAZY_MODE=1 \
|
||||||
"${image_name}" \
|
"${image_name}" \
|
||||||
/bin/bash -c '
|
/bin/bash -c '
|
||||||
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
'
|
'
|
||||||
|
|
||||||
EXITCODE=$?
|
EXITCODE=$?
|
||||||
|
|||||||
@@ -34,17 +34,17 @@ docker run \
|
|||||||
set -e
|
set -e
|
||||||
echo $ZE_AFFINITY_MASK
|
echo $ZE_AFFINITY_MASK
|
||||||
pip install tblib==3.1.0
|
pip install tblib==3.1.0
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||||
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||||
cd tests
|
cd tests
|
||||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
|
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
|
||||||
pytest -v -s v1/engine
|
pytest -v -s v1/engine
|
||||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
|
|||||||
BACKENDS=("allgather_reducescatter")
|
BACKENDS=("allgather_reducescatter")
|
||||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||||
export VLLM_ROCM_MOE_PADDING=0
|
export VLLM_ROCM_MOE_PADDING=0
|
||||||
PLATFORM_ARGS=("--no-async-scheduling")
|
PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
|
||||||
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
||||||
else
|
else
|
||||||
# Non-ROCm platform (CUDA/other)
|
# Non-ROCm platform (CUDA/other)
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ obj_json="objects.json"
|
|||||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
|
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||||
|
|
||||||
# call script to generate indicies for all existing wheels
|
# call script to generate indices for all existing wheels
|
||||||
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
||||||
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
||||||
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
||||||
|
|||||||
@@ -54,10 +54,13 @@ mkdir -p $DIST_DIR
|
|||||||
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
||||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||||
echo "Wheels copied to local directory"
|
echo "Wheels copied to local directory"
|
||||||
# generate source tarball
|
# generate source distribution using setup.py
|
||||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
|
python setup.py sdist --dist-dir=$DIST_DIR
|
||||||
ls -la $DIST_DIR
|
ls -la $DIST_DIR
|
||||||
|
|
||||||
|
SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
|
||||||
|
echo "Found sdist: $SDIST_FILE"
|
||||||
|
|
||||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||||
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||||
@@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
python3 -m twine check "$PYPI_WHEEL_FILES"
|
python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
|
||||||
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
|
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
|
||||||
echo "Wheels uploaded to PyPI"
|
echo "Wheels and source distribution uploaded to PyPI"
|
||||||
|
|||||||
@@ -467,7 +467,7 @@ steps:
|
|||||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||||
|
|
||||||
# TODO: Add the "V1 Test attetion (MI300)" test group
|
# TODO: Add the "V1 Test attention (MI300)" test group
|
||||||
|
|
||||||
- label: V1 Test attention (H100) # 10min
|
- label: V1 Test attention (H100) # 10min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
@@ -499,17 +499,6 @@ steps:
|
|||||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||||
|
|
||||||
- label: V1 Test attention (B200) # 10min
|
|
||||||
timeout_in_minutes: 30
|
|
||||||
gpu: b200
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/config/attention.py
|
|
||||||
- vllm/model_executor/layers/attention
|
|
||||||
- vllm/v1/attention
|
|
||||||
- tests/v1/attention
|
|
||||||
commands:
|
|
||||||
- pytest -v -s v1/attention
|
|
||||||
|
|
||||||
- label: V1 Test others (CPU) # 5 mins
|
- label: V1 Test others (CPU) # 5 mins
|
||||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||||
agent_pool: mi325_1
|
agent_pool: mi325_1
|
||||||
@@ -540,12 +529,12 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
# for basic
|
# for basic
|
||||||
- python3 offline_inference/basic/chat.py
|
- python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 basic/offline_inference/classify.py
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 basic/offline_inference/embed.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 basic/offline_inference/score.py
|
||||||
# for multi-modal models
|
# for multi-modal models
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
@@ -1180,52 +1169,45 @@ steps:
|
|||||||
- pytest -v -s tests/models/test_transformers.py
|
- pytest -v -s tests/models/test_transformers.py
|
||||||
# - pytest -v -s tests/models/multimodal/processing/
|
# - pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
# Whisper needs spawn method to avoid deadlock
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
|
|
||||||
- label: Blackwell Test # 21 min
|
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 40
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
gpu: b200
|
||||||
# optional: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/quantization/fp4/
|
- csrc/quantization/fp4/
|
||||||
- csrc/attention/mla/
|
|
||||||
- csrc/quantization/cutlass_w8a8/moe/
|
|
||||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
|
||||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
|
||||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
- vllm/v1/worker/
|
||||||
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
- vllm/v1/cudagraph_dispatcher.py
|
||||||
- vllm/v1/attention/selector.py
|
- vllm/compilation/
|
||||||
- vllm/platforms/cuda.py
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/passes/test_fusion_attn.py
|
||||||
|
- tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||||
|
- tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||||
|
- tests/compile/fullgraph/test_full_graph.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- pytest -v -s tests/compile/passes/test_fusion_attn.py
|
||||||
# Attention
|
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||||
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
# # Wrap with quotes to escape yaml
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||||
# Quantization
|
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
|
||||||
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
|
||||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
|
||||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
|
||||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
|
||||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
|
||||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
|
||||||
|
|
||||||
- label: Blackwell GPT-OSS Eval
|
- label: Blackwell GPT-OSS Eval
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
@@ -1258,16 +1240,6 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
- label: Blackwell LM Eval Small Models
|
|
||||||
timeout_in_minutes: 120
|
|
||||||
gpu: b200
|
|
||||||
optional: true # run on nightlies
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
commands:
|
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
|
|
||||||
@@ -1514,6 +1486,20 @@ steps:
|
|||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||||
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
agent_pool: mi325_4
|
||||||
|
# grade: Blocking
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||||
|
- CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
##### A100 test #####
|
##### A100 test #####
|
||||||
|
|
||||||
@@ -1654,7 +1640,7 @@ steps:
|
|||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
commands:
|
commands:
|
||||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx942.txt
|
||||||
|
|
||||||
##### EPLB Accuracy Tests #####
|
##### EPLB Accuracy Tests #####
|
||||||
- label: DeepSeek V2-Lite Accuracy
|
- label: DeepSeek V2-Lite Accuracy
|
||||||
@@ -1681,16 +1667,6 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||||
|
|
||||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
gpu: b200
|
|
||||||
optional: true
|
|
||||||
num_gpus: 2
|
|
||||||
working_dir: "/vllm-workspace"
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
|
||||||
|
|
||||||
|
|
||||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
@@ -2174,20 +2150,7 @@ steps:
|
|||||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||||
|
|
||||||
# TODO: Add the "V1 Test attetion (MI300)" test group
|
# TODO: Add the "V1 Test attention (MI300)" test group
|
||||||
|
|
||||||
- label: V1 Test attention (H100) # 10min
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
agent_pool: mi355_1
|
|
||||||
timeout_in_minutes: 30
|
|
||||||
gpu: h100
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/config/attention.py
|
|
||||||
- vllm/model_executor/layers/attention
|
|
||||||
- vllm/v1/attention
|
|
||||||
- tests/v1/attention
|
|
||||||
commands:
|
|
||||||
- pytest -v -s v1/attention
|
|
||||||
|
|
||||||
- label: Batch Invariance Tests (H100) # 10min
|
- label: Batch Invariance Tests (H100) # 10min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@@ -2205,6 +2168,8 @@ steps:
|
|||||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||||
|
|
||||||
- label: V1 Test attention (B200) # 10min
|
- label: V1 Test attention (B200) # 10min
|
||||||
|
mirror_hardwares: [amdexperimental, amdmi355]
|
||||||
|
agent_pool: mi355_1
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
gpu: b200
|
gpu: b200
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@@ -2243,12 +2208,12 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
# for basic
|
# for basic
|
||||||
- python3 offline_inference/basic/chat.py
|
- python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 basic/offline_inference/classify.py
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 basic/offline_inference/embed.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 basic/offline_inference/score.py
|
||||||
# for multi-modal models
|
# for multi-modal models
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
@@ -2824,12 +2789,14 @@ steps:
|
|||||||
- pytest -v -s tests/models/test_transformers.py
|
- pytest -v -s tests/models/test_transformers.py
|
||||||
# - pytest -v -s tests/models/multimodal/processing/
|
# - pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
# Whisper needs spawn method to avoid deadlock
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
|
|
||||||
- label: Blackwell Test # 21 min
|
- label: Blackwell Test (MI355) # 21 min
|
||||||
|
mirror_hardwares: [amdexperimental, amdmi355]
|
||||||
|
agent_pool: mi355_1
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
gpu: b200
|
||||||
@@ -2848,28 +2815,28 @@ steps:
|
|||||||
- vllm/v1/attention/selector.py
|
- vllm/v1/attention/selector.py
|
||||||
- vllm/platforms/cuda.py
|
- vllm/platforms/cuda.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- rocm-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# Attention
|
# Attention
|
||||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
#- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
#- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
||||||
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
#- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
#- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
||||||
# Quantization
|
## Quantization
|
||||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
#- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
#- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
#- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
#- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
#- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
#- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
#- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
#- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
#- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
#- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
#- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
#- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||||
|
|
||||||
- label: Blackwell Fusion and Compile Tests # 30 min
|
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
@@ -2939,13 +2906,15 @@ steps:
|
|||||||
|
|
||||||
- label: Blackwell LM Eval Small Models
|
- label: Blackwell LM Eval Small Models
|
||||||
timeout_in_minutes: 120
|
timeout_in_minutes: 120
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
|
||||||
|
agent_pool: mi355_2
|
||||||
gpu: b200
|
gpu: b200
|
||||||
optional: true # run on nightlies
|
optional: true # run on nightlies
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi355.txt
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
@@ -3181,6 +3150,20 @@ steps:
|
|||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||||
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
agent_pool: mi355_4
|
||||||
|
# grade: Blocking
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||||
|
- CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
##### A100 test #####
|
##### A100 test #####
|
||||||
|
|
||||||
@@ -3314,7 +3297,7 @@ steps:
|
|||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
commands:
|
commands:
|
||||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx950.txt
|
||||||
|
|
||||||
##### EPLB Accuracy Tests #####
|
##### EPLB Accuracy Tests #####
|
||||||
- label: DeepSeek V2-Lite Accuracy
|
- label: DeepSeek V2-Lite Accuracy
|
||||||
@@ -3328,18 +3311,9 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||||
|
|
||||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
|
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355)
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
|
||||||
agent_pool: mi355_4
|
agent_pool: mi355_2
|
||||||
timeout_in_minutes: 60
|
|
||||||
gpu: h100
|
|
||||||
optional: true
|
|
||||||
num_gpus: 4
|
|
||||||
working_dir: "/vllm-workspace"
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
|
||||||
|
|
||||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
gpu: b200
|
gpu: b200
|
||||||
optional: true
|
optional: true
|
||||||
@@ -3358,3 +3332,18 @@ steps:
|
|||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||||
|
|
||||||
|
- label: Attention Benchmarks Smoke Test (B200-MI355)
|
||||||
|
device: b200
|
||||||
|
mirror_hardwares: [amdexperimental, amdmi355]
|
||||||
|
agent_pool: mi355_2
|
||||||
|
num_gpus: 2
|
||||||
|
optional: true
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
timeout_in_minutes: 10
|
||||||
|
source_file_dependencies:
|
||||||
|
- benchmarks/attention_benchmarks/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
commands:
|
||||||
|
- python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1
|
||||||
|
|
||||||
|
|||||||
@@ -36,6 +36,16 @@ steps:
|
|||||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||||
|
|
||||||
|
- label: AsyncTP Correctness Tests (B200)
|
||||||
|
timeout_in_minutes: 50
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: b200
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
|
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||||
|
|
||||||
- label: Distributed Compile Unit Tests (2xH100)
|
- label: Distributed Compile Unit Tests (2xH100)
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
|
|||||||
@@ -67,6 +67,7 @@ steps:
|
|||||||
- tests/v1/distributed
|
- tests/v1/distributed
|
||||||
- tests/v1/engine/test_engine_core_client.py
|
- tests/v1/engine/test_engine_core_client.py
|
||||||
- tests/distributed/test_symm_mem_allreduce.py
|
- tests/distributed/test_symm_mem_allreduce.py
|
||||||
|
- tests/distributed/test_multiproc_executor.py
|
||||||
commands:
|
commands:
|
||||||
# https://github.com/NVIDIA/nccl/issues/1838
|
# https://github.com/NVIDIA/nccl/issues/1838
|
||||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||||
@@ -95,6 +96,8 @@ steps:
|
|||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
- pytest -v -s distributed/test_events.py
|
- pytest -v -s distributed/test_events.py
|
||||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||||
|
# test multi-node TP with multiproc executor (simulated on single node)
|
||||||
|
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
# TODO: create a dedicated test section for multi-GPU example tests
|
||||||
# when we have multiple distributed example tests
|
# when we have multiple distributed example tests
|
||||||
# OLD rlhf examples
|
# OLD rlhf examples
|
||||||
@@ -210,6 +213,19 @@ steps:
|
|||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
device: a100
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- vllm/v1/worker/kv_connector_model_runner_mixin.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
|
- bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
|
||||||
|
|
||||||
- label: Pipeline + Context Parallelism (4 GPUs)
|
- label: Pipeline + Context Parallelism (4 GPUs)
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
|||||||
@@ -41,6 +41,11 @@ steps:
|
|||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
|
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Entrypoints Integration (API Server 2)
|
- label: Entrypoints Integration (API Server 2)
|
||||||
timeout_in_minutes: 130
|
timeout_in_minutes: 130
|
||||||
@@ -55,6 +60,11 @@ steps:
|
|||||||
- pytest -v -s entrypoints/instrumentator
|
- pytest -v -s entrypoints/instrumentator
|
||||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||||
- pytest -v -s tool_use
|
- pytest -v -s tool_use
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Entrypoints Integration (Pooling)
|
- label: Entrypoints Integration (Pooling)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
@@ -87,6 +97,11 @@ steps:
|
|||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s v1/entrypoints
|
- pytest -v -s v1/entrypoints
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: OpenAI API Correctness
|
- label: OpenAI API Correctness
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
|
|||||||
@@ -8,8 +8,9 @@ steps:
|
|||||||
- csrc/
|
- csrc/
|
||||||
- tests/kernels/core
|
- tests/kernels/core
|
||||||
- tests/kernels/test_top_k_per_row.py
|
- tests/kernels/test_top_k_per_row.py
|
||||||
|
- tests/kernels/test_concat_mla_q.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
|
||||||
|
|
||||||
- label: Kernels Attention Test %N
|
- label: Kernels Attention Test %N
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 35
|
||||||
@@ -96,7 +97,7 @@ steps:
|
|||||||
- vllm/platforms/cuda.py
|
- vllm/platforms/cuda.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# Attention
|
# Attention
|
||||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||||
|
|||||||
@@ -67,12 +67,13 @@ steps:
|
|||||||
- examples/
|
- examples/
|
||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
- python3 offline_inference/basic/chat.py # for basic
|
# for basic
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 basic/offline_inference/chat.py
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 basic/offline_inference/classify.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 basic/offline_inference/embed.py
|
||||||
|
- python3 basic/offline_inference/score.py
|
||||||
# for multi-modal models
|
# for multi-modal models
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
@@ -87,6 +88,11 @@ steps:
|
|||||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Metrics, Tracing (2 GPUs)
|
- label: Metrics, Tracing (2 GPUs)
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ steps:
|
|||||||
- pytest -v -s tests/models/test_transformers.py
|
- pytest -v -s tests/models/test_transformers.py
|
||||||
- pytest -v -s tests/models/multimodal/processing/
|
- pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
# Whisper needs spawn method to avoid deadlock
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
|
|||||||
@@ -12,6 +12,11 @@ steps:
|
|||||||
- pip freeze | grep -E 'torch'
|
- pip freeze | grep -E 'torch'
|
||||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Multi-Modal Processor Test (CPU)
|
- label: Multi-Modal Processor Test (CPU)
|
||||||
depends_on:
|
depends_on:
|
||||||
@@ -54,6 +59,11 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Multi-Modal Models (Extended) 2
|
- label: Multi-Modal Models (Extended) 2
|
||||||
optional: true
|
optional: true
|
||||||
|
|||||||
@@ -15,9 +15,12 @@ steps:
|
|||||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||||
- pip uninstall vllm_add_dummy_platform -y
|
- pip uninstall vllm_add_dummy_platform -y
|
||||||
# end platform plugin tests
|
# end platform plugin tests
|
||||||
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
|
# begin io_processor plugins test
|
||||||
|
# test generic io_processor plugins functions
|
||||||
|
- pytest -v -s ./plugins_tests/test_io_processor_plugins.py
|
||||||
|
# test Terratorch io_processor plugins
|
||||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
- pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py
|
||||||
- pip uninstall prithvi_io_processor_plugin -y
|
- pip uninstall prithvi_io_processor_plugin -y
|
||||||
# test bge_m3_sparse io_processor plugin
|
# test bge_m3_sparse io_processor plugin
|
||||||
- pip install -e ./plugins/bge_m3_sparse_plugin
|
- pip install -e ./plugins/bge_m3_sparse_plugin
|
||||||
@@ -36,3 +39,8 @@ steps:
|
|||||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_2
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|||||||
7
.github/mergify.yml
vendored
7
.github/mergify.yml
vendored
@@ -3,6 +3,7 @@ pull_request_rules:
|
|||||||
description: Automatically apply documentation label
|
description: Automatically apply documentation label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
- label != stale
|
||||||
|
- -closed
|
||||||
- or:
|
- or:
|
||||||
- files~=^[^/]+\.md$
|
- files~=^[^/]+\.md$
|
||||||
- files~=^docs/
|
- files~=^docs/
|
||||||
@@ -37,15 +38,13 @@ pull_request_rules:
|
|||||||
|
|
||||||
> [!TIP]
|
> [!TIP]
|
||||||
> <details>
|
> <details>
|
||||||
> <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
|
> <summary>Is <code>mypy</code> failing?</summary>
|
||||||
> <br/>
|
> <br/>
|
||||||
> <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
|
> <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally:
|
||||||
>
|
>
|
||||||
> ```bash
|
> ```bash
|
||||||
> # For mypy (substitute "3.10" with the failing version if needed)
|
> # For mypy (substitute "3.10" with the failing version if needed)
|
||||||
> pre-commit run --hook-stage manual mypy-3.10
|
> pre-commit run --hook-stage manual mypy-3.10
|
||||||
> # For markdownlint
|
|
||||||
> pre-commit run --hook-stage manual markdownlint
|
|
||||||
> ```
|
> ```
|
||||||
> </details>
|
> </details>
|
||||||
|
|
||||||
|
|||||||
3
.github/workflows/macos-smoke-test.yml
vendored
3
.github/workflows/macos-smoke-test.yml
vendored
@@ -6,6 +6,9 @@ on:
|
|||||||
- main
|
- main
|
||||||
workflow_dispatch: # Manual trigger
|
workflow_dispatch: # Manual trigger
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
macos-m1-smoke-test:
|
macos-m1-smoke-test:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ repos:
|
|||||||
args: [--output-format, github, --fix]
|
args: [--output-format, github, --fix]
|
||||||
- id: ruff-format
|
- id: ruff-format
|
||||||
- repo: https://github.com/crate-ci/typos
|
- repo: https://github.com/crate-ci/typos
|
||||||
rev: v1.38.1
|
rev: v1.43.5
|
||||||
hooks:
|
hooks:
|
||||||
- id: typos
|
- id: typos
|
||||||
args: [--force-exclude]
|
args: [--force-exclude]
|
||||||
@@ -24,12 +24,12 @@ repos:
|
|||||||
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
||||||
types_or: [c++, cuda]
|
types_or: [c++, cuda]
|
||||||
args: [--style=file, --verbose]
|
args: [--style=file, --verbose]
|
||||||
- repo: https://github.com/igorshubovych/markdownlint-cli
|
- repo: https://github.com/DavidAnson/markdownlint-cli2
|
||||||
rev: v0.45.0
|
rev: v0.21.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: markdownlint
|
- id: markdownlint-cli2
|
||||||
exclude: '.*\.inc\.md'
|
language_version: lts
|
||||||
stages: [manual] # Only run in CI
|
args: [--fix]
|
||||||
- repo: https://github.com/rhysd/actionlint
|
- repo: https://github.com/rhysd/actionlint
|
||||||
rev: v1.7.7
|
rev: v1.7.7
|
||||||
hooks:
|
hooks:
|
||||||
@@ -55,7 +55,7 @@ repos:
|
|||||||
language: python
|
language: python
|
||||||
types_or: [python, pyi]
|
types_or: [python, pyi]
|
||||||
require_serial: true
|
require_serial: true
|
||||||
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
|
additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
|
||||||
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
name: Run mypy for Python 3.10
|
name: Run mypy for Python 3.10
|
||||||
entry: python tools/pre_commit/mypy.py 1 "3.10"
|
entry: python tools/pre_commit/mypy.py 1 "3.10"
|
||||||
@@ -127,6 +127,13 @@ repos:
|
|||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
additional_dependencies: [regex]
|
additional_dependencies: [regex]
|
||||||
|
# prevent use torch.cuda APIs
|
||||||
|
- id: check-torch-cuda-call
|
||||||
|
name: "Prevent new 'torch.cuda' APIs call"
|
||||||
|
entry: python tools/pre_commit/check_torch_cuda.py
|
||||||
|
language: python
|
||||||
|
types: [python]
|
||||||
|
additional_dependencies: [regex]
|
||||||
- id: validate-config
|
- id: validate-config
|
||||||
name: Validate configuration has default values and that each field has a docstring
|
name: Validate configuration has default values and that each field has a docstring
|
||||||
entry: python tools/pre_commit/validate_config.py
|
entry: python tools/pre_commit/validate_config.py
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ build:
|
|||||||
python: "3.12"
|
python: "3.12"
|
||||||
jobs:
|
jobs:
|
||||||
post_checkout:
|
post_checkout:
|
||||||
|
- bash docs/maybe_skip_pr_build.sh
|
||||||
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
|
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
|
||||||
pre_create_environment:
|
pre_create_environment:
|
||||||
- pip install uv
|
- pip install uv
|
||||||
|
|||||||
@@ -187,7 +187,7 @@ python benchmark.py \
|
|||||||
## Hardware Requirements
|
## Hardware Requirements
|
||||||
|
|
||||||
| Backend | Hardware |
|
| Backend | Hardware |
|
||||||
|---------|----------|
|
| ------- | -------- |
|
||||||
| Flash/Triton/FlashInfer | Any CUDA GPU |
|
| Flash/Triton/FlashInfer | Any CUDA GPU |
|
||||||
| CUTLASS MLA | Blackwell (SM100+) |
|
| CUTLASS MLA | Blackwell (SM100+) |
|
||||||
| FlashAttn MLA | Hopper (SM90+) |
|
| FlashAttn MLA | Hopper (SM90+) |
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
|
|||||||
max_kv_len = max(r.kv_len for r in requests) if requests else 0
|
max_kv_len = max(r.kv_len for r in requests) if requests else 0
|
||||||
return (batch_size, max_q_len, max_kv_len)
|
return (batch_size, max_q_len, max_kv_len)
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fallback for unparseable specs
|
# Fallback for unparsable specs
|
||||||
return (0, 0, 0)
|
return (0, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -145,7 +145,6 @@ def create_minimal_vllm_config(
|
|||||||
cache_config = CacheConfig(
|
cache_config = CacheConfig(
|
||||||
block_size=block_size,
|
block_size=block_size,
|
||||||
gpu_memory_utilization=0.9,
|
gpu_memory_utilization=0.9,
|
||||||
swap_space=0,
|
|
||||||
cache_dtype="auto",
|
cache_dtype="auto",
|
||||||
enable_prefix_caching=False,
|
enable_prefix_caching=False,
|
||||||
)
|
)
|
||||||
@@ -701,7 +700,7 @@ def _run_single_benchmark(
|
|||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(config.warmup_iters):
|
for _ in range(config.warmup_iters):
|
||||||
forward_fn()
|
forward_fn()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Benchmark
|
# Benchmark
|
||||||
times = []
|
times = []
|
||||||
@@ -714,7 +713,7 @@ def _run_single_benchmark(
|
|||||||
forward_fn()
|
forward_fn()
|
||||||
end.record()
|
end.record()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
elapsed_ms = start.elapsed_time(end)
|
elapsed_ms = start.elapsed_time(end)
|
||||||
times.append(elapsed_ms / 1000.0 / config.num_layers)
|
times.append(elapsed_ms / 1000.0 / config.num_layers)
|
||||||
|
|
||||||
|
|||||||
@@ -141,7 +141,6 @@ def _create_vllm_config(
|
|||||||
cache_config = CacheConfig(
|
cache_config = CacheConfig(
|
||||||
block_size=config.block_size,
|
block_size=config.block_size,
|
||||||
cache_dtype="auto",
|
cache_dtype="auto",
|
||||||
swap_space=0,
|
|
||||||
)
|
)
|
||||||
cache_config.num_gpu_blocks = max_num_blocks
|
cache_config.num_gpu_blocks = max_num_blocks
|
||||||
cache_config.num_cpu_blocks = 0
|
cache_config.num_cpu_blocks = 0
|
||||||
@@ -391,7 +390,7 @@ def _run_single_benchmark(
|
|||||||
attn_metadata,
|
attn_metadata,
|
||||||
output=out,
|
output=out,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Benchmark
|
# Benchmark
|
||||||
times = []
|
times = []
|
||||||
@@ -412,7 +411,7 @@ def _run_single_benchmark(
|
|||||||
)
|
)
|
||||||
end.record()
|
end.record()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
elapsed_ms = start.elapsed_time(end)
|
elapsed_ms = start.elapsed_time(end)
|
||||||
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
|
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
|
||||||
|
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LE
|
|||||||
| --- | --- | --- |
|
| --- | --- | --- |
|
||||||
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
|
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
|
||||||
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
|
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
|
||||||
| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
|
| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
|
||||||
| `TP` | **Required.** The tensor-parallelism size. | `1` |
|
| `TP` | **Required.** The tensor-parallelism size. | `1` |
|
||||||
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
||||||
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
||||||
|
|||||||
@@ -94,7 +94,7 @@ def create_logits(
|
|||||||
|
|
||||||
def measure_memory() -> tuple[int, int]:
|
def measure_memory() -> tuple[int, int]:
|
||||||
"""Return (allocated, reserved) memory in bytes."""
|
"""Return (allocated, reserved) memory in bytes."""
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
|
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
|
||||||
|
|
||||||
|
|
||||||
@@ -102,7 +102,7 @@ def reset_memory_stats():
|
|||||||
"""Reset peak memory statistics."""
|
"""Reset peak memory statistics."""
|
||||||
reset_buffer_cache()
|
reset_buffer_cache()
|
||||||
torch.cuda.reset_peak_memory_stats()
|
torch.cuda.reset_peak_memory_stats()
|
||||||
torch.cuda.empty_cache()
|
torch.accelerator.empty_cache()
|
||||||
gc.collect()
|
gc.collect()
|
||||||
|
|
||||||
|
|
||||||
@@ -123,7 +123,7 @@ def benchmark_function(
|
|||||||
for _ in range(warmup_iters):
|
for _ in range(warmup_iters):
|
||||||
logits_copy = logits.clone()
|
logits_copy = logits.clone()
|
||||||
func(logits_copy, k, p)
|
func(logits_copy, k, p)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Reset memory stats before benchmark
|
# Reset memory stats before benchmark
|
||||||
reset_memory_stats()
|
reset_memory_stats()
|
||||||
@@ -140,7 +140,7 @@ def benchmark_function(
|
|||||||
func(logits_copy, k, p)
|
func(logits_copy, k, p)
|
||||||
end_events[i].record()
|
end_events[i].record()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Calculate timing
|
# Calculate timing
|
||||||
times = [
|
times = [
|
||||||
|
|||||||
98
benchmarks/kernels/bench_concat_mla_q.py
Normal file
98
benchmarks/kernels/bench_concat_mla_q.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.triton_utils import triton
|
||||||
|
|
||||||
|
# DeepSeek V3 dimensions
|
||||||
|
NOPE_DIM = 512
|
||||||
|
ROPE_DIM = 64
|
||||||
|
NUM_HEADS = 128
|
||||||
|
|
||||||
|
NUM_TOKENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
|
||||||
|
|
||||||
|
|
||||||
|
def get_configs():
|
||||||
|
return NUM_TOKENS
|
||||||
|
|
||||||
|
|
||||||
|
def make_inputs(num_tokens, dtype):
|
||||||
|
"""Create inputs matching the real code path.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
contiguous_nope: If False, simulate the transposed BMM output
|
||||||
|
(non-contiguous nope with stride pattern from
|
||||||
|
[N,B,L].transpose(0,1)).
|
||||||
|
"""
|
||||||
|
# Simulate: bmm output [N, B, L].transpose(0, 1) -> [B, N, L]
|
||||||
|
raw = torch.randn(NUM_HEADS, num_tokens, NOPE_DIM, dtype=dtype, device="cuda")
|
||||||
|
ql_nope = raw.transpose(0, 1)
|
||||||
|
|
||||||
|
q_pe = torch.randn(num_tokens, NUM_HEADS, ROPE_DIM, dtype=dtype, device="cuda")
|
||||||
|
return ql_nope, q_pe
|
||||||
|
|
||||||
|
|
||||||
|
# ---- Non-contiguous nope benchmark (real code path) ----
|
||||||
|
@triton.testing.perf_report(
|
||||||
|
triton.testing.Benchmark(
|
||||||
|
x_names=["num_tokens"],
|
||||||
|
x_vals=get_configs(),
|
||||||
|
line_arg="provider",
|
||||||
|
line_vals=["torch_cat", "concat_mla_q"],
|
||||||
|
line_names=["torch.cat", "concat_mla_q (v8)"],
|
||||||
|
styles=[("blue", "--"), ("green", "-")],
|
||||||
|
ylabel="Latency (us)",
|
||||||
|
plot_name="concat_mla_q-transposed",
|
||||||
|
args={},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
def bench_transposed(num_tokens, provider):
|
||||||
|
dtype = torch.bfloat16
|
||||||
|
ql_nope, q_pe = make_inputs(num_tokens, dtype)
|
||||||
|
|
||||||
|
q_out = torch.empty(
|
||||||
|
num_tokens, NUM_HEADS, NOPE_DIM + ROPE_DIM, dtype=dtype, device="cuda"
|
||||||
|
)
|
||||||
|
|
||||||
|
quantiles = [0.5, 0.2, 0.8]
|
||||||
|
|
||||||
|
if provider == "torch_cat":
|
||||||
|
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||||
|
lambda: torch.cat((ql_nope, q_pe), dim=-1), quantiles=quantiles, rep=500
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||||
|
lambda: ops.concat_mla_q(ql_nope, q_pe, q_out), quantiles=quantiles, rep=500
|
||||||
|
)
|
||||||
|
|
||||||
|
return ms * 1000, max_ms * 1000, min_ms * 1000 # us
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(description="Benchmark concat_mla_q vs torch.cat")
|
||||||
|
parser.add_argument(
|
||||||
|
"--save-path", type=str, default=None, help="Path to save benchmark results"
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("CONCAT MLA Q KERNEL BENCHMARKS")
|
||||||
|
print("=" * 70)
|
||||||
|
print(f"Dimensions: nope={NOPE_DIM}, rope={ROPE_DIM}, heads={NUM_HEADS}")
|
||||||
|
print(
|
||||||
|
f"Per-head output: {NOPE_DIM + ROPE_DIM} bf16 = "
|
||||||
|
f"{(NOPE_DIM + ROPE_DIM) * 2} bytes"
|
||||||
|
)
|
||||||
|
print(f"num_tokens (decode=batch_size, prefill=chunk_size): {NUM_TOKENS}")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
print("\n--- Non-contiguous nope inputs (transposed BMM output) ---")
|
||||||
|
bench_transposed.run(print_data=True, save_path=args.save_path)
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("Benchmarking complete!")
|
||||||
|
print("=" * 70)
|
||||||
153
benchmarks/kernels/bench_cp_gather_fp8.py
Normal file
153
benchmarks/kernels/bench_cp_gather_fp8.py
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import argparse
|
||||||
|
import math
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.triton_utils import triton
|
||||||
|
|
||||||
|
# DeepSeek V3 MLA dimensions
|
||||||
|
NOPE_DIM = 512
|
||||||
|
ROPE_DIM = 64
|
||||||
|
HEAD_DIM = NOPE_DIM + ROPE_DIM # 576 BF16 output elements per token
|
||||||
|
ENTRY_BYTES = 656 # 512 FP8 + 16 scales + 128 BF16 RoPE
|
||||||
|
BLOCK_SIZE = 64 # tokens per physical cache block - get_supported_kernel_block_sizes
|
||||||
|
|
||||||
|
# Realistic prefill scenarios:
|
||||||
|
# - 1 long prefill: single request, 16K-96K tokens
|
||||||
|
# - 4 medium prefills: 4 requests, 4K-24K tokens each
|
||||||
|
# - 16 shorter prefills: 16 requests, 1K-6K tokens each
|
||||||
|
SCENARIOS = [
|
||||||
|
# (label, num_reqs, total_tokens_list)
|
||||||
|
("1-req", 1, [8192, 16384, 32768, 65536, 98304]),
|
||||||
|
("4-reqs", 4, [8192, 16384, 32768, 65536, 98304]),
|
||||||
|
("16-reqs", 16, [8192, 16384, 32768, 65536, 98304]),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def make_inputs(total_tokens, num_reqs, block_size):
|
||||||
|
"""Create synthetic FP8 cache, block table, and output buffer.
|
||||||
|
|
||||||
|
Fills the cache with random bytes (we only measure throughput,
|
||||||
|
not correctness). Block table maps each request to contiguous
|
||||||
|
physical blocks.
|
||||||
|
"""
|
||||||
|
# Divide tokens evenly across requests
|
||||||
|
base_len = total_tokens // num_reqs
|
||||||
|
remainder = total_tokens % num_reqs
|
||||||
|
seq_lens = [base_len + (1 if r < remainder else 0) for r in range(num_reqs)]
|
||||||
|
|
||||||
|
# workspace_starts: cumulative sum of seq_lens
|
||||||
|
workspace_starts = [0] * num_reqs
|
||||||
|
for r in range(1, num_reqs):
|
||||||
|
workspace_starts[r] = workspace_starts[r - 1] + seq_lens[r - 1]
|
||||||
|
|
||||||
|
# Physical blocks needed per request
|
||||||
|
blocks_per_req = [math.ceil(s / block_size) for s in seq_lens]
|
||||||
|
total_blocks = sum(blocks_per_req)
|
||||||
|
max_blocks = max(blocks_per_req)
|
||||||
|
|
||||||
|
# Allocate cache with random data (content doesn't matter for perf)
|
||||||
|
cache = torch.randint(
|
||||||
|
0,
|
||||||
|
256,
|
||||||
|
(total_blocks, block_size, ENTRY_BYTES),
|
||||||
|
dtype=torch.uint8,
|
||||||
|
device="cuda",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Block table: contiguous block assignments
|
||||||
|
block_table = torch.zeros(num_reqs, max_blocks, dtype=torch.int32, device="cuda")
|
||||||
|
block_idx = 0
|
||||||
|
for r in range(num_reqs):
|
||||||
|
for b in range(blocks_per_req[r]):
|
||||||
|
block_table[r, b] = block_idx
|
||||||
|
block_idx += 1
|
||||||
|
|
||||||
|
# Output workspace
|
||||||
|
dst = torch.zeros(total_tokens, HEAD_DIM, dtype=torch.bfloat16, device="cuda")
|
||||||
|
|
||||||
|
seq_lens_t = torch.tensor(seq_lens, dtype=torch.int32, device="cuda")
|
||||||
|
workspace_starts_t = torch.tensor(
|
||||||
|
workspace_starts, dtype=torch.int32, device="cuda"
|
||||||
|
)
|
||||||
|
|
||||||
|
return cache, dst, block_table, seq_lens_t, workspace_starts_t
|
||||||
|
|
||||||
|
|
||||||
|
def bench_scenario(label, num_reqs, total_tokens_list, save_path):
|
||||||
|
"""Run benchmark for a specific (num_reqs, total_tokens) scenario."""
|
||||||
|
|
||||||
|
@triton.testing.perf_report(
|
||||||
|
triton.testing.Benchmark(
|
||||||
|
x_names=["total_tokens"],
|
||||||
|
x_vals=total_tokens_list,
|
||||||
|
line_arg="provider",
|
||||||
|
line_vals=["cuda_kernel"],
|
||||||
|
line_names=["cp_gather_fp8 (CUDA)"],
|
||||||
|
styles=[("green", "-")],
|
||||||
|
ylabel="Latency (us)",
|
||||||
|
plot_name=f"cp_gather_fp8-{label}-bs{BLOCK_SIZE}",
|
||||||
|
args={"num_reqs": num_reqs},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
def bench_fn(total_tokens, provider, num_reqs):
|
||||||
|
cache, dst, block_table, seq_lens_t, ws_starts = make_inputs(
|
||||||
|
total_tokens, num_reqs, BLOCK_SIZE
|
||||||
|
)
|
||||||
|
|
||||||
|
quantiles = [0.5, 0.2, 0.8]
|
||||||
|
|
||||||
|
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||||
|
lambda: ops.cp_gather_and_upconvert_fp8_kv_cache(
|
||||||
|
cache, dst, block_table, seq_lens_t, ws_starts, num_reqs
|
||||||
|
),
|
||||||
|
quantiles=quantiles,
|
||||||
|
rep=500,
|
||||||
|
)
|
||||||
|
|
||||||
|
return ms * 1000, max_ms * 1000, min_ms * 1000 # us
|
||||||
|
|
||||||
|
seq_len_per_req = total_tokens_list[0] // num_reqs
|
||||||
|
seq_len_per_req_max = total_tokens_list[-1] // num_reqs
|
||||||
|
print(
|
||||||
|
f"\n--- {label}: {num_reqs} request(s), "
|
||||||
|
f"~{seq_len_per_req}-{seq_len_per_req_max} tokens/req ---"
|
||||||
|
)
|
||||||
|
bench_fn.run(print_data=True, save_path=save_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Benchmark cp_gather_and_upconvert_fp8_kv_cache"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--save-path",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Path to save benchmark results as CSV",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Print data volume info for bandwidth analysis
|
||||||
|
read_per_token = ENTRY_BYTES # 656 bytes from cache
|
||||||
|
write_per_token = HEAD_DIM * 2 # 576 * 2 = 1152 bytes to workspace
|
||||||
|
total_per_token = read_per_token + write_per_token # 1808 bytes
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("CP_GATHER_AND_UPCONVERT_FP8_KV_CACHE BENCHMARKS")
|
||||||
|
print("=" * 70)
|
||||||
|
print(f"Cache entry: {ENTRY_BYTES} bytes (512 FP8 + 16 scales + 128 RoPE)")
|
||||||
|
print(f"Output row: {HEAD_DIM} BF16 = {HEAD_DIM * 2} bytes")
|
||||||
|
print(f"Per token: {total_per_token} bytes (read + write)")
|
||||||
|
print(f"Block size: {BLOCK_SIZE} tokens/block")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
for label, num_reqs, total_tokens_list in SCENARIOS:
|
||||||
|
bench_scenario(label, num_reqs, total_tokens_list, args.save_path)
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("Benchmarking complete!")
|
||||||
|
print("=" * 70)
|
||||||
@@ -168,7 +168,7 @@ def bench_impl(
|
|||||||
# warmup
|
# warmup
|
||||||
for kwargs in kwargs_list:
|
for kwargs in kwargs_list:
|
||||||
impl_type.get_impl()(**kwargs)
|
impl_type.get_impl()(**kwargs)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Merge into a single kwargs and qualify arguments as ArgPool
|
# Merge into a single kwargs and qualify arguments as ArgPool
|
||||||
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
||||||
@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int):
|
|||||||
# reference output
|
# reference output
|
||||||
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
|
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
|
||||||
|
|
||||||
# test ouptut
|
# test output
|
||||||
out_q, out_s = output_from_impl(
|
out_q, out_s = output_from_impl(
|
||||||
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -171,7 +171,7 @@ def bench_run(
|
|||||||
activation=MoEActivation.SILU,
|
activation=MoEActivation.SILU,
|
||||||
global_num_experts=num_experts,
|
global_num_experts=num_experts,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
|
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
|
||||||
triton_stream = torch.cuda.Stream()
|
triton_stream = torch.cuda.Stream()
|
||||||
@@ -187,14 +187,14 @@ def bench_run(
|
|||||||
topk_ids,
|
topk_ids,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
|
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
|
||||||
"""Benchmark CUDA graph using events like benchmark_moe.py"""
|
"""Benchmark CUDA graph using events like benchmark_moe.py"""
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(num_warmup):
|
for _ in range(num_warmup):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Timing
|
# Timing
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
@@ -202,7 +202,7 @@ def bench_run(
|
|||||||
|
|
||||||
latencies = []
|
latencies = []
|
||||||
for _ in range(num_iters):
|
for _ in range(num_iters):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_event.record()
|
start_event.record()
|
||||||
graph.replay()
|
graph.replay()
|
||||||
end_event.record()
|
end_event.record()
|
||||||
|
|||||||
@@ -307,7 +307,7 @@ def bench_run(
|
|||||||
def replay_graph(graph, num_repeats):
|
def replay_graph(graph, num_repeats):
|
||||||
for _ in range(num_repeats):
|
for _ in range(num_repeats):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
cutlass_stream = torch.cuda.Stream()
|
cutlass_stream = torch.cuda.Stream()
|
||||||
cutlass_graph = torch.cuda.CUDAGraph()
|
cutlass_graph = torch.cuda.CUDAGraph()
|
||||||
@@ -330,7 +330,7 @@ def bench_run(
|
|||||||
e=num_experts,
|
e=num_experts,
|
||||||
device=device,
|
device=device,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
triton_stream = torch.cuda.Stream()
|
triton_stream = torch.cuda.Stream()
|
||||||
triton_graph = torch.cuda.CUDAGraph()
|
triton_graph = torch.cuda.CUDAGraph()
|
||||||
@@ -345,7 +345,7 @@ def bench_run(
|
|||||||
w2_fp8scale,
|
w2_fp8scale,
|
||||||
a_fp8_scale,
|
a_fp8_scale,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
min_run_time = 5
|
min_run_time = 5
|
||||||
num_warmup = 5
|
num_warmup = 5
|
||||||
|
|||||||
@@ -342,7 +342,7 @@ class CommunicatorBenchmark:
|
|||||||
if not should_use_fn(tensor):
|
if not should_use_fn(tensor):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
stream = torch.cuda.Stream()
|
stream = torch.cuda.Stream()
|
||||||
with torch.cuda.stream(stream):
|
with torch.cuda.stream(stream):
|
||||||
graph_input = tensor.clone()
|
graph_input = tensor.clone()
|
||||||
@@ -360,17 +360,17 @@ class CommunicatorBenchmark:
|
|||||||
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
||||||
allreduce_fn(graph_input)
|
allreduce_fn(graph_input)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
for _ in range(num_warmup):
|
for _ in range(num_warmup):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
for _ in range(num_trials):
|
for _ in range(num_trials):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
|
|
||||||
|
|||||||
@@ -385,7 +385,7 @@ def benchmark_operation(
|
|||||||
# Warmup before graph capture
|
# Warmup before graph capture
|
||||||
for _ in range(warmup):
|
for _ in range(warmup):
|
||||||
operation_func(*args, **kwargs)
|
operation_func(*args, **kwargs)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Create CUDA graph
|
# Create CUDA graph
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
@@ -398,19 +398,19 @@ def benchmark_operation(
|
|||||||
operation_func(*args, **kwargs)
|
operation_func(*args, **kwargs)
|
||||||
|
|
||||||
# Graph warmup
|
# Graph warmup
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
for _ in range(warmup):
|
for _ in range(warmup):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
|
|
||||||
# Benchmark with CUDA graph
|
# Benchmark with CUDA graph
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
for _ in range(trials // num_op_per_cudagraph):
|
for _ in range(trials // num_op_per_cudagraph):
|
||||||
# operation_func(*args, **kwargs)
|
# operation_func(*args, **kwargs)
|
||||||
graph.replay()
|
graph.replay()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
|
|
||||||
avg_time_ms = ((end_time - start_time) / trials) * 1000
|
avg_time_ms = ((end_time - start_time) / trials) * 1000
|
||||||
|
|||||||
@@ -224,7 +224,7 @@ def bench_run(
|
|||||||
def replay_graph(graph, num_repeats):
|
def replay_graph(graph, num_repeats):
|
||||||
for _ in range(num_repeats):
|
for _ in range(num_repeats):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
cutlass_stream = torch.cuda.Stream()
|
cutlass_stream = torch.cuda.Stream()
|
||||||
cutlass_graph = torch.cuda.CUDAGraph()
|
cutlass_graph = torch.cuda.CUDAGraph()
|
||||||
@@ -239,7 +239,7 @@ def bench_run(
|
|||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
triton_stream = torch.cuda.Stream()
|
triton_stream = torch.cuda.Stream()
|
||||||
triton_graph = torch.cuda.CUDAGraph()
|
triton_graph = torch.cuda.CUDAGraph()
|
||||||
@@ -254,7 +254,7 @@ def bench_run(
|
|||||||
w2_scale,
|
w2_scale,
|
||||||
a_scale,
|
a_scale,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
min_run_time = 5
|
min_run_time = 5
|
||||||
num_warmup = 5
|
num_warmup = 5
|
||||||
|
|||||||
@@ -34,14 +34,14 @@ def main(
|
|||||||
residual = torch.randn_like(x) * scale if add_residual else None
|
residual = torch.randn_like(x) * scale if add_residual else None
|
||||||
|
|
||||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
if profile:
|
if profile:
|
||||||
torch.cuda.cudart().cudaProfilerStart()
|
torch.cuda.cudart().cudaProfilerStart()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
for _ in range(num_iters):
|
for _ in range(num_iters):
|
||||||
layer(x, residual)
|
layer(x, residual)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
if profile:
|
if profile:
|
||||||
|
|||||||
@@ -1035,7 +1035,7 @@ def bench_optype(
|
|||||||
# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
|
# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
|
||||||
for kwargs in kwargs_list:
|
for kwargs in kwargs_list:
|
||||||
op_type.bench_fn()(**kwargs)
|
op_type.bench_fn()(**kwargs)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Merge into a single kwargs and qualify arguments as ArgPool
|
# Merge into a single kwargs and qualify arguments as ArgPool
|
||||||
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
||||||
|
|||||||
@@ -47,13 +47,13 @@ def benchmark_method(
|
|||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(num_warmup):
|
for _ in range(num_warmup):
|
||||||
_ = method(k_nope, k_pe)
|
_ = method(k_nope, k_pe)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Benchmark
|
# Benchmark
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
for _ in range(num_iters):
|
for _ in range(num_iters):
|
||||||
_ = method(k_nope, k_pe)
|
_ = method(k_nope, k_pe)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
|
|
||||||
return (end - start) / num_iters * 1000 # Convert to ms
|
return (end - start) / num_iters * 1000 # Convert to ms
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ def clear_triton_cache():
|
|||||||
|
|
||||||
# Clear CUDA memory cache
|
# Clear CUDA memory cache
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
torch.cuda.empty_cache()
|
torch.accelerator.empty_cache()
|
||||||
|
|
||||||
# Try to clear Triton's runtime cache
|
# Try to clear Triton's runtime cache
|
||||||
try:
|
try:
|
||||||
@@ -304,19 +304,19 @@ def benchmark_config(
|
|||||||
|
|
||||||
# JIT compilation & warmup
|
# JIT compilation & warmup
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Capture 10 invocations with CUDA graph
|
# Capture 10 invocations with CUDA graph
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(graph):
|
with torch.cuda.graph(graph):
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
@@ -324,7 +324,7 @@ def benchmark_config(
|
|||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
prepare(i)
|
prepare(i)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event.record()
|
start_event.record()
|
||||||
graph.replay()
|
graph.replay()
|
||||||
|
|||||||
@@ -131,7 +131,7 @@ def benchmark_config(
|
|||||||
topk_ids,
|
topk_ids,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Benchmark
|
# Benchmark
|
||||||
start = torch.cuda.Event(enable_timing=True)
|
start = torch.cuda.Event(enable_timing=True)
|
||||||
@@ -149,7 +149,7 @@ def benchmark_config(
|
|||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
end.record()
|
end.record()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
|
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -69,19 +69,19 @@ def benchmark_permute(
|
|||||||
|
|
||||||
# JIT compilation & warmup
|
# JIT compilation & warmup
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Capture 10 invocations with CUDA graph
|
# Capture 10 invocations with CUDA graph
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(graph):
|
with torch.cuda.graph(graph):
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
@@ -89,7 +89,7 @@ def benchmark_permute(
|
|||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
prepare(i)
|
prepare(i)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event.record()
|
start_event.record()
|
||||||
graph.replay()
|
graph.replay()
|
||||||
@@ -159,26 +159,26 @@ def benchmark_unpermute(
|
|||||||
# JIT compilation & warmup
|
# JIT compilation & warmup
|
||||||
input = prepare()
|
input = prepare()
|
||||||
run(input)
|
run(input)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Capture 10 invocations with CUDA graph
|
# Capture 10 invocations with CUDA graph
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(graph):
|
with torch.cuda.graph(graph):
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
run(input)
|
run(input)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
|
|
||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_event.record()
|
start_event.record()
|
||||||
graph.replay()
|
graph.replay()
|
||||||
end_event.record()
|
end_event.record()
|
||||||
|
|||||||
@@ -135,14 +135,14 @@ def benchmark_mrope(
|
|||||||
key.clone(),
|
key.clone(),
|
||||||
)
|
)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Time reference implementation
|
# Time reference implementation
|
||||||
torch_times = []
|
torch_times = []
|
||||||
for _ in range(benchmark_iter):
|
for _ in range(benchmark_iter):
|
||||||
query_clone = query.clone()
|
query_clone = query.clone()
|
||||||
key_clone = key.clone()
|
key_clone = key.clone()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
mrope_helper_class.forward_native(
|
mrope_helper_class.forward_native(
|
||||||
@@ -151,7 +151,7 @@ def benchmark_mrope(
|
|||||||
key_clone,
|
key_clone,
|
||||||
)
|
)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
torch_times.append(time.time() - start_time)
|
torch_times.append(time.time() - start_time)
|
||||||
|
|
||||||
# Time triton kernel implementation
|
# Time triton kernel implementation
|
||||||
@@ -159,14 +159,14 @@ def benchmark_mrope(
|
|||||||
for _ in range(benchmark_iter):
|
for _ in range(benchmark_iter):
|
||||||
query_clone = query.clone()
|
query_clone = query.clone()
|
||||||
key_clone = key.clone()
|
key_clone = key.clone()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
mrope_helper_class.forward_cuda(
|
mrope_helper_class.forward_cuda(
|
||||||
positions,
|
positions,
|
||||||
query_clone,
|
query_clone,
|
||||||
key_clone,
|
key_clone,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
triton_times.append(time.time() - start_time)
|
triton_times.append(time.time() - start_time)
|
||||||
|
|
||||||
# Calculate statistics
|
# Calculate statistics
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ def main(
|
|||||||
max_logits = torch.empty_like(exp_sums)
|
max_logits = torch.empty_like(exp_sums)
|
||||||
|
|
||||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
if profile:
|
if profile:
|
||||||
torch.cuda.cudart().cudaProfilerStart()
|
torch.cuda.cudart().cudaProfilerStart()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
@@ -173,7 +173,7 @@ def main(
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid version: {version}")
|
raise ValueError(f"Invalid version: {version}")
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
if profile:
|
if profile:
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ def _time_cuda(
|
|||||||
# warmup
|
# warmup
|
||||||
for _ in range(warmup_iters):
|
for _ in range(warmup_iters):
|
||||||
fn()
|
fn()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start = torch.Event(enable_timing=True)
|
start = torch.Event(enable_timing=True)
|
||||||
end = torch.Event(enable_timing=True)
|
end = torch.Event(enable_timing=True)
|
||||||
@@ -37,7 +37,7 @@ def _time_cuda(
|
|||||||
for _ in range(bench_iters):
|
for _ in range(bench_iters):
|
||||||
fn()
|
fn()
|
||||||
end.record()
|
end.record()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
return start.elapsed_time(end) / bench_iters # ms/iter
|
return start.elapsed_time(end) / bench_iters # ms/iter
|
||||||
|
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ def main(
|
|||||||
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
|
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
|
||||||
|
|
||||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
if profile:
|
if profile:
|
||||||
torch.cuda.cudart().cudaProfilerStart()
|
torch.cuda.cudart().cudaProfilerStart()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
@@ -39,7 +39,7 @@ def main(
|
|||||||
ops.scaled_int8_quant(x, scale)
|
ops.scaled_int8_quant(x, scale)
|
||||||
else:
|
else:
|
||||||
ops.scaled_fp8_quant(x, scale)
|
ops.scaled_fp8_quant(x, scale)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
if profile:
|
if profile:
|
||||||
|
|||||||
@@ -84,16 +84,16 @@ def run_benchmark(
|
|||||||
g = torch.cuda.CUDAGraph()
|
g = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(g):
|
with torch.cuda.graph(g):
|
||||||
function_under_test()
|
function_under_test()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
function_under_test = lambda: g.replay()
|
function_under_test = lambda: g.replay()
|
||||||
|
|
||||||
def run_cuda_benchmark(n_iters: int) -> float:
|
def run_cuda_benchmark(n_iters: int) -> float:
|
||||||
nonlocal key, value, key_cache, value_cache, slot_mapping
|
nonlocal key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
for _ in range(n_iters):
|
for _ in range(n_iters):
|
||||||
function_under_test()
|
function_under_test()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
return (end - start) / n_iters
|
return (end - start) / n_iters
|
||||||
|
|
||||||
@@ -104,7 +104,7 @@ def run_benchmark(
|
|||||||
|
|
||||||
# free tensors to mitigate OOM when sweeping
|
# free tensors to mitigate OOM when sweeping
|
||||||
del key, value, key_cache, value_cache, slot_mapping
|
del key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.empty_cache()
|
torch.accelerator.empty_cache()
|
||||||
|
|
||||||
return lat
|
return lat
|
||||||
|
|
||||||
|
|||||||
@@ -109,16 +109,16 @@ def run_benchmark(
|
|||||||
g = torch.cuda.CUDAGraph()
|
g = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(g):
|
with torch.cuda.graph(g):
|
||||||
function_under_test()
|
function_under_test()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
function_under_test = lambda: g.replay()
|
function_under_test = lambda: g.replay()
|
||||||
|
|
||||||
def run_cuda_benchmark(n_iters: int) -> float:
|
def run_cuda_benchmark(n_iters: int) -> float:
|
||||||
nonlocal key, value, key_cache, value_cache, slot_mapping
|
nonlocal key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
for _ in range(n_iters):
|
for _ in range(n_iters):
|
||||||
function_under_test()
|
function_under_test()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
return (end - start) / n_iters
|
return (end - start) / n_iters
|
||||||
|
|
||||||
@@ -129,7 +129,7 @@ def run_benchmark(
|
|||||||
|
|
||||||
# free tensors to mitigate OOM when sweeping
|
# free tensors to mitigate OOM when sweeping
|
||||||
del key, value, key_cache, value_cache, slot_mapping
|
del key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.empty_cache()
|
torch.accelerator.empty_cache()
|
||||||
|
|
||||||
return lat
|
return lat
|
||||||
|
|
||||||
|
|||||||
@@ -251,7 +251,7 @@ def benchmark(
|
|||||||
kernel(
|
kernel(
|
||||||
y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
|
y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
@@ -259,7 +259,7 @@ def benchmark(
|
|||||||
# Benchmark
|
# Benchmark
|
||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for _ in range(runs):
|
for _ in range(runs):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event.record()
|
start_event.record()
|
||||||
for i in range(iterations_per_run):
|
for i in range(iterations_per_run):
|
||||||
|
|||||||
@@ -126,7 +126,7 @@ def benchmark_decode(
|
|||||||
)
|
)
|
||||||
|
|
||||||
def time_fn(fn, warmup=10, trials=20):
|
def time_fn(fn, warmup=10, trials=20):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = torch.Event(enable_timing=True)
|
start = torch.Event(enable_timing=True)
|
||||||
end = torch.Event(enable_timing=True)
|
end = torch.Event(enable_timing=True)
|
||||||
times = []
|
times = []
|
||||||
@@ -136,7 +136,7 @@ def benchmark_decode(
|
|||||||
start.record()
|
start.record()
|
||||||
fn()
|
fn()
|
||||||
end.record()
|
end.record()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
times.append(start.elapsed_time(end)) # ms
|
times.append(start.elapsed_time(end)) # ms
|
||||||
return sum(times) / len(times), torch.std(torch.tensor(times))
|
return sum(times) / len(times), torch.std(torch.tensor(times))
|
||||||
|
|
||||||
|
|||||||
@@ -138,7 +138,7 @@ def benchmark_prefill(
|
|||||||
)
|
)
|
||||||
|
|
||||||
def time_fn(fn, warmup=10, trials=20):
|
def time_fn(fn, warmup=10, trials=20):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = torch.Event(enable_timing=True)
|
start = torch.Event(enable_timing=True)
|
||||||
end = torch.Event(enable_timing=True)
|
end = torch.Event(enable_timing=True)
|
||||||
times = []
|
times = []
|
||||||
@@ -148,7 +148,7 @@ def benchmark_prefill(
|
|||||||
start.record()
|
start.record()
|
||||||
fn()
|
fn()
|
||||||
end.record()
|
end.record()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
times.append(start.elapsed_time(end)) # ms
|
times.append(start.elapsed_time(end)) # ms
|
||||||
return sum(times) / len(times), torch.std(torch.tensor(times))
|
return sum(times) / len(times), torch.std(torch.tensor(times))
|
||||||
|
|
||||||
|
|||||||
@@ -177,18 +177,18 @@ def benchmark_config(
|
|||||||
def run():
|
def run():
|
||||||
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
|
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
# JIT complication & warmup
|
# JIT complication & warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
|
|
||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_event.record()
|
start_event.record()
|
||||||
run()
|
run()
|
||||||
end_event.record()
|
end_event.record()
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ def benchmark_shape(
|
|||||||
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
|
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
|
||||||
|
|
||||||
# Reference result in BF16
|
# Reference result in BF16
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
C_ref = A @ B.t()
|
C_ref = A @ B.t()
|
||||||
|
|
||||||
# Pre-quantize B for all implementations
|
# Pre-quantize B for all implementations
|
||||||
@@ -121,14 +121,14 @@ def benchmark_shape(
|
|||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(warmup):
|
for _ in range(warmup):
|
||||||
func()
|
func()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Timing loop
|
# Timing loop
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = time.time()
|
start = time.time()
|
||||||
for _ in range(repeat):
|
for _ in range(repeat):
|
||||||
func()
|
func()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end = time.time()
|
end = time.time()
|
||||||
|
|
||||||
# Calculate timing and TFLOPS
|
# Calculate timing and TFLOPS
|
||||||
|
|||||||
@@ -242,6 +242,16 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
|
|||||||
)
|
)
|
||||||
else()
|
else()
|
||||||
message(STATUS "Downloading oneDNN from GitHub")
|
message(STATUS "Downloading oneDNN from GitHub")
|
||||||
|
if(ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
|
||||||
|
message(STATUS "aarch64 detected: using pinned oneDNN commit 9c5be1cc59e368aebf0909e6cf20f981ea61462a")
|
||||||
|
FetchContent_Declare(
|
||||||
|
oneDNN
|
||||||
|
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||||
|
GIT_TAG 9c5be1cc59e368aebf0909e6cf20f981ea61462a
|
||||||
|
GIT_PROGRESS TRUE
|
||||||
|
GIT_SHALLOW FALSE
|
||||||
|
)
|
||||||
|
else()
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
oneDNN
|
oneDNN
|
||||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||||
@@ -250,6 +260,7 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
|
|||||||
GIT_SHALLOW TRUE
|
GIT_SHALLOW TRUE
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
||||||
set(ONEDNN_BUILD_DOC "OFF")
|
set(ONEDNN_BUILD_DOC "OFF")
|
||||||
|
|||||||
@@ -74,6 +74,12 @@ void indexer_k_quant_and_cache(
|
|||||||
int64_t quant_block_size, // quantization block size
|
int64_t quant_block_size, // quantization block size
|
||||||
const std::string& scale_fmt);
|
const std::string& scale_fmt);
|
||||||
|
|
||||||
|
// Concatenate query nope and rope for MLA/DSA attention
|
||||||
|
void concat_mla_q(
|
||||||
|
torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
|
||||||
|
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
|
||||||
|
torch::Tensor& q_out); // [num_tokens, num_heads, nope_dim + rope_dim]
|
||||||
|
|
||||||
// Extract function to gather quantized K cache
|
// Extract function to gather quantized K cache
|
||||||
void cp_gather_indexer_k_quant_cache(
|
void cp_gather_indexer_k_quant_cache(
|
||||||
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
|
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
#include "cuda_compat.h"
|
#include "cuda_compat.h"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
#include "quantization/vectorization_utils.cuh"
|
#include "quantization/vectorization_utils.cuh"
|
||||||
|
#include "concat_mla_q.cuh"
|
||||||
|
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
|
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
|
||||||
@@ -995,75 +996,67 @@ namespace vllm {
|
|||||||
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
|
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
|
||||||
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
|
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
|
||||||
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
||||||
__nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576]
|
__nv_bfloat16* __restrict__ dst, // [total_tokens, 576]
|
||||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
const int32_t* __restrict__ block_table, // [num_reqs, BLOCK_INDICES]
|
||||||
const int32_t* __restrict__ seq_lens, // [BATCH]
|
const int32_t* __restrict__ workspace_starts, // [num_reqs]
|
||||||
const int32_t* __restrict__ workspace_starts, // [BATCH]
|
const int32_t num_reqs, const int32_t block_size,
|
||||||
const int32_t block_size, const int32_t head_dim,
|
const int32_t total_tokens, const int64_t block_table_stride,
|
||||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
const int64_t cache_block_stride, const int64_t cache_entry_stride,
|
||||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
|
const int64_t dst_entry_stride) {
|
||||||
const int64_t bid = blockIdx.x; // Batch ID
|
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
|
||||||
const int32_t num_splits = gridDim.y;
|
if (flat_warp_id >= total_tokens) return;
|
||||||
const int32_t split = blockIdx.y;
|
const int lane_id = threadIdx.x & 31;
|
||||||
const int32_t seq_start = workspace_starts[bid];
|
|
||||||
const int32_t seq_len = seq_lens[bid];
|
|
||||||
const int32_t tot_slots = seq_len;
|
|
||||||
const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
|
|
||||||
|
|
||||||
const int32_t split_start = split * split_slots;
|
// Binary search to find which request owns this output token
|
||||||
const int32_t split_end = min((split + 1) * split_slots, tot_slots);
|
int lo = 0, hi = num_reqs - 1;
|
||||||
|
while (lo < hi) {
|
||||||
|
int mid = (lo + hi + 1) >> 1;
|
||||||
|
if (workspace_starts[mid] <= flat_warp_id)
|
||||||
|
lo = mid;
|
||||||
|
else
|
||||||
|
hi = mid - 1;
|
||||||
|
}
|
||||||
|
const int req_id = lo;
|
||||||
|
|
||||||
const bool is_active_split = (split_start < tot_slots);
|
// Compute physical token address via block table
|
||||||
|
const int out_token_id = flat_warp_id;
|
||||||
|
const int token_offset = out_token_id - workspace_starts[req_id];
|
||||||
|
const int cache_block_idx = token_offset / block_size;
|
||||||
|
const int offset_in_block = token_offset % block_size;
|
||||||
|
const int physical_block =
|
||||||
|
block_table[req_id * block_table_stride + cache_block_idx];
|
||||||
|
|
||||||
if (!is_active_split) return;
|
const uint8_t* token_ptr = src_cache + physical_block * cache_block_stride +
|
||||||
|
offset_in_block * cache_entry_stride;
|
||||||
|
|
||||||
// Adjust the pointer for the block_table for this batch
|
const int4* nope_src = reinterpret_cast<const int4*>(token_ptr);
|
||||||
const int32_t batch_offset = bid * block_table_stride;
|
const int4 fp8_data = nope_src[lane_id];
|
||||||
int32_t offset = split_start;
|
|
||||||
int32_t offset_div = offset / block_size;
|
|
||||||
offset = offset % block_size;
|
|
||||||
const int32_t* batch_block_table = block_table + batch_offset;
|
|
||||||
|
|
||||||
// Adjust dst pointer based on the cumulative sequence lengths
|
|
||||||
dst += seq_start * dst_entry_stride;
|
|
||||||
|
|
||||||
const int tid = threadIdx.x;
|
|
||||||
|
|
||||||
// Process each token in this split
|
|
||||||
for (int pid = split_start; pid < split_end; ++pid) {
|
|
||||||
auto block_id = batch_block_table[offset_div];
|
|
||||||
const uint8_t* token_ptr =
|
|
||||||
src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
|
|
||||||
__nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
|
|
||||||
|
|
||||||
// FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
|
|
||||||
const uint8_t* no_pe_ptr = token_ptr;
|
|
||||||
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
|
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
|
||||||
const __nv_bfloat16* rope_ptr =
|
const float scale = scales_ptr[lane_id >> 3];
|
||||||
reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
|
|
||||||
|
|
||||||
// Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
|
const uint2 fp8_lo = make_uint2(fp8_data.x, fp8_data.y);
|
||||||
if (tid < 512) {
|
const uint2 fp8_hi = make_uint2(fp8_data.z, fp8_data.w);
|
||||||
// FP8 dequantization
|
#ifdef USE_ROCM
|
||||||
const int tile = tid >> 7; // each tile is 128 elements
|
const bf16_8_t bf16_lo =
|
||||||
const float scale = scales_ptr[tile];
|
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale);
|
||||||
const uint8_t val = no_pe_ptr[tid];
|
const bf16_8_t bf16_hi =
|
||||||
dst_ptr[tid] =
|
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale);
|
||||||
fp8::scaled_convert<__nv_bfloat16, uint8_t,
|
#else
|
||||||
vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
|
const bf16_8_t bf16_lo =
|
||||||
} else if (tid < 576) {
|
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale, __NV_E4M3);
|
||||||
// Rope copy (64 bf16 elements)
|
const bf16_8_t bf16_hi =
|
||||||
const int rope_idx = tid - 512;
|
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale, __NV_E4M3);
|
||||||
dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
|
#endif
|
||||||
}
|
|
||||||
|
|
||||||
// Move to next token
|
__nv_bfloat16* dst_ptr = dst + out_token_id * dst_entry_stride;
|
||||||
offset += 1;
|
int4* nope_dst = reinterpret_cast<int4*>(dst_ptr) + lane_id * 2;
|
||||||
if (offset == block_size) {
|
nope_dst[0] = *reinterpret_cast<const int4*>(&bf16_lo);
|
||||||
offset_div += 1;
|
nope_dst[1] = *reinterpret_cast<const int4*>(&bf16_hi);
|
||||||
offset = 0;
|
|
||||||
}
|
const int* rope_src = reinterpret_cast<const int*>(token_ptr + 528);
|
||||||
}
|
int* rope_dst = reinterpret_cast<int*>(dst_ptr + 512);
|
||||||
|
rope_dst[lane_id] = rope_src[lane_id];
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename scalar_t>
|
template <typename scalar_t>
|
||||||
@@ -1257,15 +1250,16 @@ void cp_gather_and_upconvert_fp8_kv_cache(
|
|||||||
src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
|
src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Decide on the number of splits based on the batch size
|
const int total_tokens = dst.size(0);
|
||||||
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
|
constexpr int warps_per_block = 8;
|
||||||
dim3 grid(batch_size, num_splits);
|
const int grid_size = (total_tokens + warps_per_block - 1) / warps_per_block;
|
||||||
dim3 block(576);
|
const int block_size_threads = warps_per_block * 32; // 256 threads
|
||||||
|
|
||||||
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
|
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid_size, block_size_threads, 0,
|
||||||
|
stream>>>(
|
||||||
src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
|
src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
|
||||||
block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
|
block_table.data_ptr<int32_t>(), workspace_starts.data_ptr<int32_t>(),
|
||||||
workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
|
static_cast<int32_t>(batch_size), block_size, total_tokens,
|
||||||
block_table_stride, cache_block_stride, cache_entry_stride,
|
block_table_stride, cache_block_stride, cache_entry_stride,
|
||||||
dst_entry_stride);
|
dst_entry_stride);
|
||||||
}
|
}
|
||||||
@@ -1365,3 +1359,43 @@ void cp_gather_indexer_k_quant_cache(
|
|||||||
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
|
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Concatenate ql_nope and q_pe into a contiguous q_out tensor for MLA/DSA.
|
||||||
|
// Replaces torch.cat((ql_nope, q_pe), dim=-1).
|
||||||
|
void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
|
||||||
|
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
|
||||||
|
torch::Tensor& q_out // [num_tokens, num_heads, nope_dim +
|
||||||
|
// rope_dim]
|
||||||
|
) {
|
||||||
|
const int num_tokens = ql_nope.size(0);
|
||||||
|
const int num_heads = ql_nope.size(1);
|
||||||
|
const int nope_dim = ql_nope.size(2);
|
||||||
|
const int rope_dim = q_pe.size(2);
|
||||||
|
|
||||||
|
TORCH_CHECK(nope_dim % 512 == 0, "nope_dim must be a multiple of 512, got ",
|
||||||
|
nope_dim);
|
||||||
|
TORCH_CHECK(rope_dim == 64, "rope_dim must be 64, got ", rope_dim);
|
||||||
|
TORCH_CHECK(q_out.size(2) == nope_dim + rope_dim);
|
||||||
|
|
||||||
|
TORCH_CHECK(ql_nope.stride(2) == 1, "ql_nope must have stride 1 in dim 2");
|
||||||
|
TORCH_CHECK(q_pe.stride(2) == 1, "q_pe must have stride 1 in dim 2");
|
||||||
|
TORCH_CHECK(q_out.stride(2) == 1, "q_out must have stride 1 in dim 2");
|
||||||
|
|
||||||
|
if (num_tokens == 0) return;
|
||||||
|
|
||||||
|
constexpr int warps_per_block = 8;
|
||||||
|
const int total_warps = num_tokens * num_heads;
|
||||||
|
const int grid_size = (total_warps + warps_per_block - 1) / warps_per_block;
|
||||||
|
const int block_size = warps_per_block * 32;
|
||||||
|
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(ql_nope));
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] {
|
||||||
|
vllm::ConcatMLAQKernel<scalar_t, 512><<<grid_size, block_size, 0, stream>>>(
|
||||||
|
q_out.data_ptr<scalar_t>(), ql_nope.data_ptr<scalar_t>(),
|
||||||
|
q_pe.data_ptr<scalar_t>(), num_tokens, num_heads, q_out.stride(0),
|
||||||
|
q_out.stride(1), ql_nope.stride(0), ql_nope.stride(1), q_pe.stride(0),
|
||||||
|
q_pe.stride(1));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|||||||
60
csrc/concat_mla_q.cuh
Normal file
60
csrc/concat_mla_q.cuh
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
#ifndef CONCAT_MLA_Q_CUH_
|
||||||
|
#define CONCAT_MLA_Q_CUH_
|
||||||
|
|
||||||
|
#include <cuda_bf16.h>
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
|
||||||
|
#include "cuda_vec_utils.cuh"
|
||||||
|
|
||||||
|
namespace vllm {
|
||||||
|
|
||||||
|
// Concatenates ql_nope [num_tokens, num_heads, NOPE_DIM] and
|
||||||
|
// q_pe [num_tokens, num_heads, 64]
|
||||||
|
// into q_out [num_tokens, num_heads, NOPE_DIM+64].
|
||||||
|
// Currently instantiated only for NOPE_DIM=512.
|
||||||
|
// Rope dim is hardcoded to 64 (DeepSeek V3.2 MLA)
|
||||||
|
template <typename DType, int NOPE_DIM>
|
||||||
|
__global__ void ConcatMLAQKernel(
|
||||||
|
DType* __restrict__ q_out, const DType* __restrict__ ql_nope,
|
||||||
|
const DType* __restrict__ q_pe, const int num_tokens, const int num_heads,
|
||||||
|
const int64_t out_stride_0, const int64_t out_stride_1,
|
||||||
|
const int64_t nope_stride_0, const int64_t nope_stride_1,
|
||||||
|
const int64_t pe_stride_0, const int64_t pe_stride_1) {
|
||||||
|
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
|
||||||
|
if (flat_warp_id >= num_tokens * num_heads) return;
|
||||||
|
|
||||||
|
const int token_id = flat_warp_id / num_heads;
|
||||||
|
const int head_id = flat_warp_id % num_heads;
|
||||||
|
const int lane_id = threadIdx.x & 31;
|
||||||
|
|
||||||
|
constexpr bool use_256b = VLLM_256B_PTX_ENABLED;
|
||||||
|
constexpr int nope_vec_loads =
|
||||||
|
NOPE_DIM * sizeof(DType) / (VecTraits<use_256b>::ARCH_MAX_VEC_SIZE * 32);
|
||||||
|
|
||||||
|
const DType* nope_src =
|
||||||
|
ql_nope + token_id * nope_stride_0 + head_id * nope_stride_1;
|
||||||
|
DType* nope_dst = q_out + token_id * out_stride_0 + head_id * out_stride_1;
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < nope_vec_loads; i++) {
|
||||||
|
const int offset = i * 32 + lane_id;
|
||||||
|
if constexpr (use_256b) {
|
||||||
|
st256_cs(reinterpret_cast<u32x8_t*>(nope_dst) + offset,
|
||||||
|
ld256_cs(reinterpret_cast<const u32x8_t*>(nope_src) + offset));
|
||||||
|
} else {
|
||||||
|
st128_cs(reinterpret_cast<int4*>(nope_dst) + offset,
|
||||||
|
ld128_cs(reinterpret_cast<const int4*>(nope_src) + offset));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const int* rope_src = reinterpret_cast<const int*>(
|
||||||
|
q_pe + token_id * pe_stride_0 + head_id * pe_stride_1);
|
||||||
|
int* rope_dst = reinterpret_cast<int*>(q_out + token_id * out_stride_0 +
|
||||||
|
head_id * out_stride_1 + NOPE_DIM);
|
||||||
|
|
||||||
|
st32_cs(rope_dst + lane_id, ld32_cs(rope_src + lane_id));
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vllm
|
||||||
|
|
||||||
|
#endif // CONCAT_MLA_Q_CUH_
|
||||||
@@ -420,7 +420,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
|
|||||||
const int64_t block_size, const int64_t block_size_stride) {
|
const int64_t block_size, const int64_t block_size_stride) {
|
||||||
// For AMX 2D tiles, size of each line is 64 bytes
|
// For AMX 2D tiles, size of each line is 64 bytes
|
||||||
constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
|
constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
|
||||||
// For AMX B martix, N always is 16
|
// For AMX B matrix, N always is 16
|
||||||
constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
|
constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
|
||||||
constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
|
constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
|
||||||
// For now suppose block_size is divisible by amx_tile_column_num
|
// For now suppose block_size is divisible by amx_tile_column_num
|
||||||
|
|||||||
@@ -237,13 +237,10 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
|
|||||||
};
|
};
|
||||||
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||||
{b_k_stride_, b_n_stride_});
|
{b_k_stride_, b_n_stride_});
|
||||||
#ifdef __aarch64__
|
|
||||||
// dummy M size for prepacking weights
|
// dummy M size for prepacking weights
|
||||||
// Prepacking weights improves performance and avoid runtime reorders
|
// Prepacking weights improves performance and avoid runtime reorders
|
||||||
constexpr dnnl_dim_t kProbeM = 128;
|
constexpr dnnl_dim_t kProbeM = 128;
|
||||||
#else
|
|
||||||
constexpr dnnl_dim_t kProbeM = DNNL_RUNTIME_DIM_VAL;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
prepack_weight(args.b_ptr, original_b_md,
|
prepack_weight(args.b_ptr, original_b_md,
|
||||||
create_primitive_desc(
|
create_primitive_desc(
|
||||||
@@ -411,19 +408,17 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
|
|||||||
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||||
{b_k_stride_, b_n_stride_});
|
{b_k_stride_, b_n_stride_});
|
||||||
|
|
||||||
|
// dummy M size for prepacking weights
|
||||||
|
// Prepacking weights improves performance and avoid runtime reorders
|
||||||
|
constexpr dnnl_dim_t kProbeM = 128;
|
||||||
|
|
||||||
prepack_weight(args.b_ptr, original_b_md,
|
prepack_weight(args.b_ptr, original_b_md,
|
||||||
create_primitive_desc(
|
create_primitive_desc(
|
||||||
MSizeCacheKey{
|
MSizeCacheKey{// Use a concrete M so oneDNN's kernel
|
||||||
#ifdef VLLM_USE_ACL
|
// selector can choose an optimally blocked
|
||||||
// Arm Compute Library (ACL) backend for oneDNN does
|
// weight layout.
|
||||||
// not support runtime
|
.a_m_size = kProbeM,
|
||||||
// dimensions, so we set M to a default value
|
|
||||||
.a_m_size = 128,
|
|
||||||
.a_m_stride = b_k_size_,
|
.a_m_stride = b_k_size_,
|
||||||
#else
|
|
||||||
.a_m_size = DNNL_RUNTIME_DIM_VAL,
|
|
||||||
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
|
|
||||||
#endif
|
|
||||||
.use_bias = false,
|
.use_bias = false,
|
||||||
.bias_type = dnnl::memory::data_type::undef},
|
.bias_type = dnnl::memory::data_type::undef},
|
||||||
true)
|
true)
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
#include <torch/library.h>
|
#include <torch/library.h>
|
||||||
|
|
||||||
// Note: overwrite the external defination for sharing same name between
|
// Note: overwrite the external definition for sharing same name between
|
||||||
// libraries use different ISAs.
|
// libraries use different ISAs.
|
||||||
#define TORCH_EXTENSION_NAME _C
|
#define TORCH_EXTENSION_NAME _C
|
||||||
|
|
||||||
|
|||||||
@@ -196,7 +196,6 @@ __forceinline__ __device__ u32x8_t ld256_cs(const u32x8_t* addr) {
|
|||||||
return val;
|
return val;
|
||||||
#else
|
#else
|
||||||
assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
|
assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
|
||||||
return {};
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -211,23 +210,51 @@ __forceinline__ __device__ void st256_cs(u32x8_t* addr, u32x8_t val) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// 32-bit cache-streaming (.cs) load / store — SM100+ only.
|
// 32-bit load / store.
|
||||||
|
__device__ __forceinline__ int ld32(const int* addr) { return __ldg(addr); }
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st32(int* addr, int val) { *addr = val; }
|
||||||
|
|
||||||
|
// 32-bit cache-streaming (.cs) load / store.
|
||||||
|
// Falls back to ld32/st32 on ROCm (no .cs hint).
|
||||||
__forceinline__ __device__ int ld32_cs(const int* addr) {
|
__forceinline__ __device__ int ld32_cs(const int* addr) {
|
||||||
#if VLLM_256B_PTX_ENABLED
|
|
||||||
int val;
|
int val;
|
||||||
|
#ifndef USE_ROCM
|
||||||
asm volatile("ld.global.cs.b32 %0, [%1];" : "=r"(val) : "l"(addr));
|
asm volatile("ld.global.cs.b32 %0, [%1];" : "=r"(val) : "l"(addr));
|
||||||
return val;
|
|
||||||
#else
|
#else
|
||||||
assert(false && "ld32_cs requires SM100+ with CUDA 12.9+");
|
val = ld32(addr);
|
||||||
return 0;
|
|
||||||
#endif
|
#endif
|
||||||
|
return val;
|
||||||
}
|
}
|
||||||
|
|
||||||
__forceinline__ __device__ void st32_cs(int* addr, int val) {
|
__forceinline__ __device__ void st32_cs(int* addr, int val) {
|
||||||
#if VLLM_256B_PTX_ENABLED
|
#ifndef USE_ROCM
|
||||||
asm volatile("st.global.cs.b32 [%0], %1;" ::"l"(addr), "r"(val));
|
asm volatile("st.global.cs.b32 [%0], %1;" ::"l"(addr), "r"(val));
|
||||||
#else
|
#else
|
||||||
assert(false && "st32_cs requires SM100+ with CUDA 12.9+");
|
st32(addr, val);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// 128-bit cache-streaming (.cs) load / store.
|
||||||
|
// Falls back to ld128/st128 on ROCm (no .cs hint).
|
||||||
|
__forceinline__ __device__ int4 ld128_cs(const int4* addr) {
|
||||||
|
int4 val;
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
asm volatile("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];"
|
||||||
|
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
|
||||||
|
: "l"(addr));
|
||||||
|
#else
|
||||||
|
ld128(val, addr);
|
||||||
|
#endif
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ void st128_cs(int4* addr, int4 val) {
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
asm volatile("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(addr),
|
||||||
|
"r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
|
||||||
|
#else
|
||||||
|
st128(val, addr);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -260,7 +287,7 @@ __device__ __forceinline__ void ld256_cg_or_zero(u32x8_t& val, const void* ptr,
|
|||||||
|
|
||||||
__device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
|
__device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
|
||||||
bool pred) {
|
bool pred) {
|
||||||
#if VLLM_256B_PTX_ENABLED
|
#ifndef USE_ROCM
|
||||||
uint32_t r0, r1, r2, r3;
|
uint32_t r0, r1, r2, r3;
|
||||||
|
|
||||||
asm volatile(
|
asm volatile(
|
||||||
@@ -278,7 +305,7 @@ __device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
|
|||||||
|
|
||||||
val = uint4{r0, r1, r2, r3};
|
val = uint4{r0, r1, r2, r3};
|
||||||
#else
|
#else
|
||||||
assert(false && "ld128_cg_or_zero requires SM100+ with CUDA 12.9+");
|
assert(false && "ld128_cg_or_zero is not supported on ROCm");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -35,11 +35,11 @@ __global__ void batched_moe_align_block_size_kernel(
|
|||||||
int32_t const block_ids_size = sorted_ids_size / block_size;
|
int32_t const block_ids_size = sorted_ids_size / block_size;
|
||||||
int32_t const SENTINEL =
|
int32_t const SENTINEL =
|
||||||
num_batches * max_tokens_per_batch; // To denote invalid entries.
|
num_batches * max_tokens_per_batch; // To denote invalid entries.
|
||||||
// Intialize sorted_ids
|
// Initialize sorted_ids
|
||||||
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
|
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
|
||||||
sorted_ids[i] = SENTINEL;
|
sorted_ids[i] = SENTINEL;
|
||||||
}
|
}
|
||||||
// Intialize expert_ids with -1
|
// Initialize expert_ids with -1
|
||||||
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
|
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
|
||||||
block_ids[i] = -1;
|
block_ids[i] = -1;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -542,7 +542,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
|
|||||||
if (!lane_id) {
|
if (!lane_id) {
|
||||||
// Store scales.
|
// Store scales.
|
||||||
if constexpr (std::is_same<scale_t, uint8_t>::value) {
|
if constexpr (std::is_same<scale_t, uint8_t>::value) {
|
||||||
// Packed UE8MO format. Remove Mantissa.
|
// Packed UE8M0 format. Remove Mantissa.
|
||||||
*y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
|
*y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
|
||||||
|
|
||||||
bool const jump_pack = (current_group_id + 1) % 4 == 0;
|
bool const jump_pack = (current_group_id + 1) % 4 == 0;
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
#include "../cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
#include "quantization/w8a8/fp8/common.cuh"
|
#include "quantization/w8a8/fp8/common.cuh"
|
||||||
|
#include "core/batch_invariant.hpp"
|
||||||
|
|
||||||
// TODO(rasmith): The kernels in this file are susceptible to integer overflow
|
// TODO(rasmith): The kernels in this file are susceptible to integer overflow
|
||||||
// issues, do not take strides, and are unable to handle PyTorch tensors that
|
// issues, do not take strides, and are unable to handle PyTorch tensors that
|
||||||
@@ -1224,17 +1225,14 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
#if defined(__gfx950__)
|
#if defined(__gfx950__)
|
||||||
#define WVSPLITKRC_1KPASS
|
#define WVSPLITKRC_1KPASS
|
||||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||||
int UNRL, int N, int GrpsShrB, int CHUNKK>
|
int UNRL, int N, int GrpsShrB, int CHUNKK, int DTRMNSTC>
|
||||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||||
__attribute__((amdgpu_waves_per_eu(1, 1)))
|
__attribute__((amdgpu_waves_per_eu(1, 1)))
|
||||||
wvSplitKrc_(const int actlN, const int K, const int M, const int Bx,
|
wvSplitKrc_(const int actlN, const int K, const int Kap, const int M,
|
||||||
const int By, const scalar_t* __restrict__ B,
|
const int Bx, const int By, const scalar_t* __restrict__ A,
|
||||||
const scalar_t* __restrict__ A,
|
const scalar_t* __restrict__ B,
|
||||||
const scalar_t* __restrict__ BIAS, float* glbl, scalar_t* C,
|
const scalar_t* __restrict__ BIAS, float* glbl, int* cntr,
|
||||||
const int CuCount) {
|
scalar_t* C, const int CuCount) {
|
||||||
// Use upper half of glbl buffer for atomic reduce counting
|
|
||||||
int* cntr = (int*)(&glbl[M * N]);
|
|
||||||
|
|
||||||
constexpr int NTILE = 16;
|
constexpr int NTILE = 16;
|
||||||
constexpr int APAD = 1;
|
constexpr int APAD = 1;
|
||||||
constexpr int ASTRD = 64;
|
constexpr int ASTRD = 64;
|
||||||
@@ -1425,10 +1423,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
unsigned int kOffcp = min__(K - A_CHUNK, k_str + kOff);
|
unsigned int kOffcp = min__(K - A_CHUNK, k_str + kOff);
|
||||||
for (unsigned int n = 0; n < N; n += CHUNKK * sprdN) {
|
for (unsigned int n = 0; n < N; n += CHUNKK * sprdN) {
|
||||||
__builtin_amdgcn_global_load_lds(
|
__builtin_amdgcn_global_load_lds(
|
||||||
(int*)(&A[min__(
|
(int*)(&A[min__(Kap * actlN - A_CHUNK,
|
||||||
K * actlN - A_CHUNK,
|
kOffcp + Kap * (n / CHUNKK +
|
||||||
kOffcp + K * (n / CHUNKK +
|
(N / CHUNKK) * (threadIdx.x /
|
||||||
(N / CHUNKK) * (threadIdx.x / (64 / CHUNKK)) +
|
(64 / CHUNKK)) +
|
||||||
(threadIdx.y % sprdN)))]),
|
(threadIdx.y % sprdN)))]),
|
||||||
(int*)(&s[(k +
|
(int*)(&s[(k +
|
||||||
kFitPdd * ((n / CHUNKK) + (threadIdx.y % sprdN)))]),
|
kFitPdd * ((n / CHUNKK) + (threadIdx.y % sprdN)))]),
|
||||||
@@ -1476,7 +1474,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// B[] staging is cooperative across GrpsShrB, so sync here before reading
|
// B[] staging is cooperative across GrpsShrB, so sync here before reading
|
||||||
// back. This wait is currently inserted by compiler, but not gauranteed.
|
// back. This wait is currently inserted by compiler, but not guaranteed.
|
||||||
asm volatile("s_waitcnt 0");
|
asm volatile("s_waitcnt 0");
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
@@ -1533,30 +1531,66 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
union flt4 {
|
||||||
|
scalar8 s8;
|
||||||
|
float2 f2[2];
|
||||||
|
float4 f4;
|
||||||
|
};
|
||||||
if (m + (threadIdx.x % 16) < M) {
|
if (m + (threadIdx.x % 16) < M) {
|
||||||
int my_cntr;
|
int my_cntr;
|
||||||
int mindx = m + (threadIdx.x % 16);
|
int mindx = m + (threadIdx.x % 16);
|
||||||
int g_mindx = m * 4 + (threadIdx.x % 64); // coalesced atomic reduction
|
int g_mindx = m * 4 + (threadIdx.x % 64); // coalesced atomic reduction
|
||||||
scalar_t biases[N / NTILE / GrpsShrB][4] = {};
|
scalar_t biases[N / NTILE / GrpsShrB][4] = {};
|
||||||
// Atomic add the output, read biases
|
// Atomic add the output, read biases
|
||||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
|
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||||
for (uint32_t j = 0; j < 4; j++) {
|
|
||||||
// int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
|
||||||
// (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
|
||||||
// int adr = mindx + M * nindx;
|
|
||||||
int g_nindx =
|
int g_nindx =
|
||||||
j + (nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
|
(nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
|
||||||
int g_adr = g_mindx + M * g_nindx * 4;
|
int g_adr = g_mindx * 4 + 0 + M * g_nindx * 4;
|
||||||
atomicAdd(&glbl[g_adr], sum4[nt][0][j]);
|
if (DTRMNSTC) {
|
||||||
|
flt4 flt4_ = {.s8 = sum4[nt][0]};
|
||||||
|
__hip_atomic_store((float2*)&glbl[g_adr + M * N * (m0 / Mmod)],
|
||||||
|
flt4_.f2[0], __ATOMIC_RELAXED,
|
||||||
|
__HIP_MEMORY_SCOPE_AGENT);
|
||||||
|
__hip_atomic_store((float2*)&glbl[g_adr + 2 + M * N * (m0 / Mmod)],
|
||||||
|
flt4_.f2[1], __ATOMIC_RELAXED,
|
||||||
|
__HIP_MEMORY_SCOPE_AGENT);
|
||||||
|
} else {
|
||||||
|
for (uint32_t j = 0; j < 4; j++)
|
||||||
|
atomicAdd((&glbl[g_adr + j]), sum4[nt][0][j]);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__atomic_signal_fence(__ATOMIC_SEQ_CST);
|
||||||
|
asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
|
||||||
|
__atomic_signal_fence(__ATOMIC_SEQ_CST);
|
||||||
|
|
||||||
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
|
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
|
||||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||||
int adr_ = mindx + M * nindx_ / 4;
|
int adr_ = mindx + M * nindx_ / 4;
|
||||||
// Update the complete counter
|
|
||||||
my_cntr = atomicAdd(&cntr[adr_], 1);
|
my_cntr = atomicAdd(&cntr[adr_], 1);
|
||||||
float vals[N / NTILE / GrpsShrB][4] = {};
|
|
||||||
|
// make sure LDS is free for write out staging
|
||||||
|
if (DTRMNSTC) __syncthreads();
|
||||||
|
|
||||||
|
// Update the complete counter
|
||||||
|
flt4 vals[N / NTILE / GrpsShrB] = {};
|
||||||
// If we're the last k-shard, read back the value and convert...
|
// If we're the last k-shard, read back the value and convert...
|
||||||
if (my_cntr + 1 == k_rnd) {
|
if (my_cntr + 1 == k_rnd) {
|
||||||
|
cntr[adr_] = 0; // clear for next round
|
||||||
|
if constexpr (DTRMNSTC) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int ks = 0; ks < k_rnd; ks++) {
|
||||||
|
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||||
|
int g_nindx =
|
||||||
|
(nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
|
||||||
|
int g_adr = g_mindx * 4 + 0 + M * g_nindx * 4;
|
||||||
|
__builtin_amdgcn_global_load_lds(
|
||||||
|
(float4*)(&glbl[g_adr + M * N * ks]),
|
||||||
|
&(((float4*)s)[(threadIdx.y * THRDS) + ks * THRDS * 4 +
|
||||||
|
nt * THRDS * 4 * k_rnd]),
|
||||||
|
16, 0, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
if (BIAS)
|
if (BIAS)
|
||||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||||
for (uint32_t j = 0; j < 4; j++) {
|
for (uint32_t j = 0; j < 4; j++) {
|
||||||
@@ -1565,12 +1599,29 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * Bx];
|
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * Bx];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
asm volatile("s_waitcnt 0");
|
||||||
|
for (int ks = 0; ks < k_rnd; ks++) {
|
||||||
|
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||||
|
float4 eval = ((float4*)s)[(threadIdx.x + threadIdx.y * THRDS) +
|
||||||
|
ks * THRDS * 4 + nt * THRDS * 4 * k_rnd];
|
||||||
|
vals[nt].f4 += eval;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||||
|
int g_nindx =
|
||||||
|
(nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
|
||||||
|
int g_adr = g_mindx * 4 + 0 + M * g_nindx * 4;
|
||||||
|
vals[nt].f4 = *(float4*)(&glbl[g_adr]);
|
||||||
|
*(float4*)(&glbl[g_adr]) = {}; // clear out for next round
|
||||||
|
}
|
||||||
|
if (BIAS)
|
||||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||||
for (uint32_t j = 0; j < 4; j++) {
|
for (uint32_t j = 0; j < 4; j++) {
|
||||||
int g_nindx =
|
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||||
j + (nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
|
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||||
int g_adr = g_mindx + M * g_nindx * 4;
|
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * Bx];
|
||||||
vals[nt][j] = glbl[g_adr];
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__builtin_amdgcn_sched_barrier(0);
|
__builtin_amdgcn_sched_barrier(0);
|
||||||
@@ -1581,11 +1632,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
if (nindx < actlN) {
|
if (nindx < actlN) {
|
||||||
int adr = mindx + M * nindx;
|
int adr = mindx + M * nindx;
|
||||||
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
||||||
vals[nt][j] += __bfloat162float(biases[nt][j]);
|
vals[nt].s8[j] += __bfloat162float(biases[nt][j]);
|
||||||
C[adr] = __float2bfloat16(vals[nt][j]);
|
C[adr] = __float2bfloat16(vals[nt].s8[j]);
|
||||||
} else {
|
} else {
|
||||||
vals[nt][j] += __half2float(biases[nt][j]);
|
vals[nt].s8[j] += __half2float(biases[nt][j]);
|
||||||
C[adr] = __float2half(vals[nt][j]);
|
C[adr] = __float2half(vals[nt].s8[j]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1604,21 +1655,25 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
}
|
}
|
||||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||||
int UNRL, int N, int GrpsShrB, int CHUNKK>
|
int UNRL, int N, int GrpsShrB, int CHUNKK, int DTRMNSTC>
|
||||||
__global__ void wvSplitKrc_(const int actlN, const int K, const int M,
|
__global__ void wvSplitKrc_(const int actlN, const int K, const int Kap,
|
||||||
const int Bx, const int By, const scalar_t* B,
|
const int M, const int Bx, const int By,
|
||||||
const scalar_t* __restrict__ A,
|
const scalar_t* B, const scalar_t* __restrict__ A,
|
||||||
const scalar_t* __restrict__ BIAS, float* glbl,
|
const scalar_t* __restrict__ BIAS, float* glbl,
|
||||||
// int* cntr,
|
int* cntr, scalar_t* C,
|
||||||
scalar_t* C, const int CuCount){UNREACHABLE_CODE}
|
const int CuCount){UNREACHABLE_CODE}
|
||||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||||
|
|
||||||
torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||||
const std::optional<at::Tensor>& in_bias,
|
const std::optional<at::Tensor>& in_bias,
|
||||||
const int64_t CuCount) {
|
const int64_t CuCount) {
|
||||||
auto M_in = in_a.size(0);
|
int _DTRMNSTC = 1; // vllm::vllm_is_batch_invariant();
|
||||||
auto N_in = in_b.size(0);
|
|
||||||
auto K_in = in_a.size(1);
|
auto M_in = in_b.size(0);
|
||||||
|
auto N_in = in_a.size(0);
|
||||||
|
auto K_in = in_b.size(1);
|
||||||
|
auto Kap_in = in_a.stride(0);
|
||||||
|
|
||||||
auto Bx_in =
|
auto Bx_in =
|
||||||
(in_bias.has_value() && in_bias->numel() > 0)
|
(in_bias.has_value() && in_bias->numel() > 0)
|
||||||
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
|
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
|
||||||
@@ -1635,13 +1690,9 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
|
|
||||||
auto out_c = torch::empty(
|
auto out_c = torch::empty(
|
||||||
{N_in, M_in},
|
{N_in, M_in},
|
||||||
torch::TensorOptions().dtype(in_b.dtype()).device(in_b.device()));
|
torch::TensorOptions().dtype(in_a.dtype()).device(in_a.device()));
|
||||||
|
|
||||||
auto N_p2 = 1U << (32 - __builtin_clz(N_in - 1));
|
auto N_p2 = 1U << (32 - __builtin_clz(N_in - 1));
|
||||||
auto axl_glbl = torch::empty(
|
|
||||||
{N_p2 + N_p2 / 4, M_in + M_in / 4},
|
|
||||||
torch::TensorOptions().dtype(torch::kFloat32).device(in_b.device()));
|
|
||||||
axl_glbl.zero_(); // disable for FAST_UNSAFE_RDC_INIT
|
|
||||||
|
|
||||||
dim3 grid(CuCount);
|
dim3 grid(CuCount);
|
||||||
|
|
||||||
@@ -1649,25 +1700,6 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
// const int max_lds_len = get_lds_size() / 2;
|
// const int max_lds_len = get_lds_size() / 2;
|
||||||
|
|
||||||
#define WVSPLITKrc(_N, _GrpsShrB, _CHUNKK) \
|
|
||||||
{ \
|
|
||||||
dim3 block(64, 4); \
|
|
||||||
wvSplitKrc_<fptype, 64, 16, 4, 8, 1, _N, _GrpsShrB, _CHUNKK> \
|
|
||||||
<<<grid, block, 0, stream>>>(N_in, K_in, M_in, Bx_in, By_in, af4, bf4, \
|
|
||||||
biasf4, glbl, c, CuCount); \
|
|
||||||
}
|
|
||||||
|
|
||||||
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitKrc", [&] {
|
|
||||||
using fptype = typename scalar<scalar_t>::type;
|
|
||||||
fptype* af4 = reinterpret_cast<fptype*>(in_a.data_ptr());
|
|
||||||
const fptype* bf4 = reinterpret_cast<const fptype*>(in_b.data_ptr());
|
|
||||||
const fptype* biasf4 =
|
|
||||||
(in_bias.has_value() && in_bias->numel() > 0)
|
|
||||||
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
|
|
||||||
: nullptr;
|
|
||||||
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
|
|
||||||
auto glbl = axl_glbl.data_ptr<float>();
|
|
||||||
|
|
||||||
// With 64 Ms per CU (each of 4 SIMDs working on a 16x16 tile),
|
// With 64 Ms per CU (each of 4 SIMDs working on a 16x16 tile),
|
||||||
// and each working on a 512-shard of K, how many CUs would we need?
|
// and each working on a 512-shard of K, how many CUs would we need?
|
||||||
int rndup_cus = ((M_in + 64 - 1) / 64) * ((K_in + 512 - 1) / 512);
|
int rndup_cus = ((M_in + 64 - 1) / 64) * ((K_in + 512 - 1) / 512);
|
||||||
@@ -1680,24 +1712,58 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
// Given the above, how many CUs would we need?
|
// Given the above, how many CUs would we need?
|
||||||
int CuNeeded = rndup_cus * GrpsShrB;
|
int CuNeeded = rndup_cus * GrpsShrB;
|
||||||
|
|
||||||
if (CuNeeded > CuCount) std::runtime_error("Invalid wvSplitKrc size");
|
if (CuNeeded > CuCount) throw std::runtime_error("Invalid wvSplitKrc size");
|
||||||
|
|
||||||
// Can we increase SplitK by shrinking the K-shared to 256?
|
// Can we increase SplitK by shrinking the K-shared to 256?
|
||||||
int chunkk = (CuNeeded * 2 <= CuCount) ? 2 : 1;
|
int chunkk = (CuNeeded * 2 <= CuCount) ? 2 : 1;
|
||||||
|
|
||||||
|
static torch::Tensor axl_glbl =
|
||||||
|
torch::zeros(
|
||||||
|
128 * 1024 * (_DTRMNSTC ? 12 : 1),
|
||||||
|
torch::TensorOptions().dtype(torch::kFloat32).device(in_a.device()))
|
||||||
|
.detach();
|
||||||
|
static torch::Tensor axl_cntr =
|
||||||
|
torch::zeros(
|
||||||
|
128 * 1024 * (_DTRMNSTC ? 12 : 1) / 4,
|
||||||
|
torch::TensorOptions().dtype(torch::kInt).device(in_a.device()))
|
||||||
|
.detach();
|
||||||
|
auto glbl = axl_glbl.data_ptr<float>();
|
||||||
|
auto cntr = axl_cntr.data_ptr<int>();
|
||||||
|
|
||||||
|
#define WVSPLITKrc(_N, _GrpsShrB, _CHUNKK) \
|
||||||
|
{ \
|
||||||
|
dim3 block(64, 4); \
|
||||||
|
if (_DTRMNSTC) \
|
||||||
|
wvSplitKrc_<fptype, 64, 16, 4, 8, 1, _N, _GrpsShrB, _CHUNKK, 1> \
|
||||||
|
<<<grid, block, 0, stream>>>(N_in, K_in, Kap_in, M_in, Bx_in, By_in, \
|
||||||
|
af4, bf4, biasf4, glbl, cntr, c, \
|
||||||
|
CuCount); \
|
||||||
|
else \
|
||||||
|
wvSplitKrc_<fptype, 64, 16, 4, 8, 1, _N, _GrpsShrB, _CHUNKK, 0> \
|
||||||
|
<<<grid, block, 0, stream>>>(N_in, K_in, Kap_in, M_in, Bx_in, By_in, \
|
||||||
|
af4, bf4, biasf4, glbl, cntr, c, \
|
||||||
|
CuCount); \
|
||||||
|
}
|
||||||
|
|
||||||
|
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_a.scalar_type(), "wvSplitKrc", [&] {
|
||||||
|
using fptype = typename scalar<scalar_t>::type;
|
||||||
|
const fptype* af4 = reinterpret_cast<const fptype*>(in_a.data_ptr());
|
||||||
|
const fptype* bf4 = reinterpret_cast<const fptype*>(in_b.data_ptr());
|
||||||
|
const fptype* biasf4 =
|
||||||
|
(in_bias.has_value() && in_bias->numel() > 0)
|
||||||
|
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
|
||||||
|
: nullptr;
|
||||||
|
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
|
||||||
|
|
||||||
switch (N_p2) {
|
switch (N_p2) {
|
||||||
case 16:
|
case 16:
|
||||||
WVSPLITKrc(16, 1, 1) break;
|
WVSPLITKrc(16, 1, 1) break;
|
||||||
case 32:
|
case 32:
|
||||||
if (chunkk == 2)
|
if (chunkk == 2) WVSPLITKrc(32, 2, 2) else WVSPLITKrc(32, 2, 1) break;
|
||||||
WVSPLITKrc(32, 2, 2) else if (chunkk == 1) WVSPLITKrc(32, 2, 1) break;
|
|
||||||
case 64:
|
case 64:
|
||||||
if (chunkk == 2)
|
if (chunkk == 2) WVSPLITKrc(64, 4, 2) else WVSPLITKrc(64, 4, 1) break;
|
||||||
WVSPLITKrc(64, 4, 2) else if (chunkk == 1) WVSPLITKrc(64, 4, 1) break;
|
|
||||||
case 128:
|
case 128:
|
||||||
if (chunkk == 2)
|
if (chunkk == 2) WVSPLITKrc(128, 4, 2) else WVSPLITKrc(128, 4, 1) break;
|
||||||
WVSPLITKrc(128, 4, 2) else if (chunkk == 1)
|
|
||||||
WVSPLITKrc(128, 4, 1) break;
|
|
||||||
default:
|
default:
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
"Unsupported N value: " + std::to_string(M_in) + "," +
|
"Unsupported N value: " + std::to_string(M_in) + "," +
|
||||||
|
|||||||
@@ -802,6 +802,10 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
|||||||
cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
|
cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
|
||||||
&indexer_k_quant_and_cache);
|
&indexer_k_quant_and_cache);
|
||||||
|
|
||||||
|
cache_ops.def(
|
||||||
|
"concat_mla_q(Tensor ql_nope, Tensor q_pe, Tensor! q_out) -> ()");
|
||||||
|
cache_ops.impl("concat_mla_q", torch::kCUDA, &concat_mla_q);
|
||||||
|
|
||||||
cache_ops.def(
|
cache_ops.def(
|
||||||
"cp_gather_indexer_k_quant_cache(Tensor kv_cache, Tensor! dst_k, Tensor! "
|
"cp_gather_indexer_k_quant_cache(Tensor kv_cache, Tensor! dst_k, Tensor! "
|
||||||
"dst_scale, Tensor block_table, Tensor cu_seq_lens) -> ()");
|
"dst_scale, Tensor block_table, Tensor cu_seq_lens) -> ()");
|
||||||
|
|||||||
@@ -18,14 +18,14 @@ th {
|
|||||||
</style>
|
</style>
|
||||||
|
|
||||||
| Dataset | Online | Offline | Data Path |
|
| Dataset | Online | Offline | Data Path |
|
||||||
|---------|--------|---------|-----------|
|
| ------- | ------ | ------- | --------- |
|
||||||
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
|
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
|
||||||
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
|
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
|
||||||
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
|
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
|
||||||
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
|
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
|
||||||
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
||||||
| Random | ✅ | ✅ | `synthetic` |
|
| Random | ✅ | ✅ | `synthetic` |
|
||||||
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
|
| RandomMultiModal (Image/Video) | ✅ | ✅ | `synthetic` |
|
||||||
| RandomForReranking | ✅ | ✅ | `synthetic` |
|
| RandomForReranking | ✅ | ✅ | `synthetic` |
|
||||||
| Prefix Repetition | ✅ | ✅ | `synthetic` |
|
| Prefix Repetition | ✅ | ✅ | `synthetic` |
|
||||||
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
|
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
|
||||||
@@ -545,6 +545,24 @@ vllm bench throughput \
|
|||||||
--lora-path yard1/llama-2-7b-sql-lora-test
|
--lora-path yard1/llama-2-7b-sql-lora-test
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### Synthetic Random Multimodal (random-mm)
|
||||||
|
|
||||||
|
Generate synthetic multimodal inputs for offline throughput testing without external datasets.
|
||||||
|
Use `--backend vllm-chat` so that image tokens are counted correctly.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm bench throughput \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--backend vllm-chat \
|
||||||
|
--dataset-name random-mm \
|
||||||
|
--num-prompts 100 \
|
||||||
|
--random-input-len 300 \
|
||||||
|
--random-output-len 40 \
|
||||||
|
--random-mm-base-items-per-request 2 \
|
||||||
|
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
|
||||||
|
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}'
|
||||||
|
```
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
### 🛠️ Structured Output Benchmark
|
### 🛠️ Structured Output Benchmark
|
||||||
@@ -846,8 +864,8 @@ Generate synthetic image inputs alongside random text prompts to stress-test vis
|
|||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
|
|
||||||
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
|
- For online benchmarks, use `--backend openai-chat` with endpoint `/v1/chat/completions`.
|
||||||
- Video sampling is not yet implemented.
|
- For offline benchmarks, use `--backend vllm-chat` (see [Offline Throughput Benchmark](#-offline-throughput-benchmark) for an example).
|
||||||
|
|
||||||
Start the server (example):
|
Start the server (example):
|
||||||
|
|
||||||
@@ -913,6 +931,74 @@ This should be seen as an edge case, and if this behavior can be avoided by sett
|
|||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
### 🔬 Multimodal Processor Benchmark
|
||||||
|
|
||||||
|
Benchmark per-stage latency of the multimodal (MM) input processor pipeline, including the encoder forward pass. This is useful for profiling preprocessing bottlenecks in vision-language models.
|
||||||
|
|
||||||
|
<details class="admonition abstract" markdown="1">
|
||||||
|
<summary>Show more</summary>
|
||||||
|
|
||||||
|
The benchmark measures the following stages for each request:
|
||||||
|
|
||||||
|
| Stage | Description |
|
||||||
|
| ----- | ----------- |
|
||||||
|
| `get_mm_hashes_secs` | Time spent hashing multimodal inputs |
|
||||||
|
| `get_cache_missing_items_secs` | Time spent looking up the processor cache |
|
||||||
|
| `apply_hf_processor_secs` | Time spent in the HuggingFace processor |
|
||||||
|
| `merge_mm_kwargs_secs` | Time spent merging multimodal kwargs |
|
||||||
|
| `apply_prompt_updates_secs` | Time spent updating prompt tokens |
|
||||||
|
| `preprocessor_total_secs` | Total preprocessing time |
|
||||||
|
| `encoder_forward_secs` | Time spent in the encoder model forward pass |
|
||||||
|
| `num_encoder_calls` | Number of encoder invocations per request |
|
||||||
|
|
||||||
|
The benchmark also reports end-to-end latency (TTFT + decode time) per
|
||||||
|
request. Use `--metric-percentiles` to select which percentiles to report
|
||||||
|
(default: p99) and `--output-json` to save results.
|
||||||
|
|
||||||
|
#### Basic Example with Synthetic Data (random-mm)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm bench mm-processor \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--dataset-name random-mm \
|
||||||
|
--num-prompts 50 \
|
||||||
|
--random-input-len 300 \
|
||||||
|
--random-output-len 40 \
|
||||||
|
--random-mm-base-items-per-request 2 \
|
||||||
|
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
|
||||||
|
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}'
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Using a HuggingFace Dataset
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm bench mm-processor \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||||
|
--hf-split train \
|
||||||
|
--num-prompts 100
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Warmup, Custom Percentiles, and JSON Output
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm bench mm-processor \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--dataset-name random-mm \
|
||||||
|
--num-prompts 200 \
|
||||||
|
--num-warmups 5 \
|
||||||
|
--random-input-len 300 \
|
||||||
|
--random-output-len 40 \
|
||||||
|
--random-mm-base-items-per-request 1 \
|
||||||
|
--metric-percentiles 50,90,95,99 \
|
||||||
|
--output-json results.json
|
||||||
|
```
|
||||||
|
|
||||||
|
See [`vllm bench mm-processor`](../cli/bench/mm_processor.md) for the full argument reference.
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
### Embedding Benchmark
|
### Embedding Benchmark
|
||||||
|
|
||||||
Benchmark the performance of embedding requests in vLLM.
|
Benchmark the performance of embedding requests in vLLM.
|
||||||
|
|||||||
@@ -61,11 +61,11 @@ Here is an example using the script to compare result_a and result_b with max co
|
|||||||
***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]***
|
***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]***
|
||||||
|
|
||||||
| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|
| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|
||||||
|----|------|-----|-----------|----------|----------|
|
| | -------------------- | --- | -------------------------------- | -------------------------------- | ---------- |
|
||||||
| 0 | 12 | inf | 24.98 | 186.03 | 7.45 |
|
| 0 | 12 | inf | 24.98 | 186.03 | 7.45 |
|
||||||
| 1 | 16 | inf| 25.49 | 246.92 | 9.69 |
|
| 1 | 16 | inf | 25.49 | 246.92 | 9.69 |
|
||||||
| 2 | 24 | inf| 27.74 | 293.34 | 10.57 |
|
| 2 | 24 | inf | 27.74 | 293.34 | 10.57 |
|
||||||
| 3 | 32 | inf| 28.61 |306.69 | 10.72 |
|
| 3 | 32 | inf | 28.61 |306.69 | 10.72 |
|
||||||
|
|
||||||
***compare-json-results.py – Command-Line Parameters***
|
***compare-json-results.py – Command-Line Parameters***
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,51 @@
|
|||||||
# vllm bench mm-processor
|
# vllm bench mm-processor
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
`vllm bench mm-processor` profiles the multimodal input processor pipeline of
|
||||||
|
vision-language models. It measures per-stage latency from the HuggingFace
|
||||||
|
processor through to the encoder forward pass, helping you identify
|
||||||
|
preprocessing bottlenecks and understand how different image resolutions or
|
||||||
|
item counts affect end-to-end request time.
|
||||||
|
|
||||||
|
The benchmark supports two data sources: synthetic random multimodal inputs
|
||||||
|
(`random-mm`) and HuggingFace datasets (`hf`). Warmup requests are run before
|
||||||
|
measurement to ensure stable results.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm bench mm-processor \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--dataset-name random-mm \
|
||||||
|
--num-prompts 50 \
|
||||||
|
--random-input-len 300 \
|
||||||
|
--random-output-len 40 \
|
||||||
|
--random-mm-base-items-per-request 2 \
|
||||||
|
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
|
||||||
|
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Measured Stages
|
||||||
|
|
||||||
|
| Stage | Description |
|
||||||
|
| ----- | ----------- |
|
||||||
|
| `get_mm_hashes_secs` | Time spent hashing multimodal inputs |
|
||||||
|
| `get_cache_missing_items_secs` | Time spent looking up the processor cache |
|
||||||
|
| `apply_hf_processor_secs` | Time spent in the HuggingFace processor |
|
||||||
|
| `merge_mm_kwargs_secs` | Time spent merging multimodal kwargs |
|
||||||
|
| `apply_prompt_updates_secs` | Time spent updating prompt tokens |
|
||||||
|
| `preprocessor_total_secs` | Total preprocessing time |
|
||||||
|
| `encoder_forward_secs` | Time spent in the encoder model forward pass |
|
||||||
|
| `num_encoder_calls` | Number of encoder invocations per request |
|
||||||
|
|
||||||
|
The benchmark also reports end-to-end latency (TTFT + decode time) per
|
||||||
|
request. Use `--metric-percentiles` to select which percentiles to report
|
||||||
|
(default: p99) and `--output-json` to save results.
|
||||||
|
|
||||||
|
For more examples (HF datasets, warmup, JSON output), see
|
||||||
|
[Benchmarking CLI — Multimodal Processor Benchmark](../../benchmarking/cli.md#multimodal-processor-benchmark).
|
||||||
|
|
||||||
## JSON CLI Arguments
|
## JSON CLI Arguments
|
||||||
|
|
||||||
--8<-- "docs/cli/json_tip.inc.md"
|
--8<-- "docs/cli/json_tip.inc.md"
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
<!-- markdownlint-disable MD041 -->
|
||||||
When passing JSON CLI arguments, the following sets of arguments are equivalent:
|
When passing JSON CLI arguments, the following sets of arguments are equivalent:
|
||||||
|
|
||||||
- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
|
- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
|
||||||
|
|||||||
@@ -5,6 +5,17 @@ This guide covers optimization strategies and performance tuning for vLLM V1.
|
|||||||
!!! tip
|
!!! tip
|
||||||
Running out of memory? Consult [this guide](./conserving_memory.md) on how to conserve memory.
|
Running out of memory? Consult [this guide](./conserving_memory.md) on how to conserve memory.
|
||||||
|
|
||||||
|
## Optimization Levels
|
||||||
|
|
||||||
|
vLLM provides 4 optimization levels (`-O0`, `-O1`, `-O2`, `-O3`) that allow users to trade off startup time for performance:
|
||||||
|
|
||||||
|
- `-O0`: No optimizations. Fastest startup time, but lowest performance.
|
||||||
|
- `-O1`: Fast optimization. Simple compilation and fast fusions, and PIECEWISE cudagraphs.
|
||||||
|
- `-O2`: Default optimization. Additional compilation ranges, additional fusions, FULL_AND_PIECEWISE cudagraphs.
|
||||||
|
- `-O3`: Aggressive optimization. Currently equal to `-O2`, but may include additional time-consuming or experimental optimizations in the future.
|
||||||
|
|
||||||
|
For more information, see the [optimization level documentation](../design/optimization_levels.md).
|
||||||
|
|
||||||
## Preemption
|
## Preemption
|
||||||
|
|
||||||
Due to the autoregressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
|
Due to the autoregressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
|
||||||
@@ -282,7 +293,7 @@ llm = LLM(
|
|||||||
Based on the configuration, the content of the multi-modal caches on `P0` and `P1` are as follows:
|
Based on the configuration, the content of the multi-modal caches on `P0` and `P1` are as follows:
|
||||||
|
|
||||||
| mm_processor_cache_type | Cache Type | `P0` Cache | `P1` Engine Cache | `P1` Worker Cache | Max. Memory |
|
| mm_processor_cache_type | Cache Type | `P0` Cache | `P1` Engine Cache | `P1` Worker Cache | Max. Memory |
|
||||||
|-------------------|-------------|------------|------------|-------------|-------------|
|
| ----------------- | ----------- | ---------- | ---------- | ----------- | ----------- |
|
||||||
| lru | Processor Caching | K + V | N/A | N/A | `mm_processor_cache_gb * data_parallel_size` |
|
| lru | Processor Caching | K + V | N/A | N/A | `mm_processor_cache_gb * data_parallel_size` |
|
||||||
| lru | Key-Replicated Caching | K | K + V | N/A | `mm_processor_cache_gb * api_server_count` |
|
| lru | Key-Replicated Caching | K | K + V | N/A | `mm_processor_cache_gb * api_server_count` |
|
||||||
| shm | Shared Memory Caching | K | N/A | V | `mm_processor_cache_gb * api_server_count` |
|
| shm | Shared Memory Caching | K | N/A | V | `mm_processor_cache_gb * api_server_count` |
|
||||||
|
|||||||
@@ -94,7 +94,6 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.
|
|||||||
Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with:
|
Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pre-commit run --hook-stage manual markdownlint
|
|
||||||
pre-commit run --hook-stage manual mypy-3.10
|
pre-commit run --hook-stage manual mypy-3.10
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -66,9 +66,9 @@ This complicates the process as we cannot use the out-of-the-box
|
|||||||
- Important indexes at the moment include:
|
- Important indexes at the moment include:
|
||||||
|
|
||||||
| Platform | `--extra-index-url` |
|
| Platform | `--extra-index-url` |
|
||||||
|----------|-----------------|
|
| -------- | ------------------- |
|
||||||
| CUDA 12.8| [https://download.pytorch.org/whl/cu128](https://download.pytorch.org/whl/cu128)|
|
| CUDA 12.8 | [https://download.pytorch.org/whl/cu128](https://download.pytorch.org/whl/cu128) |
|
||||||
| CPU | [https://download.pytorch.org/whl/cpu](https://download.pytorch.org/whl/cpu)|
|
| CPU | [https://download.pytorch.org/whl/cpu](https://download.pytorch.org/whl/cpu) |
|
||||||
| ROCm 6.2 | [https://download.pytorch.org/whl/rocm6.2.4](https://download.pytorch.org/whl/rocm6.2.4) |
|
| ROCm 6.2 | [https://download.pytorch.org/whl/rocm6.2.4](https://download.pytorch.org/whl/rocm6.2.4) |
|
||||||
| ROCm 6.3 | [https://download.pytorch.org/whl/rocm6.3](https://download.pytorch.org/whl/rocm6.3) |
|
| ROCm 6.3 | [https://download.pytorch.org/whl/rocm6.3](https://download.pytorch.org/whl/rocm6.3) |
|
||||||
| XPU | [https://download.pytorch.org/whl/xpu](https://download.pytorch.org/whl/xpu) |
|
| XPU | [https://download.pytorch.org/whl/xpu](https://download.pytorch.org/whl/xpu) |
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ stages will be removed.
|
|||||||
Assume a feature is deprecated in `v0.9.0`.
|
Assume a feature is deprecated in `v0.9.0`.
|
||||||
|
|
||||||
| Release | Status |
|
| Release | Status |
|
||||||
|---------------|-------------------------------------------------------------------------------------------------|
|
| ------------- | ----------------------------------------------------------------------------------------------- |
|
||||||
| `v0.9.0` | Feature is deprecated with clear removal version listed. |
|
| `v0.9.0` | Feature is deprecated with clear removal version listed. |
|
||||||
| `v0.10.0` | Feature is now off by default, throws an error when used, and can be re-enabled for legacy use. |
|
| `v0.10.0` | Feature is now off by default, throws an error when used, and can be re-enabled for legacy use. |
|
||||||
| `v0.11.0` | Feature is removed. |
|
| `v0.11.0` | Feature is removed. |
|
||||||
|
|||||||
@@ -5,8 +5,12 @@
|
|||||||
|
|
||||||
## Profile with PyTorch Profiler
|
## Profile with PyTorch Profiler
|
||||||
|
|
||||||
We support tracing vLLM workers using the `torch.profiler` module. You can enable the torch profiler by setting `--profiler-config`
|
We support tracing vLLM workers using different profilers. You can enable profiling by setting the `--profiler-config` flag when launching the server.
|
||||||
when launching the server, and setting the entries `profiler` to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config:
|
|
||||||
|
!!! note
|
||||||
|
The `--profiler-config` flag is available in vLLM v0.13.0 and later. If you are using an earlier version, please upgrade to use this feature.
|
||||||
|
|
||||||
|
To use the `torch.profiler` module, set the `profiler` entry to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config:
|
||||||
|
|
||||||
- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default
|
- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default
|
||||||
- `torch_profiler_with_memory` to record memory, off by default
|
- `torch_profiler_with_memory` to record memory, off by default
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ chart **including persistent volumes** and deletes the release.
|
|||||||
The following table describes configurable parameters of the chart in `values.yaml`:
|
The following table describes configurable parameters of the chart in `values.yaml`:
|
||||||
|
|
||||||
| Key | Type | Default | Description |
|
| Key | Type | Default | Description |
|
||||||
|-----|------|---------|-------------|
|
| --- | ---- | ------- | ----------- |
|
||||||
| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration |
|
| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration |
|
||||||
| autoscaling.enabled | bool | false | Enable autoscaling |
|
| autoscaling.enabled | bool | false | Enable autoscaling |
|
||||||
| autoscaling.maxReplicas | int | 100 | Maximum replicas |
|
| autoscaling.maxReplicas | int | 100 | Maximum replicas |
|
||||||
|
|||||||
87
docs/deployment/frameworks/runpod.md
Normal file
87
docs/deployment/frameworks/runpod.md
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
# RunPod
|
||||||
|
|
||||||
|
vLLM can be deployed on [RunPod](https://www.runpod.io/), a cloud GPU platform that provides on-demand and serverless GPU instances for AI inference workloads.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- A RunPod account with GPU pod access
|
||||||
|
- A GPU pod running a CUDA-compatible template (e.g., `runpod/pytorch`)
|
||||||
|
|
||||||
|
## Starting the Server
|
||||||
|
|
||||||
|
SSH into your RunPod pod and launch the vLLM OpenAI-compatible server:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m vllm.entrypoints.openai.api_server \
|
||||||
|
--model <model-name> \
|
||||||
|
--host 0.0.0.0 \
|
||||||
|
--port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
!!! note
|
||||||
|
|
||||||
|
Use `--host 0.0.0.0` to bind to all interfaces so the server is reachable from outside the container.
|
||||||
|
|
||||||
|
## Exposing Port 8000
|
||||||
|
|
||||||
|
RunPod exposes HTTP services through its proxy. To make port 8000 accessible:
|
||||||
|
|
||||||
|
1. In the RunPod dashboard, navigate to your pod settings.
|
||||||
|
2. Add `8000` to the list of exposed HTTP ports.
|
||||||
|
3. After the pod restarts, RunPod provides a public URL in the format:
|
||||||
|
|
||||||
|
```text
|
||||||
|
https://<pod-id>-8000.proxy.runpod.net
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting 502 Bad Gateway
|
||||||
|
|
||||||
|
A `502 Bad Gateway` error from the RunPod proxy typically means the server is not yet listening. Common causes:
|
||||||
|
|
||||||
|
- **Model still loading** — Large models take time to download and load into GPU memory. Check the pod logs for progress.
|
||||||
|
- **Wrong host binding** — Ensure you passed `--host 0.0.0.0`. Binding to `127.0.0.1` (the default) makes the server unreachable from the proxy.
|
||||||
|
- **Port mismatch** — Verify the `--port` value matches the port exposed in the RunPod dashboard.
|
||||||
|
- **Out of GPU memory** — The model may be too large for the allocated GPU. Check logs for CUDA OOM errors and consider using a larger instance or adding `--tensor-parallel-size` for multi-GPU pods.
|
||||||
|
|
||||||
|
## Verifying the Deployment
|
||||||
|
|
||||||
|
Once the server is running, test it with a curl request:
|
||||||
|
|
||||||
|
!!! console "Command"
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl https://<pod-id>-8000.proxy.runpod.net/v1/chat/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "<model-name>",
|
||||||
|
"messages": [
|
||||||
|
{"role": "user", "content": "Hello, how are you?"}
|
||||||
|
],
|
||||||
|
"max_tokens": 50
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
!!! console "Response"
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "chat-abc123",
|
||||||
|
"object": "chat.completion",
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"message": {
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "I'm doing well, thank you for asking! How can I help you today?"
|
||||||
|
},
|
||||||
|
"index": 0,
|
||||||
|
"finish_reason": "stop"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also check the server health endpoint:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl https://<pod-id>-8000.proxy.runpod.net/health
|
||||||
|
```
|
||||||
5
docs/deployment/integrations/aibrix.md
Normal file
5
docs/deployment/integrations/aibrix.md
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# AIBrix
|
||||||
|
|
||||||
|
[AIBrix](https://github.com/vllm-project/aibrix) is a cloud-native control plane that integrates with vLLM to simplify Kubernetes deployment, scaling, routing, and LoRA adapter management for large language model inference.
|
||||||
|
|
||||||
|
For installation and usage instructions, please refer to the [AIBrix documentation](https://aibrix.readthedocs.io/).
|
||||||
7
docs/deployment/integrations/dynamo.md
Normal file
7
docs/deployment/integrations/dynamo.md
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
# NVIDIA Dynamo
|
||||||
|
|
||||||
|
[NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo) is an open-source framework for distributed LLM inference that can run vLLM on Kubernetes with flexible serving architectures (e.g. aggregated/disaggregated, optional router/planner).
|
||||||
|
|
||||||
|
For Kubernetes deployment instructions and examples (including vLLM), see the [Deploying Dynamo on Kubernetes](https://github.com/ai-dynamo/dynamo/blob/main/docs/kubernetes/README.md) guide.
|
||||||
|
|
||||||
|
Background reading: InfoQ news coverage — [NVIDIA Dynamo simplifies Kubernetes deployment for LLM inference](https://www.infoq.com/news/2025/12/nvidia-dynamo-kubernetes/).
|
||||||
@@ -5,6 +5,7 @@
|
|||||||
Please see the Installation Guides for environment specific instructions:
|
Please see the Installation Guides for environment specific instructions:
|
||||||
|
|
||||||
- [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/)
|
- [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/)
|
||||||
|
- [AKS](https://www.kubeai.org/installation/aks/)
|
||||||
- [EKS](https://www.kubeai.org/installation/eks/)
|
- [EKS](https://www.kubeai.org/installation/eks/)
|
||||||
- [GKE](https://www.kubeai.org/installation/gke/)
|
- [GKE](https://www.kubeai.org/installation/gke/)
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ A Ray cluster can be declared in YAML, and the operator then handles pod schedul
|
|||||||
## Why KubeRay instead of manual scripts?
|
## Why KubeRay instead of manual scripts?
|
||||||
|
|
||||||
| Feature | Manual scripts | KubeRay |
|
| Feature | Manual scripts | KubeRay |
|
||||||
|---------|-----------------------------------------------------------|---------|
|
| ------- | --------------------------------------------------------- | ------- |
|
||||||
| Cluster bootstrap | Manually SSH into every node and run a script | One command to create or update the whole cluster: `kubectl apply -f cluster.yaml` |
|
| Cluster bootstrap | Manually SSH into every node and run a script | One command to create or update the whole cluster: `kubectl apply -f cluster.yaml` |
|
||||||
| Autoscaling | Manual | Automatically patches CRDs for adjusting cluster size |
|
| Autoscaling | Manual | Automatically patches CRDs for adjusting cluster size |
|
||||||
| Upgrades | Tear down & re-create manually | Blue/green deployment updates supported |
|
| Upgrades | Tear down & re-create manually | Blue/green deployment updates supported |
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le
|
|||||||
Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||||
|
|
||||||
- [Helm](frameworks/helm.md)
|
- [Helm](frameworks/helm.md)
|
||||||
|
- [NVIDIA Dynamo](integrations/dynamo.md)
|
||||||
- [InftyAI/llmaz](integrations/llmaz.md)
|
- [InftyAI/llmaz](integrations/llmaz.md)
|
||||||
- [llm-d](integrations/llm-d.md)
|
- [llm-d](integrations/llm-d.md)
|
||||||
- [KAITO](integrations/kaito.md)
|
- [KAITO](integrations/kaito.md)
|
||||||
@@ -20,7 +21,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
|||||||
- [kubernetes-sigs/lws](frameworks/lws.md)
|
- [kubernetes-sigs/lws](frameworks/lws.md)
|
||||||
- [meta-llama/llama-stack](integrations/llamastack.md)
|
- [meta-llama/llama-stack](integrations/llamastack.md)
|
||||||
- [substratusai/kubeai](integrations/kubeai.md)
|
- [substratusai/kubeai](integrations/kubeai.md)
|
||||||
- [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
|
- [vllm-project/AIBrix](integrations/aibrix.md)
|
||||||
- [vllm-project/production-stack](integrations/production-stack.md)
|
- [vllm-project/production-stack](integrations/production-stack.md)
|
||||||
|
|
||||||
## Deployment with CPUs
|
## Deployment with CPUs
|
||||||
|
|||||||
@@ -119,10 +119,10 @@ The code can be found in [vllm/v1/engine/coordinator.py](../../vllm/v1/engine/co
|
|||||||
For a deployment with `N` GPUs, `TP` tensor parallel size, `DP` data parallel size, and `A` API server count:
|
For a deployment with `N` GPUs, `TP` tensor parallel size, `DP` data parallel size, and `A` API server count:
|
||||||
|
|
||||||
| Process Type | Count | Notes |
|
| Process Type | Count | Notes |
|
||||||
|---|---|---|
|
| - | - | - |
|
||||||
| API Server | `A` (default `DP`) | Handles HTTP requests and input processing |
|
| API Server | `A` (default `DP`) | Handles HTTP requests and input processing |
|
||||||
| Engine Core | `DP` (default 1) | Scheduler and KV cache management |
|
| Engine Core | `DP` (default 1) | Scheduler and KV cache management |
|
||||||
| GPU Worker | `N` (= `DP x TP`) | One per GPU, executes model forward passes |
|
| GPU Worker | `N` (= `DP x PP x TP`) | One per GPU, executes model forward passes |
|
||||||
| DP Coordinator | 1 if `DP > 1`, else 0 | Load balancing across DP ranks |
|
| DP Coordinator | 1 if `DP > 1`, else 0 | Load balancing across DP ranks |
|
||||||
| **Total** | **`A + DP + N` (+ 1 if DP > 1)** | |
|
| **Total** | **`A + DP + N` (+ 1 if DP > 1)** | |
|
||||||
|
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
**Blackwell (SM 10.x):**
|
**Blackwell (SM 10.x):**
|
||||||
|
|
||||||
| Priority | Backend |
|
| Priority | Backend |
|
||||||
|----------|---------|
|
| -------- | ------- |
|
||||||
| 1 | `FLASHINFER` |
|
| 1 | `FLASHINFER` |
|
||||||
| 2 | `FLASH_ATTN` |
|
| 2 | `FLASH_ATTN` |
|
||||||
| 3 | `TRITON_ATTN` |
|
| 3 | `TRITON_ATTN` |
|
||||||
@@ -110,7 +110,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
**Ampere/Hopper (SM 8.x-9.x):**
|
**Ampere/Hopper (SM 8.x-9.x):**
|
||||||
|
|
||||||
| Priority | Backend |
|
| Priority | Backend |
|
||||||
|----------|---------|
|
| -------- | ------- |
|
||||||
| 1 | `FLASH_ATTN` |
|
| 1 | `FLASH_ATTN` |
|
||||||
| 2 | `FLASHINFER` |
|
| 2 | `FLASHINFER` |
|
||||||
| 3 | `TRITON_ATTN` |
|
| 3 | `TRITON_ATTN` |
|
||||||
@@ -121,7 +121,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
**Blackwell (SM 10.x):**
|
**Blackwell (SM 10.x):**
|
||||||
|
|
||||||
| Priority | Backend |
|
| Priority | Backend |
|
||||||
|----------|---------|
|
| -------- | ------- |
|
||||||
| 1 | `FLASHINFER_MLA` |
|
| 1 | `FLASHINFER_MLA` |
|
||||||
| 2 | `CUTLASS_MLA` |
|
| 2 | `CUTLASS_MLA` |
|
||||||
| 3 | `FLASH_ATTN_MLA` |
|
| 3 | `FLASH_ATTN_MLA` |
|
||||||
@@ -133,7 +133,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
**Ampere/Hopper (SM 8.x-9.x):**
|
**Ampere/Hopper (SM 8.x-9.x):**
|
||||||
|
|
||||||
| Priority | Backend |
|
| Priority | Backend |
|
||||||
|----------|---------|
|
| -------- | ------- |
|
||||||
| 1 | `FLASH_ATTN_MLA` |
|
| 1 | `FLASH_ATTN_MLA` |
|
||||||
| 2 | `FLASHMLA` |
|
| 2 | `FLASHMLA` |
|
||||||
| 3 | `FLASHINFER_MLA` |
|
| 3 | `FLASHINFER_MLA` |
|
||||||
@@ -145,7 +145,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
## Legend
|
## Legend
|
||||||
|
|
||||||
| Column | Description |
|
| Column | Description |
|
||||||
|--------|-------------|
|
| ------ | ----------- |
|
||||||
| **Dtypes** | Supported model data types (fp16, bf16, fp32) |
|
| **Dtypes** | Supported model data types (fp16, bf16, fp32) |
|
||||||
| **KV Dtypes** | Supported KV cache data types (`auto`, `fp8`, `fp8_e4m3`, etc.) |
|
| **KV Dtypes** | Supported KV cache data types (`auto`, `fp8`, `fp8_e4m3`, etc.) |
|
||||||
| **Block Sizes** | Supported KV cache block sizes (%N means multiples of N) |
|
| **Block Sizes** | Supported KV cache block sizes (%N means multiples of N) |
|
||||||
@@ -162,7 +162,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
## Standard Attention (MHA, MQA, GQA) Backends
|
## Standard Attention (MHA, MQA, GQA) Backends
|
||||||
|
|
||||||
| Backend | Version | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | MM Prefix | DCP | Attention Types | Compute Cap. |
|
| Backend | Version | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | MM Prefix | DCP | Attention Types | Compute Cap. |
|
||||||
|---------|---------|--------|-----------|-------------|------------|------|-----------|-----|-----------------|--------------|
|
| ------- | ------- | ------ | --------- | ----------- | ---------- | ---- | --------- | --- | --------------- | ------------ |
|
||||||
| `CPU_ATTN` | | fp16, bf16, fp32 | `auto` | Any | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
|
| `CPU_ATTN` | | fp16, bf16, fp32 | `auto` | Any | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
|
||||||
| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | ✅ | Decoder | 7.x-9.x |
|
| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | ✅ | Decoder | 7.x-9.x |
|
||||||
| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ✅ | Decoder | 10.x |
|
| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ✅ | Decoder | 10.x |
|
||||||
@@ -171,9 +171,9 @@ Priority is **1 = highest** (tried first).
|
|||||||
| `FLASH_ATTN` | FA4* | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥10.0 |
|
| `FLASH_ATTN` | FA4* | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥10.0 |
|
||||||
| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ✅ | Decoder | Any |
|
| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ✅ | Decoder | Any |
|
||||||
| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
|
| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
|
||||||
| `ROCM_AITER_FA` | | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder | N/A |
|
| `ROCM_AITER_FA` | | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder, Enc-Dec | N/A |
|
||||||
| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | All | N/A |
|
| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | %16 | Any | ✅ | ✅ | ❌ | All | N/A |
|
||||||
| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
|
| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 544 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ✅ | ✅ | ❌ | All | N/A |
|
||||||
| `TREE_ATTN` | | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
|
| `TREE_ATTN` | | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
|
||||||
| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
|
| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
|
||||||
|
|
||||||
@@ -191,7 +191,7 @@ The prefill backend is selected at runtime based on hardware and
|
|||||||
configuration.
|
configuration.
|
||||||
|
|
||||||
| Backend | Description | Compute Cap. | Enable | Disable | Notes |
|
| Backend | Description | Compute Cap. | Enable | Disable | Notes |
|
||||||
|---------|-------------|--------------|--------|---------|-------|
|
| ------- | ----------- | ------------ | ------ | ------- | ----- |
|
||||||
| TRT-LLM Ragged‡ | TensorRT-LLM ragged attention | 10.x | Default on SM100 | `-ac.use_trtllm_ragged_deepseek_prefill=0` | DeepSeek R1 dims only |
|
| TRT-LLM Ragged‡ | TensorRT-LLM ragged attention | 10.x | Default on SM100 | `-ac.use_trtllm_ragged_deepseek_prefill=0` | DeepSeek R1 dims only |
|
||||||
| FlashInfer | FlashInfer CUTLASS backend | 10.x | `-ac.disable_flashinfer_prefill=0` | `-ac.disable_flashinfer_prefill=1` | DeepSeek R1 dims only |
|
| FlashInfer | FlashInfer CUTLASS backend | 10.x | `-ac.disable_flashinfer_prefill=0` | `-ac.disable_flashinfer_prefill=1` | DeepSeek R1 dims only |
|
||||||
| cuDNN | cuDNN-based attention | 10.x | `-ac.use_cudnn_prefill=1` | `-ac.use_cudnn_prefill=0` | |
|
| cuDNN | cuDNN-based attention | 10.x | `-ac.use_cudnn_prefill=1` | `-ac.use_cudnn_prefill=0` | |
|
||||||
@@ -203,14 +203,14 @@ configuration.
|
|||||||
### Decode Backends
|
### Decode Backends
|
||||||
|
|
||||||
| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | Sparse | MM Prefix | DCP | Attention Types | Compute Cap. |
|
| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | Sparse | MM Prefix | DCP | Attention Types | Compute Cap. |
|
||||||
|---------|--------|-----------|-------------|------------|------|--------|-----------|-----|-----------------|--------------|
|
| ------- | ------ | --------- | ----------- | ---------- | ---- | ------ | --------- | --- | --------------- | ------------ |
|
||||||
| `CUTLASS_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x |
|
| `CUTLASS_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x |
|
||||||
| `FLASHINFER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x |
|
| `FLASHINFER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x |
|
||||||
| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
|
| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
|
||||||
| `FLASHMLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x |
|
| `FLASHMLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x |
|
||||||
| `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x |
|
| `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x |
|
||||||
| `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x |
|
| `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x |
|
||||||
| `ROCM_AITER_MLA` | fp16, bf16 | `auto` | 1 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
|
| `ROCM_AITER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 1 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
|
||||||
| `ROCM_AITER_MLA_SPARSE` | fp16, bf16 | `auto` | Any | 576 | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
|
| `ROCM_AITER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16` | 1 | Any | ❌ | ✅ | ❌ | ❌ | Decoder | N/A |
|
||||||
| `ROCM_AITER_TRITON_MLA` | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
|
| `ROCM_AITER_TRITON_MLA` | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
|
||||||
| `TRITON_MLA` | fp16, bf16 | `auto`, `bfloat16` | Any | Any | ❌ | ❌ | ❌ | ✅ | Decoder | Any |
|
| `TRITON_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | Any |
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ The goal of this structure is to uniquely identify a (padded) batch with minimal
|
|||||||
|
|
||||||
### `CudagraphDispatcher`
|
### `CudagraphDispatcher`
|
||||||
|
|
||||||
The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWarpper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. This lets us simplify the wrapper code and centralize the logic in the dispatcher.
|
The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWrapper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. This lets us simplify the wrapper code and centralize the logic in the dispatcher.
|
||||||
|
|
||||||
The dispatching keys are initialized through the dispatcher's `initialize_cudagraph_keys` method, which is called by the gpu_model_runner after all possible attention backends are initialized. This is where we can get much fancier in the future and “prepare” all kinds of CUDA Graphs combinations. For now, we just append available keys based on the valid combos of `decode_mode`/`mixed_mode` of `cudagraph_mode` and `cudagraph_capture_sizes` in the compilation config.
|
The dispatching keys are initialized through the dispatcher's `initialize_cudagraph_keys` method, which is called by the gpu_model_runner after all possible attention backends are initialized. This is where we can get much fancier in the future and “prepare” all kinds of CUDA Graphs combinations. For now, we just append available keys based on the valid combos of `decode_mode`/`mixed_mode` of `cudagraph_mode` and `cudagraph_capture_sizes` in the compilation config.
|
||||||
|
|
||||||
@@ -174,18 +174,18 @@ Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that
|
|||||||
The following table lists backends that support full CUDA Graphs at the time of writing.
|
The following table lists backends that support full CUDA Graphs at the time of writing.
|
||||||
|
|
||||||
| Attention Backend | cudagraph_support | Comments |
|
| Attention Backend | cudagraph_support | Comments |
|
||||||
|:---|:---|:---|
|
| :---------------- | :---------------- | :------- |
|
||||||
| FlashAttention v2 | `UNIFORM_BATCH` | Actually `ALWAYS` but workaround to fallback to `FULL_AND_PIECEWISE` for performance reason |
|
| FlashAttention v2 | `UNIFORM_BATCH` | Actually `ALWAYS` but workaround to fallback to `FULL_AND_PIECEWISE` for performance reason |
|
||||||
| FlashAttention v3 | `ALWAYS` | has unified routine for both batches, so `FULL` mode is good |
|
| FlashAttention v3 | `ALWAYS` | has unified routine for both batches, so `FULL` mode is good |
|
||||||
| Triton Attention | `ALWAYS` | prefer `FULL_AND_PIECEWISE` since it has different kernels for prefill/mixed and pure decode batches |
|
| Triton Attention | `ALWAYS` | prefer `FULL_AND_PIECEWISE` since it has different kernels for prefill/mixed and pure decode batches |
|
||||||
| AITER FlashAttention | `UNIFORM_BATCH`| |
|
| AITER FlashAttention | `UNIFORM_BATCH` | |
|
||||||
| FlashInfer | `UNIFORM_SINGLE_TOKEN_DECODE` | Will be set to `UNIFORM_BATCH` when using TRTLLM attention on Blackwell |
|
| FlashInfer | `UNIFORM_SINGLE_TOKEN_DECODE` | Will be set to `UNIFORM_BATCH` when using TRTLLM attention on Blackwell |
|
||||||
| FlashMLA | `UNIFORM_BATCH` | |
|
| FlashMLA | `UNIFORM_BATCH` | |
|
||||||
| FlashInferMLA | `UNIFORM_BATCH` | |
|
| FlashInferMLA | `UNIFORM_BATCH` | |
|
||||||
| FlashInferMLASparse | `UNIFORM_BATCH` | |
|
| FlashInferMLASparse | `UNIFORM_BATCH` | |
|
||||||
| AITER MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
|
| AITER MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
|
||||||
| CUTLASS MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
|
| CUTLASS MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
|
||||||
| Mamba attention| `UNIFORM_SINGLE_TOKEN_DECODE` | |
|
| Mamba attention | `UNIFORM_SINGLE_TOKEN_DECODE` | |
|
||||||
|
|
||||||
Unlisted backends are all declared as `NEVER`.
|
Unlisted backends are all declared as `NEVER`.
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ TL;DR:
|
|||||||
- The vLLM-torch.compile integration is multiple pieces. vLLM exposes flags to turn off each piece:
|
- The vLLM-torch.compile integration is multiple pieces. vLLM exposes flags to turn off each piece:
|
||||||
|
|
||||||
| Online Flag | Offline Flag | Result |
|
| Online Flag | Offline Flag | Result |
|
||||||
|----------|----------|-------------|
|
| ----------- | ------------ | ------ |
|
||||||
| --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
|
| --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
|
||||||
| -cc.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
|
| -cc.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
|
||||||
| -cc.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
|
| -cc.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ The TopK Weight Application and Reduction components happen right after the Unpe
|
|||||||
Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
|
Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
|
||||||
|
|
||||||
`FusedMoEPrepareAndFinalizeModular::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
|
`FusedMoEPrepareAndFinalizeModular::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
|
||||||
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEExpertsModular` and `FusedMoEPerpareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
|
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEExpertsModular` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
|
||||||
|
|
||||||
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceNoOp` if the `FusedMoEExpertsModular` implementation does the weight application and reduction itself.
|
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceNoOp` if the `FusedMoEExpertsModular` implementation does the weight application and reduction itself.
|
||||||
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceContiguous` / `TopKWeightAndReduceNaiveBatched` / `TopKWeightAndReduceDelegate` if the `FusedMoEExpertsModular` implementation needs the `FusedMoEPrepareAndFinalizeModular::finalize()` to do the weight application and reduction.
|
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceContiguous` / `TopKWeightAndReduceNaiveBatched` / `TopKWeightAndReduceDelegate` if the `FusedMoEExpertsModular` implementation needs the `FusedMoEPrepareAndFinalizeModular::finalize()` to do the weight application and reduction.
|
||||||
|
|||||||
339
docs/design/fusions.md
Normal file
339
docs/design/fusions.md
Normal file
@@ -0,0 +1,339 @@
|
|||||||
|
# Fusion torch.compile passes
|
||||||
|
|
||||||
|
vLLM applies a set of kernel/operator fusions at compile time (via custom [`torch.compile`](torch_compile.md) Inductor passes)
|
||||||
|
to separate optimizations from model definitions and avoid breaking layer abstractions in model code.
|
||||||
|
These fusions are controlled by fields in [`PassConfig`][vllm.config.compilation.PassConfig] and are automatically enabled
|
||||||
|
at appropriate [optimization levels](optimization_levels.md).
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
The table below maps each fusion to its controlling flag/config knob, the
|
||||||
|
operations it fuses, what level enables it by default, and an indicative speedup.
|
||||||
|
The Fullgraph column indicates whether the fusion requires the entire model graph to be
|
||||||
|
visible (either via Inductor partition or `splitting_ops=[]`),
|
||||||
|
and the last column indicates whether the fusion activates for all `num_tokens`
|
||||||
|
or just on the low or high end.
|
||||||
|
|
||||||
|
!!! info
|
||||||
|
Speedup depends heavily on the exact model, batch size, and hardware.
|
||||||
|
If tuning performance by hand, always benchmark your exact use-case with and without the fusion to verify the impact.
|
||||||
|
|
||||||
|
| Fusion | `PassConfig` flag | Fused operations | Default at | E2E Speedup | Fullgraph | `num_tokens` |
|
||||||
|
| ------------------------------------------------------------------------------ | ---------------------------- | ---------------------------------------------- | ------------------------------ | ------------------ | --------- | ------------ |
|
||||||
|
| [AllReduce + RMSNorm](#allreduce--rmsnorm-fuse_allreduce_rms) | `fuse_allreduce_rms` | All-reduce → RMSNorm (+residual_add) (→ quant) | O2 (Hopper/Blackwell + TP > 1) | 5-20% | No | Low |
|
||||||
|
| [Attention + Quant](#attention--quantization-fuse_attn_quant) | `fuse_attn_quant` | Attention output → FP8/NVFP4 quant | Off by default | 3-7% | Yes | Always |
|
||||||
|
| [RoPE + KV-Cache Update](#rope--kv-cache-update-fuse_rope_kvcache) | `fuse_rope_kvcache` | Rotary embedding → KV cache write | O1 (ROCm/AITER only) | TBD | No | Low |
|
||||||
|
| [QK Norm + RoPE](#qk-norm--rope-enable_qk_norm_rope_fusion) | `enable_qk_norm_rope_fusion` | Q/K RMSNorm → rotary embedding | Off by default | 2-3% | No | Low |
|
||||||
|
| [Sequence Parallelism](#sequence-parallelism-enable_sp) | `enable_sp` | AllReduce → ReduceScatter + AllGather | Off by default | Prereq for AsyncTP | Yes | High |
|
||||||
|
| [AsyncTP GEMM + collective](#asynctp-gemm--collective-overlap-fuse_gemm_comms) | `fuse_gemm_comms` | GEMM → reduce-scatter / all-gather → GEMM | Off by default | 7-10% | Yes | High |
|
||||||
|
| [RMSNorm + Quant](#rmsnorm--quantization-fuse_norm_quant) | `fuse_norm_quant` | RMSNorm (+residual add) → FP8/FP4 quant | O1 (conditional) | 1-4% | No | Always |
|
||||||
|
| [SiLU+Mul + Quant](#silumul--quantization-fuse_act_quant) | `fuse_act_quant` | SiLU+Mul activation → FP8/FP4 quant | O1 (conditional) | 1-4% | No | Always |
|
||||||
|
| [RMSNorm + Padding](#rmsnorm--padding-fuse_act_padding) | `fuse_act_padding` | Residual add + RMSNorm → padding | O1 (ROCm/AITER only) | TBD | No | Always |
|
||||||
|
|
||||||
|
## Support Matrix
|
||||||
|
|
||||||
|
The table below lists the quantization schemes supported by each fusion on each platform.
|
||||||
|
**—** means the fusion is not available on that platform. The latest and in-progress work is available in the tracking issue:
|
||||||
|
[#36066](https://github.com/vllm-project/vllm/issues/36066)
|
||||||
|
|
||||||
|
| Fusion | SM100 (Blackwell) | SM90 (Hopper) | SM89 (Ada) | SM80 (Ampere) | ROCm |
|
||||||
|
| ---------------------------- | ---------------------------------------- | ---------------------------------------- | ---------------------------------------- | ------------- | ---------------------------------------- |
|
||||||
|
| `fuse_allreduce_rms` | FP16/BF16, FP8 static, NVFP4 | FP16/BF16, FP8 static | — | — | — |
|
||||||
|
| `fuse_attn_quant`\* | FP8 static\*, NVFP4\* | FP8 static\* | FP8 static\* | — | FP8 static\* |
|
||||||
|
| `fuse_rope_kvcache` | — | — | — | — | FP16/BF16 |
|
||||||
|
| `enable_qk_norm_rope_fusion` | FP16/BF16 | FP16/BF16 | FP16/BF16† | FP16/BF16† | — |
|
||||||
|
| `enable_sp` | FP16/BF16, FP8 static† | FP16/BF16, FP8 static | FP16/BF16† | FP16/BF16† | — |
|
||||||
|
| `fuse_gemm_comms` | FP16/BF16, FP8 static† | FP16/BF16, FP8 static | FP16/BF16† | FP16/BF16† | — |
|
||||||
|
| `fuse_norm_quant` | FP8 static, FP8 per-token, FP8 per-group | FP8 static, FP8 per-token, FP8 per-group | FP8 static, FP8 per-token, FP8 per-group | — | FP8 static, FP8 per-token, FP8 per-group |
|
||||||
|
| `fuse_act_quant` | FP8 static, NVFP4 | FP8 static | FP8 static | — | FP8 per-group |
|
||||||
|
| `fuse_act_padding` | — | — | — | — | FP16/BF16 |
|
||||||
|
|
||||||
|
\* `fuse_attn_quant` support depends on the attention backend in use; not all backends support
|
||||||
|
fused quantization output. See the [`fuse_attn_quant` section](#attention--quantization-fuse_attn_quant)
|
||||||
|
for per-backend details.
|
||||||
|
|
||||||
|
† `enable_sp` and `fuse_gemm_comms` are only autoconfigured for SM90 today;
|
||||||
|
support on other architectures requires setting `PassConfig.sp_min_token_num` explicitly.
|
||||||
|
SM100 support also requires setting `VLLM_DISABLED_KERNELS=FlashInferFP8ScaledMMLinearKernel`.
|
||||||
|
|
||||||
|
## Enabling / Disabling Fusions
|
||||||
|
|
||||||
|
Fusions are exposed through `PassConfig`, which is nested inside `CompilationConfig`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from vllm import LLM
|
||||||
|
from vllm.config import CompilationConfig, PassConfig
|
||||||
|
|
||||||
|
llm = LLM(
|
||||||
|
model="...",
|
||||||
|
optimization_level=2, # Default optimization level
|
||||||
|
compilation_config=CompilationConfig(
|
||||||
|
pass_config=PassConfig(
|
||||||
|
fuse_norm_quant=True,
|
||||||
|
fuse_act_quant=True,
|
||||||
|
fuse_allreduce_rms=False, # disable a specific fusion
|
||||||
|
)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Fusions can also be enabled using command-line flags with any `vllm ...` command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Enable O2 defaults, but turn off allreduce fusion
|
||||||
|
vllm serve meta-llama/Llama-3.1-8B-Instruct -O2 -cc.pass_config.fuse_allreduce_rms=False
|
||||||
|
|
||||||
|
# The above is equivalent to the more verbose:
|
||||||
|
vllm serve meta-llama/Llama-3.1-8B-Instruct -O2 --compilation-config '{"pass_config": {"fuse_allreduce_rms": false}}'
|
||||||
|
|
||||||
|
# Same syntax in other commands, e.g. vllm bench:
|
||||||
|
vllm bench latency --model=meta-llama/Llama-3.1-8B-Instruct -O2 -cc.pass_config.fuse_allreduce_rms=False
|
||||||
|
```
|
||||||
|
|
||||||
|
Fields set explicitly by the user always take precedence over optimization-level defaults.
|
||||||
|
|
||||||
|
## Fusion Details
|
||||||
|
|
||||||
|
### AllReduce + RMSNorm (`fuse_allreduce_rms`)
|
||||||
|
|
||||||
|
!!! warning
|
||||||
|
TP+DP and TP+PP combinations are currently broken
|
||||||
|
([#34458](https://github.com/vllm-project/vllm/issues/34458) and
|
||||||
|
[#35426](https://github.com/vllm-project/vllm/issues/35426)).
|
||||||
|
Only supported on NVIDIA Hopper (SM90) and Blackwell (SM100) with FlashInfer installed.
|
||||||
|
|
||||||
|
**What it fuses.** Fuses the tensor-parallel all-reduce collective with the subsequent residual add,
|
||||||
|
RMSNorm, and optionally a quantization step into a single FlashInfer / TRT-LLM communication kernel.
|
||||||
|
This fusion is only profitable for small `num_tokens`,
|
||||||
|
so the fusion is only performed in the lower compiled range.
|
||||||
|
|
||||||
|
Patterns covered:
|
||||||
|
|
||||||
|
- `AllReduce → RMSNorm(+residual_add)`: CUDA sm90+ with FlashInfer
|
||||||
|
- `AllReduce → RMSNorm(+residual_add) → FP8 static quant`: CUDA sm90+ with FlashInfer
|
||||||
|
- `AllReduce → RMSNorm(+residual_add) → NVFP4 dynamic quant`: CUDA sm100+ with FlashInfer
|
||||||
|
|
||||||
|
The maximum tensor size below which the fused kernel is used is hardware-dependent (64 MB for TP=2
|
||||||
|
on SM90/SM100) and configurable via `PassConfig.fi_allreduce_fusion_max_size_mb`.
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/allreduce_rms_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/allreduce_rms_fusion.py)
|
||||||
|
- FlashInfer all-reduce: [`vllm/distributed/device_communicators/flashinfer_all_reduce.py`](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/device_communicators/flashinfer_all_reduce.py)
|
||||||
|
- Benchmark: [`benchmarks/kernels/benchmark_fused_collective.py`](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_fused_collective.py)
|
||||||
|
|
||||||
|
### Attention + Quantization (`fuse_attn_quant`)
|
||||||
|
|
||||||
|
!!! info
|
||||||
|
`fuse_attn_quant` is currently not enabled at any optimization level by default and must be set
|
||||||
|
explicitly. It requires the full model graph to be visible (Inductor partition or `splitting_ops=[]`).
|
||||||
|
|
||||||
|
**What it fuses.** Fuses the attention output quantization directly after the attention computation,
|
||||||
|
eliminating a full-precision memory round-trip of the attention output. Patterns covered:
|
||||||
|
|
||||||
|
`Attention → FP8 static quant`:
|
||||||
|
|
||||||
|
- `TRITON_ATTN`: CUDA, ROCm
|
||||||
|
- `FLASHINFER`: CUDA sm100+ with FlashInfer installed
|
||||||
|
- `ROCM_ATTN`: ROCm
|
||||||
|
- `ROCM_AITER_UNIFIED_ATTN`: ROCm with AITER
|
||||||
|
|
||||||
|
`Attention → NVFP4 dynamic quant`:
|
||||||
|
|
||||||
|
- `FLASHINFER`: CUDA sm100+ with FlashInfer installed
|
||||||
|
|
||||||
|
Other attention backends do not support fused output quantization yet.
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/attn_quant_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/attn_quant_fusion.py)
|
||||||
|
- Attention backends: [`vllm/v1/attention/backends/`](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/)
|
||||||
|
|
||||||
|
### RoPE + KV-Cache Update (`fuse_rope_kvcache`)
|
||||||
|
|
||||||
|
!!! info
|
||||||
|
ROCm/AITER-only. Not available on NVIDIA CUDA or CPU. The fusion is only enabled for
|
||||||
|
`num_tokens ≤ 256` by default due to AITER fused kernel performance issues.
|
||||||
|
This threshold is configurable via `PassConfig.rope_kvcache_fusion_max_token_num`.
|
||||||
|
|
||||||
|
**What it fuses.** Fuses the rotary positional embedding kernel with the KV-cache scatter/write into
|
||||||
|
a single kernel, avoiding separate reads and writes of the key and value tensors.
|
||||||
|
|
||||||
|
Requires: AMD ROCm with AITER enabled, the `rotary_embedding` custom op active (automatic),
|
||||||
|
and the `kv_cache` update op visible in the graph: either by using Inductor graph partition
|
||||||
|
or by removing it from `splitting_ops`.
|
||||||
|
If these conditions are set, the fusion is enabled automatically for optimization level O1 and above.
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/rope_kvcache_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rope_kvcache_fusion.py)
|
||||||
|
|
||||||
|
### Sequence Parallelism (`enable_sp`)
|
||||||
|
|
||||||
|
**What it fuses.** Replaces all-reduce collectives with reduce-scatter + local RMSNorm + all-gather,
|
||||||
|
splitting the sequence dimension across TP ranks. This restructures the graph so the subsequent AsyncTP
|
||||||
|
pass can fuse the reduce-scatter / all-gather with the surrounding GEMMs.
|
||||||
|
|
||||||
|
Sequence Parallelism itself does not directly improve performance; it is a prerequisite for the
|
||||||
|
AsyncTP pass (`fuse_gemm_comms`). SP is only applied above a minimum token threshold that is
|
||||||
|
autoconfigured based on device capability and model `hidden_size`. Currently only active on
|
||||||
|
H100/SM90 for models with `hidden_size >= 8192`. The threshold is configurable via
|
||||||
|
`PassConfig.sp_min_token_num`.
|
||||||
|
|
||||||
|
The general transformation:
|
||||||
|
|
||||||
|
```text
|
||||||
|
Input → AllReduce → RMSNorm → Output
|
||||||
|
becomes:
|
||||||
|
Input → ReduceScatter → local RMSNorm → AllGather → Output
|
||||||
|
```
|
||||||
|
|
||||||
|
Patterns covered:
|
||||||
|
|
||||||
|
- First block: `AllReduce → RMSNorm` → `ReduceScatter → RMSNorm → AllGather`
|
||||||
|
- Middle blocks: `AllReduce → fused_add_RMSNorm` → `ReduceScatter → fused_add_RMSNorm → AllGather`
|
||||||
|
- Both with optional `→ FP8 static quant` suffix
|
||||||
|
|
||||||
|
Requires: `use_inductor_graph_partition=True` **or** piecewise compilation with static sizes
|
||||||
|
divisible by `tensor_parallel_size`.
|
||||||
|
|
||||||
|
Supported hardware: Only tested on NVIDIA CUDA, possibly works on ROCm. FP8 all-gather requires sm90+.
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/sequence_parallelism.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/sequence_parallelism.py)
|
||||||
|
|
||||||
|
### AsyncTP GEMM + Collective Overlap (`fuse_gemm_comms`)
|
||||||
|
|
||||||
|
!!! info
|
||||||
|
Requires `enable_sp=True` (enabled automatically). This pass is a no-op if Sequence Parallelism has not been applied.
|
||||||
|
|
||||||
|
**What it fuses.** After Sequence Parallelism transforms the graph, fuses GEMM kernels with the
|
||||||
|
surrounding reduce-scatter (output projection) and all-gather (input projection) using
|
||||||
|
`torch.ops.symm_mem` symmetric-memory primitives, overlapping communication and computation.
|
||||||
|
This overlap is only profitable for large `num_tokens`, so the fusion (and preceding SP)
|
||||||
|
is only performed in the higher compiled range above `PassConfig.sp_min_token_num`.
|
||||||
|
|
||||||
|
Patterns covered:
|
||||||
|
|
||||||
|
- `GEMM → reduce-scatter` → `fused_matmul_reduce_scatter`
|
||||||
|
- `all-gather → GEMM` → `all_gather_matmul`
|
||||||
|
- FP8 scaled variants of both patterns
|
||||||
|
|
||||||
|
Supported hardware: NVIDIA CUDA with symmetric-memory (`torch.distributed._symmetric_memory`) support.
|
||||||
|
|
||||||
|
On B200, pattern-matching fp8 FlashInfer scaled MM is not supported, so it must be disabled
|
||||||
|
([#27893](https://github.com/vllm-project/vllm/issues/27893))
|
||||||
|
|
||||||
|
```shell
|
||||||
|
VLLM_DISABLED_KERNELS=FlashInferFP8ScaledMMLinearKernel ...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/collective_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/collective_fusion.py)
|
||||||
|
- Sequence parallelism pass: [`vllm/compilation/passes/fusion/sequence_parallelism.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/sequence_parallelism.py)
|
||||||
|
|
||||||
|
### QK Norm + RoPE (`enable_qk_norm_rope_fusion`)
|
||||||
|
|
||||||
|
!!! info
|
||||||
|
Only applicable to models that apply per-head RMSNorm to Q and K before rotary positional
|
||||||
|
embedding (e.g. Qwen). Not enabled by default at any optimization level due to perf issues on H100:
|
||||||
|
[#34391](https://github.com/vllm-project/vllm/issues/34391)
|
||||||
|
|
||||||
|
**What it fuses.** Fuses the sequence: split QKV → reshape → Q/K RMSNorm → reshape → rotary
|
||||||
|
embedding into a single `fused_qk_norm_rope` CUDA kernel.
|
||||||
|
|
||||||
|
```text
|
||||||
|
# Unfused:
|
||||||
|
q, k, v = split(qkv)
|
||||||
|
q_norm = rms_norm(q.view(heads))
|
||||||
|
k_norm = rms_norm(k.view(kv_heads))
|
||||||
|
q_rope, k_rope = rotary_embedding(q_norm, k_norm, ...)
|
||||||
|
|
||||||
|
# Fused:
|
||||||
|
fused_qk_norm_rope(qkv, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
Supported hardware: CUDA (sm80+) only, tested only on sm90 and sm100.
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/qk_norm_rope_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/qk_norm_rope_fusion.py)
|
||||||
|
- CUDA kernel: [`csrc/ops.h`](https://github.com/vllm-project/vllm/blob/main/csrc/ops.h) (`fused_qk_norm_rope`)
|
||||||
|
|
||||||
|
### RMSNorm + Quantization (`fuse_norm_quant`)
|
||||||
|
|
||||||
|
!!! warning
|
||||||
|
On NVIDIA, Inductor actually generates a faster fused kernel than our custom CUDA kernel.
|
||||||
|
Hence, this fusion is only enabled when either `rms_norm` or `quant_fp8` is using a custom kernel.
|
||||||
|
|
||||||
|
**What it fuses.** Combines the custom `rms_norm` / `fused_add_rms_norm`
|
||||||
|
operations with subsequent quantization into a single fused kernel,
|
||||||
|
eliminating an intermediate read/write of the full-precision activation tensor.
|
||||||
|
Two variants are fused:
|
||||||
|
|
||||||
|
- *Plain RMSNorm + quant*: `rms_norm(x) → quant_fp8(y)`
|
||||||
|
- *Fused-add RMSNorm + quant*: `fused_add_rms_norm(x, residual) → quant_fp8(y)` — also updates the residual in-place.
|
||||||
|
|
||||||
|
Note that AITER fusions are currently in a separate pass in `vllm.compilation.passes.fusion.rocm_aiter_fusion`.
|
||||||
|
|
||||||
|
Supported quantization scheme/hardware combinations:
|
||||||
|
|
||||||
|
- FP8 static per-tensor: CUDA & HIP kernel
|
||||||
|
- FP8 dynamic per-token: CUDA & HIP kernel, AITER
|
||||||
|
- FP8 dynamic per-token-group (128/64): CUDA & HIP kernel, AITER
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/rms_quant_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rms_quant_fusion.py)
|
||||||
|
- ROCm AITER pass: [`vllm/compilation/passes/fusion/rocm_aiter_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rocm_aiter_fusion.py)
|
||||||
|
- CUDA/HIP kernels: [`csrc/layernorm_quant_kernels.cu`](https://github.com/vllm-project/vllm/blob/main/csrc/layernorm_quant_kernels.cu)
|
||||||
|
|
||||||
|
### SiLU+Mul + Quantization (`fuse_act_quant`)
|
||||||
|
|
||||||
|
!!! warning
|
||||||
|
Same as `fuse_norm_quant`: on NVIDIA, Inductor generates a faster fused kernel than our custom ops.
|
||||||
|
This fusion is only enabled when either `silu_and_mul` or `quant_fp8` are using a custom kernel,
|
||||||
|
or for NVFP4-quantized models (where FP4 quant is always a custom op).
|
||||||
|
|
||||||
|
**What it fuses.** Fuses the `silu_and_mul` gate-up projection activation with subsequent quantization into a single kernel,
|
||||||
|
avoiding materialization of the full-precision post-activation tensor.
|
||||||
|
|
||||||
|
Note that AITER fusions are in a separate pass in `vllm.compilation.passes.fusion.rocm_aiter_fusion`.
|
||||||
|
|
||||||
|
Supported quantization scheme/hardware combinations:
|
||||||
|
|
||||||
|
- FP8 static per-tensor: CUDA & HIP kernel
|
||||||
|
- NVFP4 dynamic: CUDA sm100+ only with FlashInfer
|
||||||
|
- FP8 per-token-group (128): ROCm AITER only
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/act_quant_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/act_quant_fusion.py)
|
||||||
|
- ROCm AITER pass: [`vllm/compilation/passes/fusion/rocm_aiter_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rocm_aiter_fusion.py)
|
||||||
|
- CUDA/HIP kernels: [`csrc/quantization/`](https://github.com/vllm-project/vllm/blob/main/csrc/quantization/)
|
||||||
|
|
||||||
|
### RMSNorm + Padding (`fuse_act_padding`)
|
||||||
|
|
||||||
|
!!! info
|
||||||
|
ROCm/AITER-only. Targeted at GPT-OSS models.
|
||||||
|
|
||||||
|
**What it fuses.** Fuses a residual add + RMSNorm with a subsequent padding operation that pads
|
||||||
|
the hidden dimension to a multiple required by downstream AITER Triton GEMM kernels.
|
||||||
|
|
||||||
|
Requires: AMD ROCm with AITER RMSNorm enabled. Enabled by default in optimization level O1 and above
|
||||||
|
when the hidden size is 2880 and AITER Triton GEMMs are *not* enabled.
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/rocm_aiter_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rocm_aiter_fusion.py) (`RocmAiterTritonAddRMSNormPadFusionPass`)
|
||||||
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
- [Optimization Levels](optimization_levels.md) — high-level presets that set
|
||||||
|
fusion defaults.
|
||||||
|
- [torch.compile in vLLM](torch_compile.md) — how the Inductor pass pipeline
|
||||||
|
works.
|
||||||
|
- [Attention Backends](attention_backends.md) — attention-specific kernel
|
||||||
|
selection.
|
||||||
@@ -352,7 +352,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests,
|
|||||||
(s, d, UNIDIRECTIONAL or SWAP)
|
(s, d, UNIDIRECTIONAL or SWAP)
|
||||||
```
|
```
|
||||||
|
|
||||||
* If the Move specifies `UNIDRECTIONAL`:
|
* If the Move specifies `UNIDIRECTIONAL`:
|
||||||
|
|
||||||
* The request at index `s` is moved to index `d`; index `s` becomes an empty slot
|
* The request at index `s` is moved to index `d`; index `s` becomes an empty slot
|
||||||
|
|
||||||
|
|||||||
@@ -507,10 +507,10 @@ longer relevant in v1:
|
|||||||
- `vllm:num_requests_swapped`
|
- `vllm:num_requests_swapped`
|
||||||
- `vllm:cpu_cache_usage_perc`
|
- `vllm:cpu_cache_usage_perc`
|
||||||
|
|
||||||
In this mode, when a request is preempted (e.g. to make room in KV
|
In this mode, when a request was preempted (e.g. to make room in KV
|
||||||
cache to complete other requests), we swap kv cache blocks out to CPU
|
cache to complete other requests), kv cache blocks were swapped out to
|
||||||
memory. This is also known as "KV cache offloading" and is configured
|
CPU memory. The `--swap-space` flag has been removed as this feature
|
||||||
with `--swap-space` and `--preemption-mode`.
|
is no longer used in V1.
|
||||||
|
|
||||||
Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
|
Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
|
||||||
SequenceGroup encapsulated the idea of N Sequences which
|
SequenceGroup encapsulated the idea of N Sequences which
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ V1 was not originally designed with async scheduling in mind, and support requir
|
|||||||
|
|
||||||
## 3. Removing Async Barrier
|
## 3. Removing Async Barrier
|
||||||
|
|
||||||
A key requirement for async execution is that CPU operations remain non-blocking. Both explicit sync (for example, `torch.cuda.synchronize`) and implicit sync (for example, unpinned `.to("cuda")`) must be avoided.
|
A key requirement for async execution is that CPU operations remain non-blocking. Both explicit sync (for example, `torch.accelerator.synchronize`) and implicit sync (for example, unpinned `.to("cuda")`) must be avoided.
|
||||||
|
|
||||||
However, async execution can introduce race conditions when CPU and GPU concurrently touch the same memory.
|
However, async execution can introduce race conditions when CPU and GPU concurrently touch the same memory.
|
||||||
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ th {
|
|||||||
</style>
|
</style>
|
||||||
|
|
||||||
| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass |
|
| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass |
|
||||||
|---------|--------------------|--------------|---------------|-------|-----------------------|-----------|
|
| ------- | ------------------ | ------------ | ------------- | ----- | --------------------- | --------- |
|
||||||
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] |
|
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] |
|
||||||
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
|
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
|
||||||
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
|
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
|
||||||
@@ -78,7 +78,7 @@ Most experts flavors include an equivalent modular interface which will be a sub
|
|||||||
To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE kernels must have compatible activation formats, quantization types and quantization formats.
|
To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE kernels must have compatible activation formats, quantization types and quantization formats.
|
||||||
|
|
||||||
| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source |
|
| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source |
|
||||||
|--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------|
|
| ------ | ----------------- | ------------ | ------------- | ------------------- | --------------------- | ------- | ------ |
|
||||||
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
|
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
|
||||||
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
|
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
|
||||||
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
|
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
|
||||||
@@ -105,7 +105,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
|
|||||||
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.
|
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.
|
||||||
|
|
||||||
| backend | `FusedMoEPrepareAndFinalizeModular` subclasses | `FusedMoEExpertsModular` subclasses |
|
| backend | `FusedMoEPrepareAndFinalizeModular` subclasses | `FusedMoEExpertsModular` subclasses |
|
||||||
|---------|-----------------------------------------|----------------------------------------------|
|
| ------- | ---------------------------------------------- | ----------------------------------- |
|
||||||
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
||||||
| deepep_low_latency | `DeepEPLLPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
|
| deepep_low_latency | `DeepEPLLPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
|
||||||
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
||||||
|
|||||||
@@ -12,9 +12,8 @@ page for information on known issues and how to solve them.
|
|||||||
|
|
||||||
The use of Python multiprocessing in vLLM is complicated by:
|
The use of Python multiprocessing in vLLM is complicated by:
|
||||||
|
|
||||||
- The use of vLLM as a library and the inability to control the code using vLLM
|
- using vLLM as a library, which limits control over its internal code;
|
||||||
- Varying levels of incompatibilities between multiprocessing methods and vLLM
|
- incompatibilities between certain multiprocessing methods and vLLM dependencies.
|
||||||
dependencies
|
|
||||||
|
|
||||||
This document describes how vLLM deals with these challenges.
|
This document describes how vLLM deals with these challenges.
|
||||||
|
|
||||||
@@ -22,11 +21,9 @@ This document describes how vLLM deals with these challenges.
|
|||||||
|
|
||||||
[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
|
[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
|
||||||
|
|
||||||
- `spawn` - spawn a new Python process. The default on Windows and macOS.
|
- `spawn` - Spawn a new Python process. The default on Windows and macOS.
|
||||||
|
|
||||||
- `fork` - Use `os.fork()` to fork the Python interpreter. The default on
|
- `fork` - Use `os.fork()` to fork the Python interpreter. The default on
|
||||||
Linux for Python versions prior to 3.14.
|
Linux for Python versions prior to 3.14.
|
||||||
|
|
||||||
- `forkserver` - Spawn a server process that will fork a new process on request.
|
- `forkserver` - Spawn a server process that will fork a new process on request.
|
||||||
The default on Linux for Python version 3.14 and newer.
|
The default on Linux for Python version 3.14 and newer.
|
||||||
|
|
||||||
@@ -36,8 +33,8 @@ This document describes how vLLM deals with these challenges.
|
|||||||
threads. If you are under macOS, using `fork` may cause the process to crash.
|
threads. If you are under macOS, using `fork` may cause the process to crash.
|
||||||
|
|
||||||
`spawn` is more compatible with dependencies, but can be problematic when vLLM
|
`spawn` is more compatible with dependencies, but can be problematic when vLLM
|
||||||
is used as a library. If the consuming code does not use a `__main__` guard (`if
|
is used as a library. If the consuming code does not use a `__main__` guard
|
||||||
__name__ == "__main__":`), the code will be inadvertently re-executed when vLLM
|
(`if __name__ == "__main__":`), the code will be inadvertently re-executed when vLLM
|
||||||
spawns a new process. This can lead to infinite recursion, among other problems.
|
spawns a new process. This can lead to infinite recursion, among other problems.
|
||||||
|
|
||||||
`forkserver` will spawn a new server process that will fork new processes on
|
`forkserver` will spawn a new server process that will fork new processes on
|
||||||
@@ -57,8 +54,7 @@ Multiple vLLM dependencies indicate either a preference or requirement for using
|
|||||||
- <https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors>
|
- <https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors>
|
||||||
- <https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders>
|
- <https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders>
|
||||||
|
|
||||||
It is perhaps more accurate to say that there are known problems with using
|
Known issues exist when using `fork` after initializing these dependencies.
|
||||||
`fork` after initializing these dependencies.
|
|
||||||
|
|
||||||
## Current State (v0)
|
## Current State (v0)
|
||||||
|
|
||||||
@@ -66,8 +62,8 @@ The environment variable `VLLM_WORKER_MULTIPROC_METHOD` can be used to control w
|
|||||||
|
|
||||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/envs.py#L339-L342>
|
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/envs.py#L339-L342>
|
||||||
|
|
||||||
When we know we own the process because the `vllm` command was used, we use
|
If the main process is controlled via the `vllm` command,
|
||||||
`spawn` because it's the most widely compatible.
|
`spawn` is used because it's the most widely compatible.
|
||||||
|
|
||||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/scripts.py#L123-L140>
|
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/scripts.py#L123-L140>
|
||||||
|
|
||||||
@@ -104,8 +100,8 @@ dependencies and code using vLLM as a library.
|
|||||||
### Changes Made in v1
|
### Changes Made in v1
|
||||||
|
|
||||||
There is not an easy solution with Python's `multiprocessing` that will work
|
There is not an easy solution with Python's `multiprocessing` that will work
|
||||||
everywhere. As a first step, we can get v1 into a state where it does "best
|
everywhere. As a first step, we can get v1 into a state where it does
|
||||||
effort" choice of multiprocessing method to maximize compatibility.
|
"best effort" choice of multiprocessing method to maximize compatibility.
|
||||||
|
|
||||||
- Default to `fork`.
|
- Default to `fork`.
|
||||||
- Use `spawn` when we know we control the main process (`vllm` was executed).
|
- Use `spawn` when we know we control the main process (`vllm` was executed).
|
||||||
@@ -154,8 +150,8 @@ RuntimeError:
|
|||||||
### Detect if a `__main__` guard is present
|
### Detect if a `__main__` guard is present
|
||||||
|
|
||||||
It has been suggested that we could behave better if we could detect whether
|
It has been suggested that we could behave better if we could detect whether
|
||||||
code using vLLM as a library has a `__main__` guard in place. This [post on
|
code using vLLM as a library has a `__main__` guard in place. This
|
||||||
stackoverflow](https://stackoverflow.com/questions/77220442/multiprocessing-pool-in-a-python-class-without-name-main-guard)
|
[post on Stack Overflow](https://stackoverflow.com/questions/77220442/multiprocessing-pool-in-a-python-class-without-name-main-guard)
|
||||||
was from a library author facing the same question.
|
was from a library author facing the same question.
|
||||||
|
|
||||||
It is possible to detect whether we are in the original, `__main__` process, or
|
It is possible to detect whether we are in the original, `__main__` process, or
|
||||||
@@ -192,4 +188,4 @@ that works around these challenges.
|
|||||||
2. We can explore other libraries that may better suit our needs. Examples to
|
2. We can explore other libraries that may better suit our needs. Examples to
|
||||||
consider:
|
consider:
|
||||||
|
|
||||||
- <https://github.com/joblib/loky>
|
- <https://github.com/joblib/loky>
|
||||||
|
|||||||
@@ -1,29 +1,18 @@
|
|||||||
<!-- markdownlint-disable -->
|
|
||||||
|
|
||||||
# Optimization Levels
|
# Optimization Levels
|
||||||
|
|
||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechanism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out-of-the-box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten.
|
vLLM provides 4 optimization levels (`-O0`, `-O1`, `-O2`, `-O3`) that allow users to trade off startup time for performance:
|
||||||
|
|
||||||
|
- `-O0`: No optimization. Fastest startup time, but lowest performance.
|
||||||
|
- `-O1`: Fast optimization. Simple compilation and fast fusions, and PIECEWISE cudagraphs.
|
||||||
|
- `-O2`: Default optimization. Additional compilation ranges, additional fusions, FULL_AND_PIECEWISE cudagraphs.
|
||||||
|
- `-O3`: Aggressive optimization. Currently equal to `-O2`, but may include additional time-consuming or experimental optimizations in the future.
|
||||||
|
|
||||||
|
All optimization level defaults can be achieved by manually setting the underlying flags.
|
||||||
|
User-set flags take precedence over optimization level defaults.
|
||||||
|
|
||||||
## Level Summaries and Usage Examples
|
## Level Summaries and Usage Examples
|
||||||
```bash
|
|
||||||
# CLI usage
|
|
||||||
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O0
|
|
||||||
|
|
||||||
# Python API usage
|
|
||||||
from vllm.entrypoints.llm import LLM
|
|
||||||
|
|
||||||
llm = LLM(
|
|
||||||
model="RedHatAI/Llama-3.2-1B-FP8",
|
|
||||||
optimization_level=0
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### `-O1`: Quick Optimizations
|
|
||||||
- **Startup**: Moderate startup time
|
|
||||||
- **Performance**: Inductor compilation, CUDAGraphMode.PIECEWISE
|
|
||||||
- **Use case**: Balance for most development scenarios
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# CLI usage
|
# CLI usage
|
||||||
@@ -34,31 +23,59 @@ from vllm.entrypoints.llm import LLM
|
|||||||
|
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
model="RedHatAI/Llama-3.2-1B-FP8",
|
model="RedHatAI/Llama-3.2-1B-FP8",
|
||||||
optimization_level=1
|
optimization_level=2 # equivalent to -O2
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
#### `-O2`: Full Optimizations (Default)
|
### `-O0`: No Optimization
|
||||||
- **Startup**: Longer startup time
|
|
||||||
- **Performance**: `-O1` + CUDAGraphMode.FULL_AND_PIECEWISE
|
|
||||||
- **Use case**: Production workloads where performance is important. This is the default use case. It is also very similar to the previous default. The primary difference is that noop & fusion flags are enabled.
|
|
||||||
|
|
||||||
```bash
|
Startup as fast as possible - no autotuning, no compilation, and no cudagraphs.
|
||||||
# CLI usage (default, so optional)
|
This level is good for initial phases of development and debugging.
|
||||||
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O2
|
|
||||||
|
|
||||||
# Python API usage
|
Settings:
|
||||||
from vllm.entrypoints.llm import LLM
|
|
||||||
|
|
||||||
llm = LLM(
|
- `-cc.cudagraph_mode=NONE`
|
||||||
model="RedHatAI/Llama-3.2-1B-FP8",
|
- `-cc.mode=NONE` (also resulting in `-cc.custom_ops=["none"]`)
|
||||||
optimization_level=2 # This is the default
|
- `-cc.pass_config.fuse_...=False` (all fusions disabled)
|
||||||
)
|
- `--kernel-config.enable_flashinfer_autotune=False`
|
||||||
```
|
|
||||||
|
|
||||||
#### `-O3`: Full Optimization
|
### `-O1`: Fast Optimization
|
||||||
Still in development. Added infrastructure to prevent changing API in future
|
|
||||||
release. Currently behaves the same O2.
|
Prioritize fast startup, but still enable basic optimizations like compilation and cudagraphs.
|
||||||
|
This level is a good balance for most development scenarios where you want faster startup but
|
||||||
|
still make sure your code does not break cudagraphs or compilation.
|
||||||
|
|
||||||
|
Settings:
|
||||||
|
|
||||||
|
- `-cc.cudagraph_mode=PIECEWISE`
|
||||||
|
- `-cc.mode=VLLM_COMPILE`
|
||||||
|
- `--kernel-config.enable_flashinfer_autotune=True`
|
||||||
|
|
||||||
|
Fusions:
|
||||||
|
|
||||||
|
- `-cc.pass_config.fuse_norm_quant=True`*
|
||||||
|
- `-cc.pass_config.fuse_act_quant=True`*
|
||||||
|
- `-cc.pass_config.fuse_act_padding=True`†
|
||||||
|
- `-cc.pass_config.fuse_rope_kvcache=True`† (will be moved to O2)
|
||||||
|
|
||||||
|
\* These fusions are only enabled when either op is using a custom kernel, otherwise Inductor fusion is better.</br>
|
||||||
|
† These fusions are ROCm-only and require AITER.
|
||||||
|
|
||||||
|
### `-O2`: Full Optimization (Default)
|
||||||
|
|
||||||
|
Prioritize performance at the expense of additional startup time.
|
||||||
|
This level is recommended for production workloads and is hence the default.
|
||||||
|
Fusions in this level _may_ take longer due to additional compile ranges.
|
||||||
|
|
||||||
|
Settings (on top of `-O1`):
|
||||||
|
|
||||||
|
- `-cc.cudagraph_mode=FULL_AND_PIECEWISE`
|
||||||
|
- `-cc.pass_config.fuse_allreduce_rms=True`
|
||||||
|
|
||||||
|
### `-O3`: Aggressive Optimization
|
||||||
|
|
||||||
|
This level is currently the same as `-O2`, but may include additional optimizations
|
||||||
|
in the future that are more time-consuming or experimental.
|
||||||
|
|
||||||
## Troubleshooting
|
## Troubleshooting
|
||||||
|
|
||||||
|
|||||||
@@ -141,7 +141,7 @@ Every plugin has three parts:
|
|||||||
- triton ops
|
- triton ops
|
||||||
Custom way doesn't work for triton ops now.
|
Custom way doesn't work for triton ops now.
|
||||||
|
|
||||||
7. (optional) Implement other plugable modules, such as lora, graph backend, quantization, mamba attention backend, etc.
|
7. (optional) Implement other pluggable modules, such as lora, graph backend, quantization, mamba attention backend, etc.
|
||||||
|
|
||||||
## Compatibility Guarantee
|
## Compatibility Guarantee
|
||||||
|
|
||||||
@@ -155,4 +155,4 @@ The interface for the model/module may change during vLLM's development. If you
|
|||||||
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
|
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
|
||||||
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
|
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
|
||||||
- `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
|
- `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
|
||||||
- `prompt` in `Platform.validate_request` is deprecated and will be removed in v0.18.0.
|
- `prompt` in `Platform.validate_request` is deprecated. It has been removed in v0.18.0.
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ This feature is off by default, but can be enabled by setting `compile_mm_encode
|
|||||||
|
|
||||||
To compile a multimodal component such as an encoder, we follow the same mechanism as the LLM text backbone, with a few additional scaffoldings:
|
To compile a multimodal component such as an encoder, we follow the same mechanism as the LLM text backbone, with a few additional scaffoldings:
|
||||||
|
|
||||||
1. The `@support_torch_compile` decorator should include `enable_if=should_torch_compile_mm_vit`. This will gate the compilation behind our
|
1. The `@support_torch_compile` decorator should include `enable_if=should_torch_compile_mm_encoder`. This will gate the compilation behind our
|
||||||
`compile_mm_encoder` configuration
|
`compile_mm_encoder` configuration
|
||||||
|
|
||||||
2. `with set_model_tag("<component_name>", is_encoder=True)` context manager should be used around the nn.Module's instantiation. Since torch.compile
|
2. `with set_model_tag("<component_name>", is_encoder=True)` context manager should be used around the nn.Module's instantiation. Since torch.compile
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ th:not(:first-child) {
|
|||||||
</style>
|
</style>
|
||||||
|
|
||||||
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](speculative_decoding/README.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](speculative_decoding/README.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
| - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
|
||||||
| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
|
| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
|
||||||
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
||||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
||||||
@@ -60,7 +60,7 @@ th:not(:first-child) {
|
|||||||
### Feature x Hardware
|
### Feature x Hardware
|
||||||
|
|
||||||
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | Intel GPU |
|
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | Intel GPU |
|
||||||
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------| ------------|
|
| ------- | ----- | ------ | ------ | --- | ------ | --- | --- | --------- |
|
||||||
| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user